In [None]:
import os
import glob
from pathlib import Path
import yaml

import logging

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# Custom Utilities Module
from utils.paths import get_paths
from utils.file_io import load_data


# Pandas display: render up to 100 columns and use a 200-character line width
pd.options.display.max_columns = 100
pd.options.display.width = 200

# Logging: configure the root logger at INFO and create this notebook's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Get the project paths object (get_paths comes from utils.paths; the
# attributes read below are the ones this notebook relies on).
paths = get_paths()

# Log every location up front so a wrong directory is spotted before loading.
# The values are passed as lazy %-style arguments, so the logging module only
# builds the message when the record is actually emitted.
logger.info("Project Root Path Loaded: %s", paths.root)

logger.info("Project Data Path Loaded: %s", paths.data)
logger.info("Data Raw Path Loaded: %s", paths.data_raw)

logger.info("Data Bronze Path Loaded: %s", paths.data_bronze)


In [None]:
# Load Data
#
# NOTE: both values below are template placeholders -- replace them with the
# real sub-folder (relative to paths.data_raw) and the real file name.
RAW_SUBDIR = "data subfolder directory name"
# Extension corrected from ".cv"; presumably the source file is a CSV --
# TODO confirm against the formats load_data supports.
RAW_FILENAME = "file_name.csv"

df = load_data(paths.data_raw / RAW_SUBDIR, RAW_FILENAME)

In [None]:
# Basic Dataframe Information/Summary
#
# Quick structural overview of the loaded frame: size, dtypes, memory
# footprint, a sample of rows, and descriptive statistics.

print("Shape:", df.shape)
print("\nData types:")
print(df.dtypes)

print("\nMemory usage (MB):")
# deep=True asks pandas to introspect object columns instead of only counting
# the pointer array; the byte total is converted to mebibytes.
print(df.memory_usage(deep=True).sum() / (1024 ** 2))

print("\nFirst 15 rows:")
display(df.head(15))

print("\nBasic numeric summary:")
display(df.describe().T)

print("\nBasic object / categorical summary:")
# Cover both dtypes named in the heading ("category" columns are not matched
# by include="object"), and skip the summary when no such column exists:
# describe() raises ValueError if the include filter selects nothing.
categorical_dtypes = ["object", "category"]
if df.select_dtypes(include=categorical_dtypes).shape[1] > 0:
    display(df.describe(include=categorical_dtypes).T)
else:
    print("(no object / categorical columns)")

In [None]:
# Larger preview of the leading rows; the bare last expression is rendered
# by the notebook as the cell output.
PREVIEW_ROWS = 25
df.head(PREVIEW_ROWS)