In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import missingno as msno

In [2]:
def fill_missing_median(df, columns=None):
    """Return a copy of ``df`` with missing values replaced by column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's object is not modified.
    columns : iterable of column labels, optional
        Columns to fill. When omitted, every numeric column except
        ``'year'`` is used. Labels that are not present in ``df`` are
        skipped.

    Returns
    -------
    pandas.DataFrame
        The copy, with each selected column's missing entries filled with
        that column's own median.
    """
    filled = df.copy()

    if columns is None:
        # Default selection: all numeric columns, leaving 'year' untouched.
        columns = [
            name
            for name in filled.select_dtypes(include=np.number).columns
            if name != 'year'
        ]

    for name in columns:
        if name not in filled.columns:
            continue
        series = filled[name]
        filled[name] = series.fillna(series.median())
    return filled

def drop_missing(df, columns=None, threshold=None):
    """Return a copy of ``df`` with rows containing missing values removed.

    Exactly one rule is applied, chosen in this order:

    1. ``columns`` given: drop rows that have a missing value in any of
       those columns (``threshold`` is ignored in this case).
    2. ``threshold`` given: keep rows with at least
       ``int(threshold * number_of_columns)`` non-missing values.
    3. Neither given: drop rows that have any missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's object is not modified.
    columns : column label or list of labels, optional
        Passed to ``DataFrame.dropna`` as ``subset``.
    threshold : float, optional
        Multiplied by the number of columns and truncated with ``int`` to
        obtain the ``thresh`` argument of ``DataFrame.dropna``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    working = df.copy()

    if columns is not None:
        dropna_kwargs = {'subset': columns}
    elif threshold is not None:
        min_non_missing = int(threshold * len(working.columns))
        dropna_kwargs = {'thresh': min_non_missing}
    else:
        dropna_kwargs = {}

    return working.dropna(**dropna_kwargs)

def normalize_data(df, columns=None, method='minmax'):
    """Return a copy of ``df`` with the selected columns rescaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's object is not modified.
    columns : iterable of column labels, optional
        Columns to scale. When omitted, every numeric column except
        ``'year'`` is used. Labels that are not present in ``df`` are
        ignored.
    method : str, default 'minmax'
        ``'minmax'`` scales with ``MinMaxScaler``; any other value scales
        with ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        The copy with the selected columns replaced by their scaled
        values. The copy is returned unscaled when there is nothing to
        scale: none of the selected columns exist, or the frame has no
        rows.
    """
    df_copy = df.copy()

    if columns is None:
        numeric_cols = df_copy.select_dtypes(include=np.number).columns
        columns = [col for col in numeric_cols if col not in ['year']]

    columns = [col for col in columns if col in df_copy.columns]

    # Nothing to scale. The zero-row case matters because the scalers'
    # fit_transform rejects input with no samples, which would otherwise
    # make the whole cleaning pipeline fail on an emptied frame.
    if not columns or len(df_copy) == 0:
        return df_copy

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()

    df_copy[columns] = scaler.fit_transform(df_copy[columns])
    return df_copy

In [3]:
# Location of the raw CSV. NOTE(review): this is an absolute, machine-specific
# path; the notebook will only run where this exact file exists.
RAW_DATA_PATH = 'C:/Users/Zhangchensi/bootcamp_Chensi_Zhang/project/data/raw/raw_data.csv'

origin_df = pd.read_csv(RAW_DATA_PATH)
origin_df

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,year
0,2015-01-02,111.390,111.44,107.350,109.33,53204626.0,2015
1,2015-01-05,108.290,108.65,105.410,106.25,64285491.0,2015
2,2015-01-06,106.540,107.43,104.630,106.26,65797116.0,2015
3,2015-01-07,107.200,108.20,106.695,107.75,40105934.0,2015
4,2015-01-08,109.230,112.15,108.700,111.89,59364547.0,2015
...,...,...,...,...,...,...,...
2668,2025-08-13,231.070,235.00,230.430,233.33,69878546.0,2025
2669,2025-08-14,234.055,235.12,230.850,232.78,51916275.0,2025
2670,2025-08-15,234.000,234.28,229.335,231.59,56038657.0,2025
2671,2025-08-18,231.700,233.12,230.110,230.89,37476188.0,2025


In [4]:
# Cleaning pipeline, applied in order with each helper's default arguments:
# median-fill the numeric columns, drop rows that still have missing values,
# then rescale the numeric columns.
cleaned_df = (
    origin_df
    .pipe(fill_missing_median)
    .pipe(drop_missing)
    .pipe(normalize_data)
)
cleaned_df

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,year
0,2015-01-02,0.050354,0.046686,0.043518,0.045676,0.130251,2015
1,2015-01-05,0.043057,0.040097,0.038797,0.038268,0.164745,2015
2,2015-01-06,0.038937,0.037216,0.036898,0.038292,0.169450,2015
3,2015-01-07,0.040491,0.039035,0.041924,0.041876,0.089477,2015
4,2015-01-08,0.045269,0.048362,0.046804,0.051834,0.149426,2015
...,...,...,...,...,...,...,...
2668,2025-08-13,0.332094,0.338466,0.343085,0.343933,0.182155,2025
2669,2025-08-14,0.339121,0.338749,0.344107,0.342610,0.126241,2025
2670,2025-08-15,0.338991,0.336765,0.340420,0.339747,0.139073,2025
2671,2025-08-18,0.333577,0.334026,0.342306,0.338064,0.081290,2025


In [5]:
# Remove the leftover 'Unnamed: 0.1' column (the first column in the frame
# displayed by the previous cell). Dropping it by label instead of by position
# makes this cell safe to re-run: once the column is gone the call is a no-op,
# whereas a positional slice would strip one more column on every execution.
cleaned_df = cleaned_df.drop(columns=['Unnamed: 0.1'], errors='ignore')
cleaned_df

Unnamed: 0,open,high,low,close,volume,year
0,0.050354,0.046686,0.043518,0.045676,0.130251,2015
1,0.043057,0.040097,0.038797,0.038268,0.164745,2015
2,0.038937,0.037216,0.036898,0.038292,0.169450,2015
3,0.040491,0.039035,0.041924,0.041876,0.089477,2015
4,0.045269,0.048362,0.046804,0.051834,0.149426,2015
...,...,...,...,...,...,...
2668,0.332094,0.338466,0.343085,0.343933,0.182155,2025
2669,0.339121,0.338749,0.344107,0.342610,0.126241,2025
2670,0.338991,0.336765,0.340420,0.339747,0.139073,2025
2671,0.333577,0.334026,0.342306,0.338064,0.081290,2025
