In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import missingno as msno

In [2]:
def fill_missing_median(df, columns=None):
    """Return a copy of ``df`` with missing values replaced by column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    columns : str or iterable of column labels, optional
        Columns to impute. A single column name may be given as a plain
        string. When ``None`` (the default) every numeric column of ``df``
        is imputed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each selected column has its missing values
        filled with that column's own median.

    Notes
    -----
    Columns that are not selected (for example text columns under the
    default selection) are returned untouched, missing values included.
    """
    df_copy = df.copy()
    if columns is None:
        columns = df_copy.select_dtypes(include=np.number).columns
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character
        # ('col_1' -> 'c', 'o', 'l', ...) and fail with a KeyError.
        columns = [columns]
    for col in columns:
        df_copy[col] = df_copy[col].fillna(df_copy[col].median())
    return df_copy

def drop_missing(df, columns=None, threshold=None):
    """Return a new frame with rows containing missing values removed.

    Exactly one strategy is applied, chosen in this order:

    1. ``columns`` given: drop rows that have a missing value in any of
       those columns. ``threshold`` is ignored in this case.
    2. ``threshold`` given: keep only rows with at least
       ``floor(threshold * number_of_columns)`` non-missing values.
    3. Neither given: drop every row that has any missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : column label or list of labels, optional
        Columns to inspect for missing values.
    threshold : float, optional
        Fraction of the frame's columns that must be non-missing for a row
        to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that survive the selected rule, with their
        original index labels.
    """
    # ``dropna`` already returns a new object, so no up-front copy is needed.
    if columns is not None:
        return df.dropna(subset=columns)
    if threshold is not None:
        # Round before truncating: binary floating point makes products such
        # as 0.29 * 100 come out as 28.999999999999996, which a bare int()
        # would turn into 28 instead of the intended 29.
        min_non_null = int(round(threshold * df.shape[1], 9))
        return df.dropna(thresh=min_non_null)
    return df.dropna()

def normalize_data(df, columns=None, method='minmax'):
    """Return a copy of ``df`` with the selected columns rescaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    columns : str or list-like of column labels, optional
        Columns to rescale. A single column name may be given as a plain
        string. When ``None`` (the default) every numeric column is used.
    method : str, default 'minmax'
        ``'minmax'`` fits a ``MinMaxScaler``; any other value fits a
        ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose selected columns hold the scaler's output. If
        there is nothing to scale (no selected columns, or a frame with no
        rows) the copy is returned unchanged.
    """
    df_copy = df.copy()
    if columns is None:
        columns = df_copy.select_dtypes(include=np.number).columns
    elif isinstance(columns, str):
        # Selecting with a bare string yields a 1-D Series, which the
        # scalers reject; wrap it so a 2-D frame is passed instead.
        columns = [columns]

    # scikit-learn scalers raise on input with zero features or zero
    # samples, so a frame with nothing to scale is returned as-is.
    if len(columns) == 0 or df_copy.empty:
        return df_copy

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
    df_copy[columns] = scaler.fit_transform(df_copy[columns])
    return df_copy

In [3]:
# Load the raw dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will not run elsewhere until it is pointed at a local copy of the file.
origin_df = pd.read_csv(
    filepath_or_buffer='C:/Users/Zhangchensi/bootcamp_Chensi_Zhang/data/raw/origin_data.csv'
)

In [4]:
# Cleaning pipeline, applied to the raw frame in order:
#   1. fill missing values in numeric columns with the column median,
#   2. drop every row that still has a missing value in any column,
#   3. rescale the numeric columns (default method: 'minmax').
cleaned_df = (
    origin_df
    .pipe(fill_missing_median)
    .pipe(drop_missing)
    .pipe(normalize_data)
)
cleaned_df

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
0,0.526922,0.551232,D,0.272910,0.715299,0.921844,0.368866,2020-08-07,0.319795,2020-07-15
1,0.437412,0.551232,A,0.055388,0.484296,0.550100,0.401290,2020-01-08,0.569774,2020-05-12
3,0.671595,0.389677,E,0.890232,0.305978,0.391784,0.304576,2020-10-01,0.725345,2020-08-07
5,0.423898,0.469062,A,0.855992,0.367781,0.309619,0.653744,2020-03-06,0.060987,2020-02-10
7,0.565084,0.782365,C,0.748238,0.937183,0.208417,0.265381,2020-03-26,0.423390,2020-01-02
...,...,...,...,...,...,...,...,...,...,...
493,0.335492,0.401581,D,0.854985,0.681864,0.045090,0.535259,2020-04-19,0.296066,2020-09-07
494,0.671609,0.628977,E,0.920443,0.161094,0.301603,0.535259,2020-07-21,0.447413,2020-03-24
496,0.310688,0.551232,E,0.620342,0.908815,0.912826,0.535259,2020-01-04,0.810685,2020-03-18
498,0.333472,0.474312,B,0.693857,0.528875,0.188377,0.617082,2020-06-20,0.464225,2020-10-11


In [5]:
# Raw frame: print per-column dtypes and non-null counts, then show the
# first five rows as the cell output.
origin_df.info()
origin_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   col_1   456 non-null    float64
 1   col_2   448 non-null    float64
 2   col_3   451 non-null    object 
 3   col_4   480 non-null    float64
 4   col_5   471 non-null    float64
 5   col_6   477 non-null    float64
 6   col_7   441 non-null    float64
 7   col_8   462 non-null    object 
 8   col_9   450 non-null    float64
 9   col_10  469 non-null    object 
dtypes: float64(7), object(3)
memory usage: 39.2+ KB


Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
0,49.671415,,D,276.0,717.0,920.0,-98.465478,2020-08-07,-89.037706,2020-07-15
1,-13.82643,,A,60.0,489.0,549.0,-79.252345,2020-01-08,62.761121,2020-05-12
2,64.768854,120.650897,B,318.0,493.0,13.0,-42.359858,,-90.680773,2020-09-11
3,152.302986,-81.693567,E,889.0,313.0,391.0,-136.56174,2020-10-01,157.230626,2020-08-07
4,-23.415337,36.867331,E,51.0,605.0,346.0,-97.412185,2020-12-18,-90.097002,


In [6]:
# Cleaned frame: print per-column dtypes and non-null counts, then show the
# first five rows as the cell output.
cleaned_df.info()
cleaned_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 499
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   col_1   392 non-null    float64
 1   col_2   392 non-null    float64
 2   col_3   392 non-null    object 
 3   col_4   392 non-null    float64
 4   col_5   392 non-null    float64
 5   col_6   392 non-null    float64
 6   col_7   392 non-null    float64
 7   col_8   392 non-null    object 
 8   col_9   392 non-null    float64
 9   col_10  392 non-null    object 
dtypes: float64(7), object(3)
memory usage: 33.7+ KB


Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10
0,0.526922,0.551232,D,0.27291,0.715299,0.921844,0.368866,2020-08-07,0.319795,2020-07-15
1,0.437412,0.551232,A,0.055388,0.484296,0.5501,0.40129,2020-01-08,0.569774,2020-05-12
3,0.671595,0.389677,E,0.890232,0.305978,0.391784,0.304576,2020-10-01,0.725345,2020-08-07
5,0.423898,0.469062,A,0.855992,0.367781,0.309619,0.653744,2020-03-06,0.060987,2020-02-10
7,0.565084,0.782365,C,0.748238,0.937183,0.208417,0.265381,2020-03-26,0.42339,2020-01-02


In [7]:
# Persist the cleaned frame as CSV.
# index=True (the pandas default) writes the row labels as the first,
# unnamed column of the file.
# NOTE(review): absolute, machine-specific output path.
cleaned_df.to_csv(
    path_or_buf='C:/Users/Zhangchensi/bootcamp_Chensi_Zhang/data/processed/processed_data.csv',
    index=True,
)