In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
class Preprocessor:
    """Impute, encode and rescale a feature table so every column is numeric.

    The pipeline run by ``preprocess`` is:

    1. fill missing numeric values with the column mean,
    2. fill missing categorical values with the column's most frequent value,
    3. encode categorical columns (integer codes for one or two categories,
       one-hot indicators for more),
    4. rescale all numeric columns with the configured scaler,
    5. cast boolean columns to integers.

    A new scaler is created and fitted inside every ``handle_numerical_features``
    call, so nothing learned from one frame is reused on another.
    """

    # Names accepted for ``numerical_scaling``.
    SUPPORTED_SCALINGS = ('standard', 'minmax')

    def __init__(self, numerical_scaling='minmax'):
        """Store the scaling strategy.

        Args:
            numerical_scaling: ``'minmax'`` (default) or ``'standard'``.

        Raises:
            ValueError: if ``numerical_scaling`` is not a supported name.
        """
        self._check_scaling(numerical_scaling)
        self.numerical_scaling = numerical_scaling

    @classmethod
    def _check_scaling(cls, name):
        """Reject scaling names that have no matching scaler."""
        if name not in cls.SUPPORTED_SCALINGS:
            raise ValueError(
                f"numerical_scaling must be one of {cls.SUPPORTED_SCALINGS}, got {name!r}"
            )

    @staticmethod
    def _is_categorical(series):
        """Return True for columns holding text/object data (object or string dtypes)."""
        dtype = series.dtype
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)


    # PREPROCESSING
    # ---------------------
    def preprocess(self, df):
        """Run the full pipeline on ``df`` and return the processed frame.

        Args:
            df: feature table (the label column should already be removed).

        Returns:
            A DataFrame with imputed, encoded and scaled columns.
        """
        df = self._handle_missing_values(df)
        df = self._handle_features(df)

        # One-hot indicator columns can come back as booleans (depending on the
        # pandas version); store them as 0/1 integers like the other features.
        boolean_cols = df.select_dtypes(include='bool').columns
        if len(boolean_cols) > 0:
            df[boolean_cols] = df[boolean_cols].astype(int)

        return df



    # HANDLING - MISSING VALUES
    # --------------------------------
    def _handle_missing_values(self, df):
        """Impute numeric columns first, then categorical ones."""
        df = self.handle_missing_numerical(df)
        df = self.handle_missing_categorical(df)

        return df

    # Numerical
    def handle_missing_numerical(self, df):
        """Return a copy of ``df`` with numeric NaNs replaced by the column mean."""
        numeric_means = df.select_dtypes(include='number').mean()
        # Filling with a Series only touches the columns named in its index,
        # so non-numeric columns are left as they are.
        return df.fillna(numeric_means)

    # Categorical
    def handle_missing_categorical(self, df):
        """Fill missing categorical values with each column's most frequent value.

        Columns are updated on ``df`` itself, which is also returned. A column
        that has no mode (for example one that is entirely missing) is left
        untouched instead of raising.
        """
        for col in df.columns:
            if not self._is_categorical(df[col]):
                continue
            modes = df[col].mode()
            if modes.empty:
                # Nothing to impute with.
                continue
            # Modes come back sorted; the first one is used on ties.
            df[col] = df[col].fillna(modes.iloc[0])

        return df



    # HANDLING - FEATURES
    # ----------------------------
    def _handle_features(self, features):
        """Encode categorical columns, then scale the numeric ones."""
        features = self.handle_categorical_features(features)
        features = self.handle_numerical_features(features)

        return features


    # Categorical
    def handle_categorical_features(self, features):
        """Encode every categorical column numerically.

        * more than two categories -> one-hot columns via ``pd.get_dummies``
        * one or two categories    -> integer codes 0/1 assigned in sorted
          category order, so the code of a category does not depend on which
          value happens to appear first in the frame

        Missing values are not counted as a category; a column with no
        non-missing value is left unchanged.
        """
        categorical_columns = [
            column for column in features.columns
            if self._is_categorical(features[column])
        ]

        for column in categorical_columns:
            # key=str keeps the ordering well defined for mixed-type object columns.
            categories = sorted(features[column].dropna().unique(), key=str)
            if not categories:
                continue

            if len(categories) > 2:
                features = pd.get_dummies(features, columns=[column])
            else:
                codes = {category: code for code, category in enumerate(categories)}
                features[column] = features[column].map(codes)

        return features

    # Numerical
    def handle_numerical_features(self, features):
        """Rescale all numeric columns of ``features`` with the configured scaler.

        The scaler is fitted on ``features`` itself. Frames without numeric
        columns or without rows are returned unchanged.

        Raises:
            ValueError: if ``self.numerical_scaling`` is not a supported name.
        """
        self._check_scaling(self.numerical_scaling)

        numeric_cols = features.select_dtypes(include=['number']).columns
        if len(numeric_cols) == 0 or len(features) == 0:
            # Nothing to fit a scaler on.
            return features

        if self.numerical_scaling == 'standard':
            scaler = StandardScaler()
        else:
            scaler = MinMaxScaler()

        features[numeric_cols] = scaler.fit_transform(features[numeric_cols])
        return features

In [22]:
def dataset_one(input_path="./1st/WA_Fn-UseC_-Telco-Customer-Churn.csv",
                output_path="./dataset.csv"):
    """Preprocess the Telco customer-churn CSV and write it out as a labelled dataset.

    Args:
        input_path: CSV to read; must contain ``customerID``, ``Churn`` and
            ``TotalCharges`` columns.
        output_path: where the preprocessed CSV (features plus a ``label``
            column) is written.
    """
    data = pd.read_csv(input_path)

    # Initial data cleaning: drop the identifier column and every row that has
    # no label.
    df_filtered = data.drop(columns=["customerID"]).dropna(subset=["Churn"])

    # Splitting the data into features and target.
    # The target must come from the filtered frame: concat below aligns on the
    # index, so taking it from the raw frame would bring the dropped rows back
    # with all-NaN features.
    features = df_filtered.drop(columns=["Churn"])
    target = df_filtered["Churn"].map({"Yes": 1, "No": 0})

    # handling special cases: entries of TotalCharges that cannot be parsed as
    # numbers become NaN and are imputed by the preprocessor.
    features = features.assign(
        TotalCharges=pd.to_numeric(features["TotalCharges"], errors='coerce')
    )

    # preprocessing
    preprocessor = Preprocessor()
    features = preprocessor.preprocess(features)

    # saving the preprocessed dataset
    preprocessed_dataset = pd.concat([features, target], axis=1)
    preprocessed_dataset = preprocessed_dataset.rename(columns={"Churn": "label"})

    preprocessed_dataset.to_csv(output_path, index=False)

In [23]:
def dataset_three(input_path="./3rd/creditcard.csv", output_path="./dataset.csv",
                  n_negative=20000, random_state=42):
    """Preprocess the credit-card CSV and write a down-sampled, shuffled dataset.

    All rows with label 1 are kept; rows with label 0 are randomly
    down-sampled to at most ``n_negative`` rows to reduce the class imbalance
    and the dataset size.

    Args:
        input_path: CSV to read; must contain a ``Class`` column.
        output_path: where the resulting CSV (features plus ``label``) is written.
        n_negative: maximum number of label-0 rows to keep.
        random_state: seed used for both the down-sampling and the final shuffle.
    """
    data = pd.read_csv(input_path)

    # Class counts noted in the original notebook -> 0: 284315, 1: 492
    # (high class imbalance).

    # Initial data cleaning: rows without a label are unusable.
    df_filtered = data.dropna(subset=['Class'])

    # Splitting the data into features and target
    features = df_filtered.drop("Class", axis=1)
    target = df_filtered["Class"]

    # preprocessing
    preprocessor = Preprocessor()
    features = preprocessor.preprocess(features)

    # re-attach the label
    preprocessed_dataset = pd.concat([features, target], axis=1)
    preprocessed_dataset = preprocessed_dataset.rename(columns={"Class": "label"})

    # EXTRA PROCESS
    # taking a portion of data to balance the classes and efficiency
    negatives = preprocessed_dataset[preprocessed_dataset['label'] == 0]
    positives = preprocessed_dataset[preprocessed_dataset['label'] == 1]

    # Cap the sample size: sampling without replacement raises when asked for
    # more rows than are available.
    n_keep = min(n_negative, len(negatives))
    zeros_df = negatives.sample(n=n_keep, random_state=random_state)

    # Shuffle so the two classes are interleaved in the output file.
    combined_df = pd.concat([zeros_df, positives])
    combined_df = combined_df.sample(frac=1, random_state=random_state)

    combined_df.to_csv(output_path, index=False)

In [24]:
# Column layout of the Adult files, e.g.
# 39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
_ADULT_COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']

# Columns of the Adult files that hold numbers rather than categories.
_ADULT_NUMERIC_COLUMNS = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']


def _load_adult_split(path):
    """Read one Adult file and return ``(features, target)`` ready for preprocessing."""
    data = pd.read_csv(path, header=None, names=_ADULT_COLUMNS)

    # handling special cases: '?' (with or without the leading space left by
    # the ", " separator) marks a missing value. An explicit NaN is used
    # because ``replace(x, None)`` does not behave the same way in every
    # pandas version.
    data = data.replace({'?': np.nan, ' ?': np.nan})

    # Initial data cleaning: rows without a label are unusable.
    data = data.dropna(subset=['income'])

    # A stray non-data line in the file (the UCI adult.test file typically
    # starts with one) makes pandas read a numeric column as text. Coerce the
    # numeric columns back so they are scaled instead of one-hot encoded.
    data = data.assign(**{
        column: pd.to_numeric(data[column], errors='coerce')
        for column in _ADULT_NUMERIC_COLUMNS
    })

    # Splitting the data into features and target
    features = data.drop('income', axis=1)

    # handling target: labels look like ' <=50K' in the training file and
    # ' <=50K.' in the test file; normalise both spellings before mapping.
    labels = data['income'].str.strip().str.rstrip('.')
    target = labels.map({'<=50K': 0, '>50K': 1})

    return features, target


def _save_adult_split(features, target, path):
    """Write features plus the target (renamed to ``label``) to ``path`` as CSV."""
    preprocessed_dataset = pd.concat([features, target], axis=1)
    preprocessed_dataset = preprocessed_dataset.rename(columns={"income": "label"})
    preprocessed_dataset.to_csv(path, index=False)


def dataset_two(train_path="./2nd/adult.data", test_path="./2nd/adult.test",
                train_output_path="./dataset.csv",
                test_output_path="./dataset_test.csv"):
    """Preprocess the Adult train and test files and write both as CSV.

    The test features are aligned to the training columns: one-hot columns
    that only exist in the training data are added as all-zero columns, and
    columns that only exist in the test data are dropped.

    Args:
        train_path: path of the training file (``adult.data``).
        test_path: path of the test file (``adult.test``).
        train_output_path: destination of the preprocessed training set.
        test_output_path: destination of the preprocessed test set.
    """
    #  .DATA file
    # -----------------
    features, target = _load_adult_split(train_path)
    features = Preprocessor().preprocess(features)
    _save_adult_split(features, target, train_output_path)

    # .TEST file
    # -----------------
    features_test, target_test = _load_adult_split(test_path)

    # NOTE(review): the test split is preprocessed by a separate Preprocessor,
    # so it is imputed and scaled with its own statistics rather than the
    # training ones. Confirm this is intended.
    features_test = Preprocessor().preprocess(features_test)

    # Same columns, same order as the training set; missing ones become 0.
    features_test = features_test.reindex(columns=features.columns, fill_value=0)

    _save_adult_split(features_test, target_test, test_output_path)

In [25]:
# Driver cell: uncomment exactly one call to choose the dataset to preprocess.
# All three functions write their result to ./dataset.csv, so running more
# than one overwrites the earlier output.
# dataset_one()
dataset_two()
# dataset_three()

(32561, 105)
(16281, 105)
