In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
# Raw inputs: feature tables for the train and test splits.
train_df = pd.read_csv("raw/train.csv")
test_df = pd.read_csv("raw/test.csv")

# Training labels; the "Unnamed: 0" column stored in the file is discarded
# (presumably a saved index column -- verify against the CSV).
train_labels = pd.read_csv("raw/train_class_labels.csv").drop(columns=["Unnamed: 0"])

In [3]:
# Columns treated as categorical. Note that 'educational-num' is listed here
# even though its name suggests a numeric code.
categorical_features = [
    'workclass',
    'educational-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]

In [7]:
"""Classes for transformations"""

class MyCustomTransformations(BaseEstimator, TransformerMixin):
    """Fill missing text values with 'unk' and collapse one column to two levels.

    Parameters
    ----------
    column_name : str, default 'native-country'
        Column whose values are reduced to 'United-States' / 'Others'.

    Notes
    -----
    The transformer is stateless: ``fit`` learns nothing.
    """

    def __init__(self, column_name = 'native-country'):
        self.column_name = column_name

    def fit(self, df, y = None):
        """No-op; present so the class can be used as a pipeline step."""
        return self

    def transform(self, df, y = None):
        """Return a transformed copy of ``df``; the input frame is not modified.

        Missing values in object/string-typed columns are replaced by 'unk'.
        Numeric columns are left alone so that they keep their numeric dtype.
        In ``column_name`` every value other than exactly 'United-States'
        (missing values included) becomes 'Others'.
        """
        new_df = df.copy()
        for col in new_df.columns.tolist():
            is_text = (pd.api.types.is_object_dtype(new_df[col])
                       or pd.api.types.is_string_dtype(new_df[col]))
            if is_text:
                # fillna returns a new Series; the result has to be stored,
                # otherwise the missing values silently stay in place.
                new_df[col] = new_df[col].fillna('unk')
        new_df[self.column_name] = df[self.column_name].apply(lambda x: 'United-States' if x == 'United-States' else 'Others')
        return new_df

class FillMissingValues(BaseEstimator, TransformerMixin):
    """Fill missing values with each column's most frequent value (majority voting).

    Parameters
    ----------
    columns : list of str
        Columns in which missing values are filled.

    Attributes
    ----------
    fill_values_ : dict
        Set by ``fit``: maps each column to the most frequent value seen in the
        frame passed to ``fit``. Columns with no non-missing value are absent.
    """

    def __init__(self, columns = []):
        self.columns = columns

    @staticmethod
    def _most_frequent(series):
        """Return ``(found, value)`` for the most frequent non-missing value."""
        modes = series.mode()
        if modes.empty:
            return False, None
        # mode() returns a Series (several values on ties); take the first
        # one as a scalar so fillna() applies it to every missing row.
        return True, modes.iloc[0]

    def fit(self, df, y = None):
        """Learn the fill value of every configured column from ``df``."""
        self.fill_values_ = {}
        for col in self.columns:
            found, value = self._most_frequent(df[col])
            if found:
                self.fill_values_[col] = value
        return self

    def transform(self, df, y = None):
        """Return a copy of ``df`` with missing values filled.

        Values learned in ``fit`` are used when available; a column that was
        not seen in ``fit`` (or a transformer that was never fitted) falls back
        to the most frequent value of the frame being transformed. A column
        with no non-missing value at all is left unchanged.
        """
        filled = df.copy()
        learned = getattr(self, "fill_values_", {})
        for col in self.columns:
            if col in learned:
                found, value = True, learned[col]
            else:
                found, value = self._most_frequent(filled[col])
            if found:
                # Passing the Series returned by mode() directly would align
                # on the index and fill (almost) nothing.
                filled[col] = filled[col].fillna(value)
        return filled

class RemoveOutliers(BaseEstimator, TransformerMixin):
    """Cap outliers with the IQR rule (values are clipped, no rows are removed).

    Values below ``q1 - 1.5 * IQR`` or above ``q3 + 1.5 * IQR`` are replaced by
    the respective bound.

    Parameters
    ----------
    columns : list of str
        Numeric columns to clip.

    Attributes
    ----------
    bounds_ : dict
        Set by ``fit``: maps each column to its ``(lower, upper)`` clip bounds
        computed on the frame passed to ``fit``.
    """

    def __init__(self, columns = []):
        self.columns = columns

    @staticmethod
    def _iqr_bounds(series):
        """Return the ``(lower, upper)`` IQR fence for ``series``."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iq_range = q3 - q1
        return q1 - 1.5*iq_range, q3 + 1.5*iq_range

    def fit(self, df, y = None):
        """Learn the clip bounds of every configured column from ``df``."""
        self.bounds_ = {col: self._iqr_bounds(df[col]) for col in self.columns}
        return self

    def transform(self, df, y = None):
        """Return a copy of ``df`` with the configured columns clipped.

        Bounds learned in ``fit`` are applied so that new data is treated with
        the same thresholds as the training data. A column without learned
        bounds falls back to bounds computed on the frame being transformed.
        """
        clipped = df.copy()
        learned = getattr(self, "bounds_", {})
        for col in self.columns:
            if col in learned:
                lower, upper = learned[col]
            else:
                lower, upper = self._iqr_bounds(clipped[col])
            clipped[col] = clipped[col].clip(lower = lower, upper = upper)
        return clipped

class AddPolyFeatures(BaseEstimator, TransformerMixin):
    """Append polynomial features built from the selected columns.

    Parameters
    ----------
    columns : list of str
        Numeric columns fed to ``PolynomialFeatures`` (library defaults).
    """

    def __init__(self, columns = []):
        self.columns = columns

    def fit(self, df, y = None):
        """No-op; the expansion depends only on the number of input columns."""
        return self

    def transform(self, df, y = None):
        """Return a new frame with ``pf_0 .. pf_{k-1}`` columns appended.

        The caller's frame is not modified. The generated columns are named by
        position in the ``PolynomialFeatures`` output; with the library
        defaults that output also contains the constant term and the
        untouched inputs, which are kept so the column layout stays stable.
        """
        polynomial_features = PolynomialFeatures()
        new_features = polynomial_features.fit_transform(df[self.columns])
        rename_list = ["pf_" + str(i) for i in range(new_features.shape[1])]
        # Reuse df's own index so the new columns line up row by row even when
        # df is a slice with a non-default index (e.g. a CV fold).
        poly_df = pd.DataFrame(new_features, index=df.index, columns=rename_list)
        # Drop any pf_* columns left over from a previous pass before
        # appending, so the result never carries duplicated column names.
        base = df.drop(columns=rename_list, errors="ignore")
        return pd.concat([base, poly_df], axis = 1)

class MinMaxScale(BaseEstimator, TransformerMixin):
    """Min-max scale the selected numerical columns.

    Parameters
    ----------
    columns : list of str
        Numeric columns to scale.

    Attributes
    ----------
    scaler_ : MinMaxScaler
        Set by ``fit`` (when ``columns`` is non-empty): the scaler fitted on
        the frame passed to ``fit``.
    """

    def __init__(self,columns=[]):
        self.columns = columns

    def fit(self, df, y = None):
        """Learn the per-column minimum and maximum from ``df``."""
        if len(self.columns) > 0:
            self.scaler_ = MinMaxScaler().fit(df[self.columns])
        return self

    def transform(self, df, y = None):
        """Return a scaled copy of ``df``; the input frame is not modified.

        The scaler learned in ``fit`` is reused so that every frame is mapped
        with the same minimum/maximum. If the transformer was never fitted the
        scaler is fitted on the frame being transformed. With no columns
        configured the copy is returned unchanged.
        """
        scaled = df.copy()
        if len(self.columns) == 0:
            return scaled
        scaler = getattr(self, "scaler_", None)
        if scaler is None:
            scaler = MinMaxScaler().fit(scaled[self.columns])
        scaled[self.columns] = scaler.transform(scaled[self.columns])
        return scaled

class DropColumns(BaseEstimator, TransformerMixin):
    """Pipeline step that discards the configured columns.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to remove.
    """

    def __init__(self,columns=[]):
        self.columns = columns

    def fit(self, df, y = None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, df, y = None):
        """Return a new frame without the configured columns."""
        remaining = df.drop(columns=self.columns)
        return remaining

class OneHotEncode(BaseEstimator, TransformerMixin):
    """One-hot encode the selected categorical columns.

    Parameters
    ----------
    columns : list of str
        Categorical columns to encode. They are replaced by ``oe_0 .. oe_{k-1}``
        indicator columns appended after the remaining columns.

    Attributes
    ----------
    encoder_ : OneHotEncoder
        Set by ``fit``: the encoder fitted on the frame passed to ``fit``.
    """

    def __init__(self, columns = []):
        self.columns = columns

    @staticmethod
    def _new_encoder():
        """Build the underlying encoder.

        Categories unseen during ``fit`` are encoded as all zeros instead of
        raising. The dense/sparse switch is not passed here because its
        keyword was renamed between library versions; the output is densified
        in ``transform`` instead.
        """
        return OneHotEncoder(handle_unknown="ignore")

    def fit(self, df, y = None):
        """Learn the categories of every configured column from ``df``."""
        self.encoder_ = self._new_encoder().fit(df[self.columns])
        return self

    def transform(self, df, y = None):
        """Return a new frame with the configured columns one-hot encoded.

        The encoder learned in ``fit`` is reused so every frame gets the same
        indicator columns in the same order. If the transformer was never
        fitted the encoder is fitted on the frame being transformed.
        """
        oe = getattr(self, "encoder_", None)
        if oe is None:
            oe = self._new_encoder().fit(df[self.columns])
        oe_features = oe.transform(df[self.columns])
        if hasattr(oe_features, "toarray"):
            # Sparse result -> dense array.
            oe_features = oe_features.toarray()
        # Keep df's index so concat lines the rows up instead of aligning a
        # fresh 0..n-1 index against df's labels.
        oe_features = pd.DataFrame(
            oe_features,
            index=df.index,
            columns=["oe_" + str(i) for i in range(oe_features.shape[1])],
        )
        # Drop the columns this instance encoded, not a module-level list.
        return pd.concat([df.drop(self.columns, axis = 1), oe_features], axis = 1)

class LabelEncode(BaseEstimator, TransformerMixin):
    """Label-encode the selected categorical columns.

    Parameters
    ----------
    columns : list of str
        Categorical columns to encode.

    Attributes
    ----------
    encoders : list of LabelEncoder
        One fitted encoder per entry of ``columns`` (same order); empty until
        ``fit`` is called.
    """

    def __init__(self, columns = []):
        self.columns = columns
        self.encoders = []

    def fit(self, df, y = None):
        """Fit one ``LabelEncoder`` per configured column on ``df``."""
        # Rebuild the list on every call so repeated fits do not pile up
        # stale encoders from earlier calls.
        self.encoders = [LabelEncoder().fit(df[col]) for col in self.columns]
        return self

    def transform(self, df, y = None):
        """Return a copy of ``df`` with the configured columns label-encoded.

        The input frame is not modified. Indexing ``self.encoders`` fails with
        ``IndexError`` when ``fit`` has not been called.
        """
        encoded = df.copy()
        for col_no, col in enumerate(self.columns):
            encoded[col] = self.encoders[col_no].transform(encoded[col])
        return encoded

class Estimate(BaseEstimator, TransformerMixin):
    """Select a classifier by short code and train it on demand.

    Parameters
    ----------
    estimator : str, default "lg"
        One of "lg" (logistic regression), "svc", "dtc" (decision tree),
        "rf" (random forest), "knn" or "adt" (AdaBoost).

    Notes
    -----
    NOTE(review): ``fit`` accepts no feature argument and ``predict`` trains and
    returns a model rather than predictions, so this class does not follow the
    usual estimator call convention -- confirm the intended usage.
    """

    def __init__(self, estimator = "lg"):
        self.estimator = estimator

    def fit(self, y = None):
        """Does nothing and returns the instance."""
        return self

    def predict(self, df, y = None):
        """Train the selected classifier on ``(df, y)`` and return it.

        Returns ``None`` when ``self.estimator`` is not a known code.
        """
        # Code -> classifier class, checked in the listed order.
        known_models = (
            ("lg", LogisticRegression),
            ("svc", SVC),
            ("dtc", DecisionTreeClassifier),
            ("rf", RandomForestClassifier),
            ("knn", KNeighborsClassifier),
            ("adt", AdaBoostClassifier),
        )
        for code, model_class in known_models:
            if self.estimator == code:
                model = model_class()
                model.fit(df, y)
                return model
        return None

In [8]:
# Default preprocessing plan:
#   * replace missing values by majority voting
#   * one-hot encode all categorical features                  -- done
#   * split the data into train and test through the pipeline
#   * min-max scale the numerical variables                    -- done
# This should be enough as a baseline.

default_preprocessed_pipeline = Pipeline(steps=[
    ("dropper", DropColumns(columns=['Unnamed: 0', 'education'])),
    ("majority_voter", FillMissingValues(columns = ['workclass', 'occupation', 'native-country'])),
    ("min_max_scaler", MinMaxScale(columns=['age', 'fnlwgt', 'hours-per-week', 'capital-gain', 'capital-loss'])),
    ("one-hot_encoder", OneHotEncode(columns=categorical_features)),
])

In [10]:
# Run the default preprocessing on the training frame and save the result
# without the index column.
(
    default_preprocessed_pipeline
    .fit_transform(train_df)
    .to_csv("preprocessed/train.csv", index=False)
)

In [8]:
# current_estimator = RandomForestClassifier
#
# skf = StratifiedKFold(random_state = 42, shuffle = True)
# accuracy_metrics = []
# f1_score_metrics = []
# auc_roc_metrics = []
# y = train_labels['income_>50K'].values
#
# for train_index, test_index in skf.split(train_df, y):
#
#     # define pipeline
#     pipeLine = Pipeline([
#                      ("custom", MyCustomTransformations()),
#                      ("dropper", DropColumns(columns=['Unnamed: 0', 'education', 'capital-gain', 'capital-loss'])),
#                      ("remove_outliers", RemoveOutliers(columns = ["age", "fnlwgt", "hours-per-week"])),
#                      ("label_encoder", LabelEncode(columns=['workclass', 'occupation', "gender", "relationship", "native-country", "marital-status", "race"])),
#                      ("min_max_scaler", MinMaxScale(columns=["age", "fnlwgt", "hours-per-week"])),
#                      ("PF", AddPolyFeatures(columns = ["age", "fnlwgt", "hours-per-week"])),
#                      ("estimator", current_estimator())])
#
#     X_train, X_test = train_df.iloc[train_index], train_df.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#
#     # fit pipeline
#     pipeLine.fit(X_train, y_train)
#
#     # predict and get statistics
#     predictions = pipeLine.predict(X_test)
#     accuracy_metrics.append(accuracy_score(y_test, predictions))
#     f1_score_metrics.append(f1_score(y_test, predictions))
#     auc_roc_metrics.append(roc_auc_score(y_test, predictions))
#
# print("Accuracy = ", np.mean(accuracy_metrics))
# print("F1 Score = ", np.mean(f1_score_metrics))
# print("AUC-ROC = ", np.mean(auc_roc_metrics))

Accuracy =  0.8261028552135119
F1 Score =  0.6027797868943396
AUC-ROC =  0.7320646347902879
