In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [99]:
# Load the training and test sets from CSV files (paths are relative to the working directory).
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [100]:
# Show the (rows, columns) shape of the training frame as loaded.
df.shape

(595212, 59)

In [101]:
# Keep a reference to the label column under its own name.
target = df['target']

In [102]:
# Count the rows per class to see how balanced the labels are.
target.value_counts()

0    573518
1     21694
Name: target, dtype: int64

- Because the dataset is heavily imbalanced (21,694 positives out of 595,212 rows), we have to synthesize new data points.

In [103]:
# Remove the label column from the feature frame, in place.
# NOTE(review): because this mutates `df`, re-running the cell without reloading the data raises KeyError.
df.drop(columns=['target'],inplace=True)

### Getting the Metadata Dataframe

In [104]:
import pickle

# Load the per-column metadata table; its `DTypes`, `Dropped` and `Missing`
# columns drive the preprocessing step.
# NOTE(review): pickle.load can execute arbitrary code stored in the file --
# only load a pickle that was produced by a trusted source.
with open('df_metedata_pickle', 'rb') as metadata_file:
    # The context manager closes the file handle even if unpickling fails.
    df_metadata = pickle.load(metadata_file)

In [105]:
# Display the metadata table.
df_metadata

Unnamed: 0,DTypes,Dropped,Missing
id,int64,True,7.0
ps_ind_01,Ordinal,False,0.0
ps_ind_02_cat,Categorical,False,1.0
ps_ind_03,int64,False,4.423318
ps_ind_04_cat,Categorical,False,0.0
ps_ind_05_cat,Categorical,False,0.0
ps_ind_06_bin,Categorical,False,0.0
ps_ind_07_bin,Categorical,False,0.0
ps_ind_08_bin,Categorical,False,0.0
ps_ind_09_bin,Categorical,False,0.0


 - Before synthesizing the new data, let's complete the preprocessing.

### PreProcessing

- Dropping statistically insignificant columns, filling the missing values, and changing the data types of the columns accordingly.

In [106]:
def preprocessing(df, metadata=None):
    """Clean a feature frame in place, driven by a per-column metadata table.

    Steps, all applied to ``df`` itself:

    1. Replace every ``-1`` with ``NaN``.
    2. Drop each column whose ``Dropped`` entry in the metadata is truthy.
    3. Fill the ``NaN`` values of every remaining column with that column's
       ``Missing`` entry from the metadata.
    4. Cast columns whose ``DTypes`` entry is ``'Categorical'`` or
       ``'Ordinal'`` to the pandas ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    metadata : pandas.DataFrame, optional
        Table indexed by column name that provides the ``Dropped``,
        ``Missing`` and ``DTypes`` columns. When omitted, the notebook-level
        ``df_metadata`` is used.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a column of ``df`` has no row in the metadata table.
    """
    if metadata is None:
        metadata = df_metadata

    df.replace(to_replace=-1, value=np.nan, inplace=True)

    # Dropping insignificant columns: collect them first and drop once, so the
    # frame is not reshaped while its columns are being walked.
    dropped_columns = [col for col in df.columns if metadata.loc[col, 'Dropped']]
    df.drop(columns=dropped_columns, inplace=True)

    for col in df.columns:
        # Filling missing values. The result is assigned back to the frame:
        # `df[col].fillna(..., inplace=True)` acts on a temporary Series and
        # does not update `df` when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(metadata.loc[col, 'Missing'])
        # Changing the datatype of columns
        if metadata.loc[col, 'DTypes'] in ('Categorical', 'Ordinal'):
            df[col] = df[col].astype('category')

In [107]:
# Shape of the training frame before preprocessing is applied.
df.shape

(595212, 58)

In [108]:
# Apply the same cleaning to the training and test frames (both are changed in place).
preprocessing(df)
preprocessing(df_test)

In [117]:
# Count the missing values in each column of the training frame.
df.isnull().sum()

ps_ind_03        0
ps_ind_04_cat    0
ps_ind_06_bin    0
ps_ind_07_bin    0
ps_ind_08_bin    0
                ..
ps_car_15_100    0
ps_car_15_101    0
ps_car_15_102    0
ps_car_15_103    0
ps_car_15_104    0
Length: 178, dtype: int64

In [118]:
# Count the missing values in each column of the test frame.
df_test.isnull().sum()

ps_ind_03        0
ps_ind_04_cat    0
ps_ind_06_bin    0
ps_ind_07_bin    0
ps_ind_08_bin    0
                ..
ps_car_15_100    0
ps_car_15_101    0
ps_car_15_102    0
ps_car_15_103    0
ps_car_15_104    0
Length: 178, dtype: int64

### Operating with outliers

- The idea is to find outliers with the 1.5 × IQR rule (computed on the training data) and replace them with the training data's 1st or 99th percentile.

In [111]:
def _replace_where(frame, col, mask, value):
    """Set ``frame[col]`` to ``value`` on the rows where ``mask`` is True, in place."""
    if not mask.any():
        return
    column = frame[col]
    # An integer column cannot hold a fractional percentile; widen it to float
    # first so the assignment below does not depend on implicit upcasting.
    if pd.api.types.is_integer_dtype(column.dtype) and float(value) != int(value):
        frame[col] = column.astype('float64')
    # A single `.loc[rows, col]` assignment writes into the frame itself;
    # `frame[col].loc[rows] = ...` is chained indexing and may only update a
    # temporary Series.
    frame.loc[mask, col] = value


def outlier_processing(df,df_test):
    """Replace outliers of every non-categorical column, in place, on both frames.

    For each column of ``df`` whose dtype is not ``category``, the bounds
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` and the 1st / 99th percentiles
    are computed from ``df`` only. In both ``df`` and ``df_test``, values
    above the upper bound are replaced by the 99th percentile and values below
    the lower bound by the 1st percentile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the statistics are computed from. Modified in place.
    df_test : pandas.DataFrame
        Second frame that receives the same replacements. Modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a non-categorical column of ``df`` is missing from ``df_test``.
    """
    for col in df.columns:
        if df[col].dtype.name == 'category':
            continue

        # All statistics are taken before any value of the column is changed.
        first_quartile, third_quartile = np.percentile(df[col], [25, 75])
        first_percentile, ninetynine_percentile = np.percentile(df[col], [1, 99])
        IQR = third_quartile - first_quartile
        lower_bound = first_quartile - (1.5 * IQR)
        upper_bound = third_quartile + (1.5 * IQR)

        for frame in (df, df_test):
            _replace_where(frame, col, frame[col] > upper_bound, ninetynine_percentile)
        for frame in (df, df_test):
            _replace_where(frame, col, frame[col] < lower_bound, first_percentile)
        

In [112]:
# Replace outliers in both frames, using bounds computed from the training frame.
outlier_processing(df,df_test)

### Encoding

- The idea is to integer-encode the ordinal columns (with `LabelEncoder`) and one-hot encode the categorical columns (with `pd.get_dummies`), unless a column has only two values, in which case it is left as it is.

In [113]:
# Columns marked 'Ordinal' in the metadata that have more than two distinct values in the training frame.
ordinal_columns = [col for col in df.columns if df_metadata.loc[col,'DTypes'] == 'Ordinal' and df[col].nunique() > 2]

In [114]:
# Columns marked 'Categorical' in the metadata that have more than two distinct values in the training frame.
categorical_columns_great_2 = [col for col in df.columns if df_metadata.loc[col,'DTypes'] == 'Categorical' and df[col].nunique() > 2]

In [115]:
from sklearn.preprocessing import LabelEncoder
# Replace every ordinal column by an encoded column named `<col>label`, in both frames.
# NOTE(review): `col` and `label_encode` stay defined at notebook level after this loop ends.
for col in ordinal_columns:
    # A fresh encoder per column, fitted on the training values only.
    label_encode = LabelEncoder()
    df[col+'label'] = label_encode.fit_transform(df[col])
    # NOTE(review): presumably this fails if the test column holds a value never seen in training -- confirm.
    df_test[col+'label'] = label_encode.transform(df_test[col])
    # The original column is dropped once its encoded copy exists.
    df.drop(columns=[col],inplace=True)
    df_test.drop(columns=[col],inplace=True)

In [116]:
# One-hot encode the multi-valued categorical columns of both frames.
# `prefix` is left at its default so every dummy column is named after its own
# source column. Passing the leftover loop variable `col` as a single string
# prefix gave every dummy column the same prefix (e.g. `ps_car_15_100`), which
# mislabels the features and can create duplicate column names.
df = pd.get_dummies(df, columns=categorical_columns_great_2, drop_first=True)
df_test = pd.get_dummies(df_test, columns=categorical_columns_great_2, drop_first=True)

In [20]:
# Shape of the training frame after encoding.
df.shape

(595212, 197)

In [21]:
# Shape of the test frame after encoding.
df_test.shape

(892816, 197)

### Let's do the scaling

In [23]:
from sklearn.preprocessing import MinMaxScaler
# One scaler instance, fitted on the training frame and reused for the test frame.
scaler = MinMaxScaler()

In [24]:
# Fit the scaler on the training frame and scale it in the same call.
df_train_scale = scaler.fit_transform(df)

In [25]:
# Scale the test frame with the already-fitted scaler (no refit on test data).
df_test_scale = scaler.transform(df_test)

## Splitting the data into multiple folds

In [None]:
# Wrap the scaled values in DataFrames again, reusing the column labels of the unscaled frames.
# NOTE(review): the names are rebound to a different object type here, so the cell is not idempotent.
df_train_scale = pd.DataFrame(df_train_scale,columns=df.columns)
df_test_scale = pd.DataFrame(df_test_scale,columns=df_test.columns)

In [119]:
# Put the label back next to the scaled features (column-wise concat, aligned on the index).
chunks = [df_train_scale,target]
df_train_scale_target = pd.concat(chunks,axis=1)
# Separate the rows by class; `.copy()` makes each subset an independent frame.
df_minority = df_train_scale_target.loc[df_train_scale_target['target'] == 1].copy()
df_majority = df_train_scale_target.loc[df_train_scale_target['target'] == 0].copy()

NameError: name 'df_train_scale' is not defined

In [None]:
# Split the majority-class rows into 20 consecutive chunks of near-equal size.
# Row positions are split and then used with `iloc`, so every chunk is a
# DataFrame; calling `np.array_split` on the frame itself goes through the
# deprecated `DataFrame.swapaxes`.
splitted_frame = [
    df_majority.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(df_majority)), 20)
]

### Logistic Regression

In [141]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Base estimator for the grid search; the iteration cap is raised to 100000.
log_reg= LogisticRegression(max_iter=100000)

In [142]:
# Candidate values of the `C` hyper-parameter for the grid search.
param = {'C':[0.001,0.003,0.005,0.01,0.03,0.05,0.1,0.3,0.5,1]}

In [None]:
from sklearn.utils import shuffle
# For every majority chunk: build a grid search and a shuffled training frame made of
# that chunk plus all minority rows.
# NOTE(review): nothing is fitted inside this loop; `Glog_reg` and `df_temp_train` are
# overwritten on every iteration, so only the last pair survives the loop.
# NOTE(review): `shuffle` is called without a seed, so the row order differs between runs.
for frames in splitted_frame:
    Glog_reg = GridSearchCV(estimator = log_reg,param_grid = param, scoring = 'accuracy', cv=5)
    chunks_temp = [frames,df_minority]
    df_temp_train = shuffle(pd.concat(chunks_temp,axis=0))
    
    

In [144]:
# NOTE(review): `X_train_scale` is not defined in the visible part of this notebook -- it comes from kernel state.
X_train_scale.shape

(1124095, 33)

In [145]:
# NOTE(review): `y_train` is not defined in the visible part of this notebook -- it comes from kernel state.
y_train.shape

(1124095,)

In [146]:
# Run the grid search over `C` on the scaled training data.
Glog_reg.fit(X_train_scale,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1, 0.3,
                               0.5, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [150]:
# Best cross-validated score found by the search (scoring is 'accuracy').
Glog_reg.best_score_

0.8305250001112006

In [151]:
# Parameter setting that achieved the best score.
Glog_reg.best_params_

{'C': 0.3}

In [152]:
# Estimator configured with the best parameters.
Glog_reg.best_estimator_

LogisticRegression(C=0.3, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [153]:
# Rebuild the final model with every parameter spelled out.
# NOTE(review): the grid search output in this notebook reports best C = 0.3, but C=1 is used here -- confirm this is intended.
log_reg =LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [154]:
# Fit the final model.
# NOTE(review): this fits on `X_train`, while the grid search was fitted on `X_train_scale` -- confirm which input is intended.
log_reg.fit(X_train,y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [165]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,roc_curve,f1_score

In [164]:
# Predict labels for the same data the model was fitted on.
y_train_predict = log_reg.predict(X_train)

In [166]:
# Confusion matrix of true vs. predicted labels on the training data.
confusion_matrix_train=confusion_matrix(y_train,y_train_predict)
confusion_matrix_train

array([[513881,  48078],
       [142431, 419705]], dtype=int64)

In [167]:
# Print the headline classification metrics for the training predictions,
# one "<label> <value>" line per metric.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("recall:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, metric_fn(y_train, y_train_predict))

Accuracy: 0.8305223312976217
Precision: 0.8972215749610396
recall: 0.746625371796149
f1_score: 0.8150252592679619


### For Testing Set

In [168]:
# Predict labels for the held-out data.
# NOTE(review): `X_test` is not defined in the visible part of this notebook -- it comes from kernel state.
y_test_predict = log_reg.predict(X_test)

In [170]:
# Confusion matrix of true vs. predicted labels on the held-out data.
confusion_matrix_test=confusion_matrix(y_test,y_test_predict)
confusion_matrix_test

array([[10567,   992],
       [ 2858,  8524]], dtype=int64)

In [171]:
# Print the headline classification metrics for the held-out predictions,
# one "<label> <value>" line per metric.
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("recall:", recall_score),
    ("f1_score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_test_predict))

Accuracy: 0.8321781962425352
Precision: 0.8957545187053384
recall: 0.748901774732033
f1_score: 0.8157718441956168
