In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [36]:
id_test = df_test['id']

KeyError: 'id'

In [3]:
df.shape

(595212, 59)

In [4]:
target = df['target']

In [5]:
target.value_counts()

0    573518
1     21694
Name: target, dtype: int64

- As it is imbalanced dataset we have to synthesize for new data points 

In [6]:
df.drop(columns=['target'],inplace=True)

### Getting the Metadata Dataframe

In [7]:
import pickle
df_metadata = pickle.load(open('df_metedata_pickle','rb'))

In [8]:
df_metadata

Unnamed: 0,DTypes,Dropped,Missing
id,int64,True,7.0
ps_ind_01,Ordinal,False,0.0
ps_ind_02_cat,Categorical,False,1.0
ps_ind_03,int64,False,4.423318
ps_ind_04_cat,Categorical,False,0.0
ps_ind_05_cat,Categorical,False,0.0
ps_ind_06_bin,Categorical,False,0.0
ps_ind_07_bin,Categorical,False,0.0
ps_ind_08_bin,Categorical,False,0.0
ps_ind_09_bin,Categorical,False,0.0


 - Before Synthesizing the new data lets complete the pre processing

### PreProcessing

- Dropping Stastically insignificant columns, Filling the Missing values and changing the datatypes of columns accordingly

In [9]:
def preprocessing(df):
    df.replace(to_replace=-1,value=np.nan,inplace=True)
    for col in df.columns:
        #Dropping Insignificant Columns
        if df_metadata.loc[col,'Dropped']:
            df.drop(columns=[col],inplace=True)
            continue
        #Filling Missing Values
        df[col].fillna(df_metadata.loc[col,'Missing'],inplace=True)
        #Changing the datatype of columns
        if (df_metadata.loc[col,'DTypes'] == 'Categorical') or (df_metadata.loc[col,'DTypes'] == 'Ordinal'):
            df[col] = df[col].astype('category')

In [10]:
df.shape

(595212, 58)

In [11]:
preprocessing(df)
preprocessing(df_test)

In [12]:
df.isnull().sum()

ps_ind_01        0
ps_ind_02_cat    0
ps_ind_03        0
ps_ind_04_cat    0
ps_ind_05_cat    0
ps_ind_06_bin    0
ps_ind_07_bin    0
ps_ind_08_bin    0
ps_ind_09_bin    0
ps_ind_12_bin    0
ps_ind_14        0
ps_ind_15        0
ps_ind_16_bin    0
ps_ind_17_bin    0
ps_ind_18_bin    0
ps_reg_01        0
ps_reg_02        0
ps_reg_03        0
ps_car_01_cat    0
ps_car_02_cat    0
ps_car_04_cat    0
ps_car_05_cat    0
ps_car_06_cat    0
ps_car_07_cat    0
ps_car_08_cat    0
ps_car_09_cat    0
ps_car_11_cat    0
ps_car_11        0
ps_car_12        0
ps_car_13        0
ps_car_14        0
ps_car_15        0
dtype: int64

In [13]:
df_test.isnull().sum()

ps_ind_01        0
ps_ind_02_cat    0
ps_ind_03        0
ps_ind_04_cat    0
ps_ind_05_cat    0
ps_ind_06_bin    0
ps_ind_07_bin    0
ps_ind_08_bin    0
ps_ind_09_bin    0
ps_ind_12_bin    0
ps_ind_14        0
ps_ind_15        0
ps_ind_16_bin    0
ps_ind_17_bin    0
ps_ind_18_bin    0
ps_reg_01        0
ps_reg_02        0
ps_reg_03        0
ps_car_01_cat    0
ps_car_02_cat    0
ps_car_04_cat    0
ps_car_05_cat    0
ps_car_06_cat    0
ps_car_07_cat    0
ps_car_08_cat    0
ps_car_09_cat    0
ps_car_11_cat    0
ps_car_11        0
ps_car_12        0
ps_car_13        0
ps_car_14        0
ps_car_15        0
dtype: int64

### Operating with outliers

- The Idea is to find the outliers and replace them accordingly

In [14]:
def outlier_processing(df,df_test):
    for col in df.columns:
        if df[col].dtype.name != 'category':
            first_quartile, third_quartile = np.percentile(df[col],[25,75])
            first_percetnile, ninetynine_percentile = np.percentile(df[col],[1,99])
            IQR = third_quartile - first_quartile
            lower_bound = first_quartile - (1.5*IQR)
            upper_bound = third_quartile + (1.5*IQR)
            df[col].loc[df[col]>upper_bound] = ninetynine_percentile
            df_test[col].loc[df_test[col]>upper_bound] = ninetynine_percentile
            df[col].loc[df[col]<lower_bound] = first_percetnile
            df_test[col].loc[df_test[col]<lower_bound] = first_percetnile
        

In [15]:
outlier_processing(df,df_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


### Encoding

- The Idea is to encode the ordinal values with Ordinal Encoder and Categorical values with OneHot Encoder - unless they are binary

In [16]:
ordinal_columns = [col for col in df.columns if df_metadata.loc[col,'DTypes'] == 'Ordinal' and df[col].nunique() > 2]

In [17]:
categorical_columns_great_2 = [col for col in df.columns if df_metadata.loc[col,'DTypes'] == 'Categorical' and df[col].nunique() > 2]

In [18]:
from sklearn.preprocessing import LabelEncoder
for col in ordinal_columns:
    label_encode = LabelEncoder()
    df[col+'label'] = label_encode.fit_transform(df[col])
    df_test[col+'label'] = label_encode.transform(df_test[col])
    df.drop(columns=[col],inplace=True)
    df_test.drop(columns=[col],inplace=True)

In [19]:
df = pd.get_dummies(df,prefix=col,columns=categorical_columns_great_2,drop_first=True)
df_test = pd.get_dummies(df_test,columns=categorical_columns_great_2,prefix=col,drop_first=True)

In [20]:
df.shape

(595212, 178)

In [21]:
df_test.shape

(892816, 178)

### Lets do the scaling

In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [23]:
df_train_scale = scaler.fit_transform(df)

In [24]:
df_test_scale = scaler.transform(df_test)

## Making data in to multiple folds

In [25]:
df_train_scale = pd.DataFrame(df_train_scale,columns=df.columns)
df_test_scale = pd.DataFrame(df_test_scale,columns=df_test.columns)

In [26]:
chunks = [df_train_scale,target]
df_train_scale_target = pd.concat(chunks,axis=1)
df_minority = df_train_scale_target.loc[df_train_scale_target['target'] == 1].copy()
df_majority = df_train_scale_target.loc[df_train_scale_target['target'] == 0].copy()

In [27]:
splitted_frame = np.array_split(df_majority, 20)

### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
log_reg= LogisticRegression(max_iter=100000)

In [29]:
param = {'C':[0.001,0.003,0.005,0.01,0.03,0.05,0.1,0.3,0.5,1]}

In [32]:
from sklearn.utils import shuffle
y_train_pred_proba = 0
y_test_pred_proba = 0
for frames in splitted_frame:
    Glog_reg = GridSearchCV(estimator = log_reg,param_grid = param, scoring = 'accuracy', cv=5)
    chunks_temp = [frames,df_minority]
    df_temp_train = shuffle(pd.concat(chunks_temp,axis=0))
    target_train = df_temp_train['target']
    df_temp_train.drop(columns=['target'],inplace=True)
    Glog_reg.fit(df_temp_train,target_train)
    best_model = Glog_reg.best_estimator_
    best_model.fit(df_temp_train,target_train)
    y_train_pred_proba = y_train_pred_proba + best_model.predict_proba(df_train_scale)[:,1]
    y_test_pred_proba = y_test_pred_proba + best_model.predict_proba(df_test_scale)[:,1]
    

In [33]:
y_train_pred_proba = y_train_pred_proba/20
y_test_pred_proba = y_test_pred_proba/20

In [34]:
y_train_pred_proba

array([0.54147991, 0.40948125, 0.32760853, ..., 0.31136847, 0.44760403,
       0.34977448])

In [35]:
y_test_pred_proba

array([0.36798631, 0.40888746, 0.33988719, ..., 0.44982776, 0.34751376,
       0.41190061])

In [165]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,roc_curve,f1_score

In [164]:
y_train_predict = y_train_pred_proba.map()

In [166]:
confusion_matrix_train=confusion_matrix(y_train,y_train_predict)
confusion_matrix_train

array([[513881,  48078],
       [142431, 419705]], dtype=int64)

In [167]:
print("Accuracy:", accuracy_score(y_train,y_train_predict))
print("Precision:", precision_score(y_train,y_train_predict))
print("recall:", recall_score(y_train,y_train_predict))
print("f1_score:", f1_score(y_train,y_train_predict))

Accuracy: 0.8305223312976217
Precision: 0.8972215749610396
recall: 0.746625371796149
f1_score: 0.8150252592679619


### For Testing Set

In [168]:
y_test_predict = log_reg.predict(X_test)

In [170]:
confusion_matrix_test=confusion_matrix(y_test,y_test_predict)
confusion_matrix_test

array([[10567,   992],
       [ 2858,  8524]], dtype=int64)

In [171]:
print("Accuracy:", accuracy_score(y_test,y_test_predict))
print("Precision:", precision_score(y_test,y_test_predict))
print("recall:", recall_score(y_test,y_test_predict))
print("f1_score:", f1_score(y_test,y_test_predict))

Accuracy: 0.8321781962425352
Precision: 0.8957545187053384
recall: 0.748901774732033
f1_score: 0.8157718441956168
