In [1]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator,TransformerMixin

from mypipes import *

In [2]:
train_file='rg_train.csv'
test_file='rg_test.csv'

bd_train=pd.read_csv(train_file)
bd_test=pd.read_csv(test_file)

In [3]:
bd_train.head()

Unnamed: 0,REF_NO,children,age_band,status,occupation,occupation_partner,home_status,family_income,self_employed,self_employed_partner,...,Investment.Tax.Saving.Bond,Home.Loan,Online.Purchase.Amount,Revenue.Grid,gender,region,Investment.in.Commudity,Investment.in.Equity,Investment.in.Derivative,Portfolio.Balance
0,2148,1,45-50,Partner,Professional,Professional,Rent Privately,">=35,000",Yes,Yes,...,7.49,2.48,0.0,2,Female,South West,65.87,9.27,30.93,87.48
1,8099,1,61-65,Partner,Retired,Retired,Own Home,"<12,500, >=10,000",No,No,...,0.0,3.99,0.0,2,Female,Unknown,42.46,4.49,26.23,110.73
2,6611,3,31-35,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,0.0,0.0,0.0,2,Male,East Anglia,75.38,0.0,26.66,127.57
3,1950,Zero,55-60,Partner,Professional,Professional,Own Home,">=35,000",No,No,...,2.0,0.0,0.0,2,Female,North West,34.78,6.91,29.24,33.79
4,10857,2,51-55,Partner,Manual Worker,Manual Worker,Own Home,"<27,500, >=25,000",Yes,Yes,...,0.0,0.0,0.0,2,Female,South West,48.58,9.58,20.65,56.17


In [4]:
bd_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 32 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   REF_NO                           8124 non-null   int64  
 1   children                         8124 non-null   object 
 2   age_band                         8124 non-null   object 
 3   status                           8124 non-null   object 
 4   occupation                       8124 non-null   object 
 5   occupation_partner               8124 non-null   object 
 6   home_status                      8124 non-null   object 
 7   family_income                    8124 non-null   object 
 8   self_employed                    8124 non-null   object 
 9   self_employed_partner            8124 non-null   object 
 10  year_last_moved                  8124 non-null   int64  
 11  TVarea                           8124 non-null   object 
 12  post_code           

In [5]:
bd_train['Revenue.Grid'].value_counts()

2    7261
1     863
Name: Revenue.Grid, dtype: int64

In [6]:
bd_train['children'].value_counts(dropna=True)

Zero    4966
1       1491
2       1282
3        373
4+        12
Name: children, dtype: int64

In [7]:
bd_train['post_code'].nunique()

8048

In [8]:
pd.crosstab(bd_train['post_area'],bd_train['Revenue.Grid'], normalize=True).reset_index()

Revenue.Grid,post_area,1,2
0,AB12,0.000000,0.000492
1,AB13,0.000000,0.000123
2,AB14,0.000123,0.000246
3,AB15,0.000000,0.000123
4,AB21,0.000000,0.000246
...,...,...,...
1951,YO62,0.000123,0.000000
1952,YO7,0.000000,0.000739
1953,YO8,0.000000,0.000985
1954,ZE1,0.000000,0.000123


In [9]:
#select numeric columns only.

num_vars=list(bd_train.select_dtypes(exclude=['object']).columns)
num_vars

['REF_NO',
 'year_last_moved',
 'Average.Credit.Card.Transaction',
 'Balance.Transfer',
 'Term.Deposit',
 'Life.Insurance',
 'Medical.Insurance',
 'Average.A.C.Balance',
 'Personal.Loan',
 'Investment.in.Mutual.Fund',
 'Investment.Tax.Saving.Bond',
 'Home.Loan',
 'Online.Purchase.Amount',
 'Revenue.Grid',
 'Investment.in.Commudity',
 'Investment.in.Equity',
 'Investment.in.Derivative',
 'Portfolio.Balance']

In [10]:
num_vars=[_ for _  in num_vars if _ not in ['REF_NO','Revenue.Grid']]
num_vars

['year_last_moved',
 'Average.Credit.Card.Transaction',
 'Balance.Transfer',
 'Term.Deposit',
 'Life.Insurance',
 'Medical.Insurance',
 'Average.A.C.Balance',
 'Personal.Loan',
 'Investment.in.Mutual.Fund',
 'Investment.Tax.Saving.Bond',
 'Home.Loan',
 'Online.Purchase.Amount',
 'Investment.in.Commudity',
 'Investment.in.Equity',
 'Investment.in.Derivative',
 'Portfolio.Balance']

In [11]:
#select Categorial varibales.
   
cat_vars=list(bd_train.select_dtypes(include=['object']).columns)
cat_vars

['children',
 'age_band',
 'status',
 'occupation',
 'occupation_partner',
 'home_status',
 'family_income',
 'self_employed',
 'self_employed_partner',
 'TVarea',
 'post_code',
 'post_area',
 'gender',
 'region']

In [12]:
cat_vars=[_ for _ in cat_vars if _ not in ['children','age_band','post_code','post_area','family_income']]
cat_vars

['status',
 'occupation',
 'occupation_partner',
 'home_status',
 'self_employed',
 'self_employed_partner',
 'TVarea',
 'gender',
 'region']

# Pipeline Started

In [13]:
p1=pdPipeline([
    ('var_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

In [14]:
p2=pdPipeline([
    ('var_select',VarSelector(cat_vars)),
    ('misisng_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70))
])

In [15]:
p3=pdPipeline([
    ('var_select',VarSelector(['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer())
])

In [16]:
p4=pdPipeline([
    ('var_select',VarSelector(['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer())
    
])

In [17]:
p5=pdPipeline([
    ('var_select',VarSelector(['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
    
])

In [18]:
data_pipe=FeatureUnion([
    ('num',p1),
    ('obj_to_dum',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5)
    
])

In [19]:
data_pipe

FeatureUnion(transformer_list=[('num',
                                pdPipeline(steps=[('var_select',
                                                   VarSelector(feature_names=['year_last_moved',
                                                                              'Average.Credit.Card.Transaction',
                                                                              'Balance.Transfer',
                                                                              'Term.Deposit',
                                                                              'Life.Insurance',
                                                                              'Medical.Insurance',
                                                                              'Average.A.C.Balance',
                                                                              'Personal.Loan',
                                                                              'Investment.in.Mutual.Fun

In [20]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(bd_train),
                    columns=data_pipe.get_feature_names())

In [21]:
x_test=pd.DataFrame(data=data_pipe.transform(bd_test),
                   columns=data_pipe.get_feature_names())

In [22]:
bd_train['Revenue.Grid'].value_counts(dropna=True)

2    7261
1     863
Name: Revenue.Grid, dtype: int64

In [23]:
y_train=(bd_train['Revenue.Grid']==1).astype(int)

In [24]:
x_train.shape

(8124, 71)

In [25]:
x_test.shape

(2031, 71)

In [26]:
x_train.head()

Unnamed: 0,num__year_last_moved,num__Average.Credit.Card.Transaction,num__Balance.Transfer,num__Term.Deposit,num__Life.Insurance,num__Medical.Insurance,num__Average.A.C.Balance,num__Personal.Loan,num__Investment.in.Mutual.Fund,num__Investment.Tax.Saving.Bond,...,obj_to_dum__region_West Midlands,obj_to_dum__region_Scotland,obj_to_dum__region_East Midlands,obj_to_dum__region_North,obj_to_dum__region_Wales,obj_to_dum__region_East Anglia,obj_to_dum__region_Northern Ireland,age_band__age_band,family_income__fi,children__children
0,1999.0,0.0,0.0,196.95,132.42,0.0,0.0,21.47,24.18,7.49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.5,35000.0,1.0
1,1959.0,0.0,77.89,0.0,134.39,0.0,7.99,14.98,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,11250.0,1.0
2,1992.0,119.98,0.0,96.94,0.0,159.97,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,33.0,35000.0,3.0
3,1990.0,0.0,39.99,0.0,133.93,0.0,39.48,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.5,35000.0,0.0
4,1994.0,0.0,161.47,14.99,58.97,7.49,57.46,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,26250.0,2.0


## Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

In [30]:
params={'class_weight':['balanced',None],
       'penalty':['l1','l2'],
       'C':np.linspace(0.001,1,10)}

In [31]:
model=LogisticRegression(fit_intercept=True)

In [32]:
from sklearn.model_selection import GridSearchCV

In [34]:
grid_search=GridSearchCV(model,param_grid=params,cv=10,
                        scoring="roc_auc", n_jobs=-1, verbose=20)

In [37]:
grid_search.fit(x_train,y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.001, 0.112, 0.223, 0.334, 0.445, 0.556, 0.667, 0.778, 0.889,
       1.   ]),
                         'class_weight': ['balanced', None],
                         'penalty': ['l1', 'l2']},
             scoring='roc_auc', verbose=20)

In [38]:
grid_search.best_estimator_

LogisticRegression(C=0.112, class_weight='balanced')