In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score,precision_score

from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
#load dataSet and remove unwanted columns
train = pd.read_csv('Train_Set_90621.csv')
train = train.drop(['Application ID','Bank Masked','Name Masked','Approved_Timestamp','Term','Male to Female Employees Ratio'],axis=1)


In [3]:
#perform eda
train['Approved_Year'] = train['Approved_Year'].astype(object)


train['New Business'] = train['New Business'].replace(to_replace=2.0,value=0.0)
train['New Business'] = train['New Business'].astype(object)

train['Interest Rate'] = train['Interest Rate'].astype(object)


train['Term_years'] = train['Term_years'].replace(to_replace=1900,value=2011)


train['City or Rural'] = train['City or Rural'].replace(to_replace=2,value=0)
train['City or Rural'] = train['City or Rural'].astype(object)


train['New Business']=train['New Business'].fillna(train['New Business'].mode()[0])

train['Bank Type'].fillna(train['Bank Type'].mode()[0],inplace=True)
train['Business Owner State'].fillna(train['Business Owner State'].mode()[0],inplace=True)
train['BankState'].fillna(train['BankState'].mode()[0],inplace=True)
train['Carry-forward Credit'].fillna(train['Carry-forward Credit'].mode()[0],inplace=True)
train['Documents Provided'].fillna(train['Documents Provided'].mode()[0],inplace=True)

train['Expected Company Income'].fillna(np.mean(train['Expected Company Income']),inplace=True)

In [4]:
train_num = train.select_dtypes(include = [np.number])
print(train_num.columns)
train_cat = train.select_dtypes(include = [np.object])
print(train_cat.columns)

Index(['Business_Industry_Type_Code', 'New Business', 'Employees',
       'Gross Disbursed Amount', 'Term_years', 'Jobs Retained',
       'Expected Company Income', 'Funds available with company',
       'Gross_Apprv_Amount', 'Company Branch Code', 'Jobs Generated',
       'Balance Left', 'Amount Defaulted', 'Final_Appved_Amount',
       'Default_Status'],
      dtype='object')
Index(['Bank Type', 'Business Owner State', 'Approved_Year', 'BankState',
       'Interest Rate', 'City or Rural', 'Carry-forward Credit',
       'Documents Provided'],
      dtype='object')


In [5]:
train_num.drop(['Default_Status'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [6]:
train_cat = pd.get_dummies(train_cat,drop_first=True)
train_cat.shape

(195118, 52)

In [7]:
X = pd.concat([train_num,train_cat],axis=1)

In [8]:
X.shape

(195118, 66)

In [9]:
y = train['Default_Status']

In [53]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
vf=[vif(X_vif.values,i) for i in range(X_vif.shape[1])]
vif_df = pd.DataFrame(vf,index=X_vif.columns,columns=['vif'])

In [54]:
vif_df.sort_values(by='vif',ascending=False)

Unnamed: 0,vif
Final_Appved_Amount,10.014972
Gross Disbursed Amount,9.246669
Approved_Year_2011,6.787643
Business Owner State_Karnataka,5.448307
BankState_Karnataka,5.437317
...,...
Company Branch Code,1.027549
Funds available with company,1.026674
Jobs Retained,1.002551
BankState_Goa,1.000644


In [10]:
X_vif= X.drop(['Jobs Generated','Gross_Apprv_Amount','BankState_Delhi'],axis=1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_vif, y, test_size=0.3, random_state=42)
rfc = RandomForestClassifier(n_jobs=3,n_estimators=500,criterion='entropy')
rfc_model = rfc.fit(X_train,y_train)
y_pred = rfc_model.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9327251605849392
[[47833   237]
 [ 3701  6765]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     48070
           1       0.97      0.65      0.77     10466

    accuracy                           0.93     58536
   macro avg       0.95      0.82      0.87     58536
weighted avg       0.93      0.93      0.93     58536



In [17]:
test = pd.read_csv('Test_Set_90621.csv')
test = test.drop(['Application ID','Bank Masked','Name Masked','Approved_Timestamp','Term','Male to Female Employees Ratio'],axis=1)


In [18]:
test['Approved_Year'] = test['Approved_Year'].astype(object)


test['New Business'] = test['New Business'].replace(to_replace=2.0,value=0.0)
test['New Business'] = test['New Business'].astype(object)


test['Interest Rate'] = test['Interest Rate'].replace(to_replace=1,value=2)
test['Interest Rate'] = test['Interest Rate'].astype(object)


test['Term_years'] = train['Term_years'].replace(to_replace=1900,value=2011)


test['City or Rural'] = test['City or Rural'].replace(to_replace=2,value=0)
test['City or Rural'] = test['City or Rural'].astype(object)


test['New Business']=test['New Business'].fillna(test['New Business'].mode()[0])

test['Bank Type'].fillna(test['Bank Type'].mode()[0],inplace=True)
test['Business Owner State'].fillna(test['Business Owner State'].mode()[0],inplace=True)
test['BankState'].fillna(test['BankState'].mode()[0],inplace=True)
test['Carry-forward Credit'].fillna(test['Carry-forward Credit'].mode()[0],inplace=True)
test['Documents Provided'].fillna(test['Documents Provided'].mode()[0],inplace=True)

test['Expected Company Income'].fillna(np.mean(test['Expected Company Income']),inplace=True)

In [19]:
test_df = pd.get_dummies(test,drop_first=True)
test_df.shape

(83623, 66)

In [20]:
test_df = test_df.drop(['Jobs Generated','Gross_Apprv_Amount','BankState_Delhi'],axis=1)

In [21]:
test = pd.read_csv('Test_Set_90621.csv')
y_pred = rfc_model.predict(test_df)
y_pred_series = pd.Series(y_pred)

app = test['Application ID']

data = {"Application ID": app,
        "Default_Status": y_pred_series}
  
# Concatenating the series side
# by side as depicted by axis=1
# If you want to concatenate the 
# series one below the other
# change the axis to zero.
df_samp_rf = pd.concat(data,
               axis = 1)
  
# show the dataframe
df_samp_rf.Default_Status.value_counts()

0    73483
1    10140
Name: Default_Status, dtype: int64

In [22]:
df_samp_rf.set_index("Application ID", inplace = True)
submission_data_eda = df_samp_rf.to_csv('final_GHfile_rf_vif.csv', index = True)