# MODEL_SELECTION

In [7]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
data = pd.read_csv('financial_loan_data.csv')
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE_x,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,100002,Defaulters,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,Auto technology,24.0,low_normal,POS other with interest,365243.0,-565.0,125.0,-25.0,-17.0,0.0
1,100003,Repayers,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-716.0,-386.0,-536.0,-527.0,1.0
2,100003,Repayers,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,Furniture,6.0,middle,POS industry with interest,365243.0,-797.0,-647.0,-647.0,-639.0,0.0
3,100003,Repayers,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,Consumer electronics,12.0,middle,POS household with interest,365243.0,-2310.0,-1980.0,-1980.0,-1976.0,1.0
4,100004,Repayers,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,Connectivity,4.0,middle,POS mobile without interest,365243.0,-784.0,-694.0,-724.0,-714.0,0.0


# Label Encoding the Categorical columns


In [3]:
cat_cols = data.select_dtypes(include='object').columns
cat_cols

Index(['TARGET', 'NAME_CONTRACT_TYPE_x', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE_x', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START_x', 'ORGANIZATION_TYPE',
       'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE',
       'EMERGENCYSTATE_MODE', 'NAME_CONTRACT_TYPE_y',
       'WEEKDAY_APPR_PROCESS_START_y', 'FLAG_LAST_APPL_PER_CONTRACT',
       'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE',
       'CODE_REJECT_REASON', 'NAME_TYPE_SUITE_y', 'NAME_CLIENT_TYPE',
       'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
       'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP',
       'PRODUCT_COMBINATION'],
      dtype='object')

In [4]:
en = LabelEncoder()

for col in cat_cols:
    data[col] = en.fit_transform(data[col])

**Feature Selection**

In [10]:
X = abs(data.drop('TARGET',axis=1))
y = data['TARGET']  
selection = SelectKBest(chi2, k=10)
data_select = selection.fit_transform(X,y)

ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [9]:
from sklearn.impute import SimpleImputer

# Instantiate the imputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on your data
X_imputed = imputer.fit_transform(X)

# Now, perform feature selection on the imputed data
data_select = selection.fit_transform(X_imputed, y)


In [11]:

best = pd.DataFrame({"columns":X.columns,"chi-sq-value": selection.scores_}).sort_values('chi-sq-value',ascending=False).head(15)
best

AttributeError: 'SelectKBest' object has no attribute 'scores_'

In [12]:
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2

# Assuming 'data' is your dataset containing features and target
X = abs(data.drop('TARGET', axis=1))
y = data['TARGET']  

# Instantiate the imputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on your data
X_imputed = imputer.fit_transform(X)

# Initialize the SelectKBest object with the chi-squared test
selection = SelectKBest(chi2, k=10)

# Fit and transform SelectKBest on the imputed data
data_select = selection.fit_transform(X_imputed, y)

In [13]:
best = pd.DataFrame({"columns":X.columns,"chi-sq-value": selection.scores_}).sort_values('chi-sq-value',ascending=False).head(15)
best

Unnamed: 0,columns,chi-sq-value
17,DAYS_EMPLOYED,273246400.0
9,AMT_GOODS_PRICE_x,106506300.0
7,AMT_CREDIT_x,58737920.0
154,DAYS_LAST_DUE_1ST_VERSION,17182360.0
155,DAYS_LAST_DUE,9478231.0
156,DAYS_TERMINATION,7998255.0
125,AMT_APPLICATION,7521718.0
6,AMT_INCOME_TOTAL,5615954.0
152,DAYS_FIRST_DRAWING,3593914.0
153,DAYS_FIRST_DUE,2716720.0


In [14]:
top_columns = list(best['columns'])  # Selecting the top 15 columns for prediction

# Scaling the columns

In [15]:

from sklearn.preprocessing import StandardScaler
std = StandardScaler()
for col in top_columns:
   data[col] = std.fit_transform(np.array(data[col]).reshape(-1, 1))

# Checking if the target is balanced


In [16]:
data['TARGET'].value_counts()

TARGET
1    400421
0     37968
Name: count, dtype: int64

**The target column is highly imbalanced. Hence using the SMOTE technique for oversampling**

In [17]:
X = data[top_columns]
y = data['TARGET']

# SMOTE -Oversampling


In [18]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X,y = oversample.fit_resample(X,y)

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [19]:
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Instantiate the imputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on your data
X_imputed = imputer.fit_transform(X)

# Instantiate SMOTE and apply it to the data
oversample = SMOTE()
X_resampled, y_resampled = oversample.fit_resample(X_imputed, y)


In [20]:
from collections import Counter

counter = Counter(y)
print(counter)

Counter({1: 400421, 0: 37968})


**The Target is now balanced**

# Train test split

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn import metrics

# Model Building

In [23]:
def classification(df, algorithm):
    model = algorithm().fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    print(algorithm.__name__)  
    Train_Classification_Report = metrics.classification_report(y_train,train_pred)
    print(Train_Classification_Report)
    print("\n\n")
    Test_Classification_Report = metrics.classification_report(y_test,test_pred)
    print(Test_Classification_Report)

In [29]:
from sklearn.metrics import classification_report

def classification(algorithm, X_train, y_train, X_test, y_test):
    model = algorithm()
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_report = classification_report(y_train, train_pred)
    test_report = classification_report(y_test, test_pred)
    return train_report, test_report


In [26]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Instantiate the imputer
imputer = SimpleImputer(strategy='mean')

# Impute missing values in X
X_imputed = imputer.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Instantiate the LogisticRegression model
model = LogisticRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9143456739432925


In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Assuming you already have X_train, X_test, y_train, y_test
logistic_train_report, logistic_test_report = classification(LogisticRegression, X_train, y_train, X_test, y_test)
print("Logistic Regression - Training:")
print(logistic_train_report)
print("Logistic Regression - Testing:")
print(logistic_test_report)

decision_tree_train_report, decision_tree_test_report = classification(DecisionTreeClassifier, X_train, y_train, X_test, y_test)
print("Decision Tree Classifier - Training:")
print(decision_tree_train_report)
print("Decision Tree Classifier - Testing:")
print(decision_tree_test_report)


Logistic Regression - Training:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00     30457
           1       0.91      1.00      0.95    320254

    accuracy                           0.91    350711
   macro avg       0.96      0.50      0.48    350711
weighted avg       0.92      0.91      0.87    350711

Logistic Regression - Testing:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      7511
           1       0.91      1.00      0.96     80167

    accuracy                           0.91     87678
   macro avg       0.96      0.50      0.48     87678
weighted avg       0.92      0.91      0.87     87678

Decision Tree Classifier - Training:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30457
           1       1.00      1.00      1.00    320254

    accuracy                           1.00    350711
   macro avg       1.00     

In [34]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def classification(algorithm, data):
    X = data.drop('TARGET', axis=1)
    y = data['TARGET']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = algorithm()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    return report


In [35]:
print(classification(DecisionTreeClassifier, data))


              precision    recall  f1-score   support

           0       0.78      0.30      0.43      7511
           1       0.94      0.99      0.96     80167

    accuracy                           0.93     87678
   macro avg       0.86      0.64      0.70     87678
weighted avg       0.92      0.93      0.92     87678



In [None]:
from xgboost import XGBClassifier

# Instantiate XGBClassifier
xgb_classifier = XGBClassifier()

# Call the classification function with the instantiated classifier
print(classification(xgb_classifier, data))

In [None]:
print(classification(data, XGBClassifier))

**From the above Models' evaluation the Decision Tree has a better performance. Hence selecting the Decision Tree as the ideal algorithm and saving it as a pickle file.**

In [36]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print("Train classification report")  
Train_Classification_Report = metrics.classification_report(y_train,train_pred)
print(Train_Classification_Report)
print("\n\n")
print("Test classification report") 
Test_Classification_Report = metrics.classification_report(y_test,test_pred)
print(Test_Classification_Report)

Train classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30457
           1       1.00      1.00      1.00    320254

    accuracy                           1.00    350711
   macro avg       1.00      1.00      1.00    350711
weighted avg       1.00      1.00      1.00    350711




Test classification report
              precision    recall  f1-score   support

           0       0.73      0.75      0.74      7511
           1       0.98      0.97      0.98     80167

    accuracy                           0.95     87678
   macro avg       0.85      0.86      0.86     87678
weighted avg       0.96      0.95      0.96     87678



# Predicting the Target

In [43]:
ip = [[0.877012,1.255301,0.947656,0.051587,0.473693,-0.526915,-0.957155,0.265246,0.316398,0.704997,0.114612,-0.038348,-0.059868,0.214962,-0.504138]]
classified = model.predict(np.array(ip))
if classified == 1:
    print("The Client is a Repayer")
else:
    print("The Client is a Defaulter")

The Client is a Repayer


# Saving the model as pickle file


In [40]:
import pickle
with open('financial_model.pkl','wb') as file:
    pickle.dump(model,file)