In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [None]:
# Read the credit-score data set into a DataFrame and show its (rows, columns) shape.
# NOTE(review): the path is an absolute "/content/..." location — presumably a
# Colab upload directory; adjust it when running elsewhere.
data=pd.read_csv("/content/credit_score.csv")
data.shape

(100000, 28)

In [None]:
#Find the outliers
def find_outliers_IQR(df):
    """Return the entries of ``df`` lying outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``df`` and ``IQR = Q3 - Q1``.
    The result is ``df`` indexed by the boolean "outside the fences" mask.
    """
    lower_quartile, upper_quartile = df.quantile(0.25), df.quantile(0.75)
    spread = upper_quartile - lower_quartile
    below_fence = df < (lower_quartile - 1.5 * spread)
    above_fence = df > (upper_quartile + 1.5 * spread)
    return df[below_fence | above_fence]

In [None]:
#Set the upper limit and the lower limit for the outliers
# Work on the numeric columns only: quantiles are undefined for the text
# columns, and DataFrame.quantile raises on them in pandas >= 2.0.
# The explicit copy lets data1 be modified later without touching `data`.
data1 = data.select_dtypes(include=['float', 'int']).copy()
q1 = data1.quantile(0.25)
q3 = data1.quantile(0.75)
IQR = q3 - q1
# Per-column fences, indexed by column name in the same order as data1.columns.
upper_lim = q3 + 1.5 * IQR
lower_lim = q1 - 1.5 * IQR

In [None]:
#replace the outliers with their new values
# Cap every numeric column at its own fences. clip(axis=1) matches the limit
# Series to the columns by label, so no hard-coded column count or positional
# lookup into upper_lim / lower_lim is needed.
data1 = data1.clip(lower=lower_lim, upper=upper_lim, axis=1)

In [None]:
#merge the data set
# Write the outlier-capped numeric columns back over a copy of the raw frame.
# A left merge keyed on these same columns keeps the key values of `data`
# and contributes no new columns, so the capped values in data1 would be lost.
merged_data = data.copy()
merged_data[list(data1.columns)] = data1

In [None]:
# Discard the identifier-like columns that are not used as features.
merged_data.drop(columns=['ID', 'Customer_ID', 'SSN', 'Name'], inplace=True)

In [None]:
# Count how many rows fall into each Credit_Score class (the prediction target).
merged_data['Credit_Score'].value_counts()

Standard    53174
Poor        28998
Good        17828
Name: Credit_Score, dtype: int64

In [None]:
# Work on a copy so the encoding steps below leave merged_data untouched.
df=merged_data.copy()

In [None]:
# Binary-encode the text-valued columns; each one is replaced by a group of
# 0/1 columns named <column>_0, <column>_1, ...
str_col = [
    "Occupation",
    "Type_of_Loan",
    "Credit_Mix",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
]
encoder = ce.BinaryEncoder(cols=str_col, return_df=True)
data_encoded = encoder.fit_transform(df)
data_encoded

Unnamed: 0,Month,Age,Occupation_0,Occupation_1,Occupation_2,Occupation_3,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,...,Credit_History_Age,Payment_of_Min_Amount_0,Payment_of_Min_Amount_1,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour_0,Payment_Behaviour_1,Payment_Behaviour_2,Monthly_Balance,Credit_Score
0,1,23.0,0,0,0,1,19114.12,1824.843333,3.0,4.0,...,265.0,0,1,49.574949,21.465380,0,0,1,312.494089,Good
1,2,23.0,0,0,0,1,19114.12,1824.843333,3.0,4.0,...,266.0,0,1,49.574949,21.465380,0,1,0,284.629162,Good
2,3,23.0,0,0,0,1,19114.12,1824.843333,3.0,4.0,...,267.0,0,1,49.574949,21.465380,0,1,1,331.209863,Good
3,4,23.0,0,0,0,1,19114.12,1824.843333,3.0,4.0,...,268.0,0,1,49.574949,21.465380,1,0,0,223.451310,Good
4,5,23.0,0,0,0,1,19114.12,1824.843333,3.0,4.0,...,269.0,0,1,49.574949,21.465380,1,0,1,341.489231,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,4,25.0,1,1,0,1,39628.99,3359.415833,4.0,6.0,...,378.0,0,1,35.104023,24.028477,1,1,0,479.866228,Poor
99996,5,25.0,1,1,0,1,39628.99,3359.415833,4.0,6.0,...,379.0,0,1,35.104023,24.028477,1,0,1,496.651610,Poor
99997,6,25.0,1,1,0,1,39628.99,3359.415833,4.0,6.0,...,380.0,0,1,35.104023,24.028477,1,1,0,516.809083,Poor
99998,7,25.0,1,1,0,1,39628.99,3359.415833,4.0,6.0,...,381.0,0,1,35.104023,24.028477,0,1,0,319.164979,Standard


In [None]:
# Target vector y is the Credit_Score column; feature matrix X is everything else.
y = data_encoded['Credit_Score']
X = data_encoded.drop(columns=["Credit_Score"])

In [None]:
#Selecting the best 10 features 
def SelectFeatures(X, y, k=10):
  """Keep the ``k`` columns of ``X`` with the highest ANOVA F-score against ``y``.

  Parameters
  ----------
  X : pandas.DataFrame
      Candidate feature columns.
  y : array-like
      Class labels, one per row of ``X``.
  k : int, default 10
      Number of columns to keep.

  Returns
  -------
  pandas.DataFrame
      The selected columns of ``X``, with their original names and row index.
  """
  selector = SelectKBest(score_func=f_classif, k=k)
  selector.fit(X, y)
  # Only the positions of the winning columns are needed; slicing X by them
  # preserves the column names, which a transformed array would lose.
  cols_idxs = selector.get_support(indices=True)
  return X.iloc[:, cols_idxs]


In [None]:
#Split the train and test subsets 
def split(X,y):
  """Return one stratified train/test split restricted to the selected features.

  The rows are partitioned with a shuffled 5-fold ``StratifiedKFold``
  (``random_state=42``) and the last fold is used as the test set, so the
  test set holds about one fifth of the rows.

  Parameters
  ----------
  X : pandas.DataFrame
      Full feature matrix.
  y : pandas.Series
      Class labels aligned with the rows of ``X``.

  Returns
  -------
  tuple
      ``(X_train, X_test, y_train, y_test)``; both feature frames contain only
      the columns chosen by ``SelectFeatures``.
  """
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  # Use the last of the five folds as the hold-out set.
  *_, (train_index, test_index) = skf.split(X, y)
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]
  # Choose the features from the training rows only, so that no information
  # from the held-out rows influences which columns the model sees.
  selected = SelectFeatures(X.iloc[train_index], y_train).columns
  X_train = X.iloc[train_index][selected]
  X_test = X.iloc[test_index][selected]
  return X_train, X_test, y_train, y_test

In [None]:
#RandomForestClassifier model
def RFClassifier(X,y):
  """Train a random forest on the training part of ``split(X, y)``.

  The forest uses 200 trees, the entropy criterion and ``random_state=42``.

  Returns
  -------
  tuple
      ``(y_pred, model)``: the predictions for the test part of the split and
      the fitted ``RandomForestClassifier``.
  """
  X_train, X_test, y_train, _ = split(X, y)
  forest = RandomForestClassifier(
      n_estimators=200,
      criterion="entropy",
      random_state=42,
  )
  forest.fit(X_train, y_train)
  return forest.predict(X_test), forest
  

In [None]:
# Sanity check: build the train/test split and inspect the shape of the
# training feature matrix (rows, selected features).
X_train,X_test,y_train,y_test=split(X,y)
X_train.shape

(80000, 10)

In [None]:
# Train the random forest and report accuracy and macro-averaged F1 on the test fold.
# split() is deterministic, so y_test here matches the rows RFClassifier predicted on.
X_train, X_test, y_train, y_test = split(X, y)
y_pred, RF = RFClassifier(X, y)
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = round(f1_score(y_test, y_pred, average="macro"), 4)
print('Accuracy Score: ', accuracy)
print('f1-Score: ', macro_f1)


Accuracy Score:  0.8091
f1-Score:  0.8017


In [None]:
# Print per-class precision, recall, F1 and support for the random-forest
# predictions (y_pred) against the held-out labels (y_test).
print("Classification report:\n\n",classification_report(y_test, y_pred))

Classification report:

               precision    recall  f1-score   support

        Good       0.77      0.78      0.78      3565
        Poor       0.80      0.82      0.81      5800
    Standard       0.83      0.81      0.82     10635

    accuracy                           0.81     20000
   macro avg       0.80      0.80      0.80     20000
weighted avg       0.81      0.81      0.81     20000



In [None]:
# Persist the random-forest model to disk.
import joblib 
# RFClassifier trains the forest again on the training fold; the returned
# model is what gets written out.
y_pred,RF=RFClassifier(X,y)
# Writes the model to "mymodel.pkl" in the current working directory with
# joblib compression level 1.
joblib.dump(RF,"mymodel.pkl",compress=1)

['mymodel.pkl']

The models below were also evaluated but not selected, either because their accuracy was lower or because they were more computationally expensive and slower to train.

In [None]:
#GradientBoostingClassifier (good but slower than the Random forest classifier)
from sklearn.ensemble import GradientBoostingClassifier
X_train,X_test,y_train,y_test=split(X,y)
# The boosting model fits its `init` estimator itself, so an unfitted forest
# is passed directly instead of training one beforehand and discarding its
# predictions.
Boosting = GradientBoostingClassifier(
    init=RandomForestClassifier(n_estimators=200, criterion="entropy", random_state=42),
    random_state=42,
)
Boosting.fit(X_train, y_train)
y_pred = Boosting.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('f1 Score:',f1_score(y_test, y_pred, average="macro"))

Accuracy Score:  0.80915
f1 Score: 0.8015140383390585


In [None]:
# Extra-trees classifier: same tree count, criterion and seed as the random forest.
# (Good, but the random forest scores slightly higher on accuracy and F1.)
from sklearn.ensemble import ExtraTreesClassifier
X_train, X_test, y_train, y_test = split(X, y)
ExtraTrees = ExtraTreesClassifier(
    n_estimators=200,
    criterion="entropy",
    random_state=42,
)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = ExtraTrees.fit(X_train, y_train).predict(X_test)
print("Accuracy score:", accuracy_score(y_test, y_pred))
print('f1 Score: ', f1_score(y_test, y_pred, average="macro"))

Accuracy score: 0.8005
f1 Score:  0.7933412726817329


In [None]:
#Logistic regression baseline
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
X_train,X_test,y_train,y_test=split(X,y)
# The saga solver only converges quickly when features share a similar scale,
# so standardize inside a pipeline. The scaler is fitted on the training fold
# only and then applied to the test fold by predict().
Logistic_Regression = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=500, solver='saga', random_state=42),
)
Logistic_Regression.fit(X_train, y_train)
y_pred = Logistic_Regression.predict(X_test)
print('Accuracy Score: ', accuracy_score(y_test, y_pred))
print('f1 Score: ', f1_score(y_test, y_pred,average="macro"))



Accuracy Score:  0.6039
f1 Score:  0.5472891431475754


In [None]:
# XGBoost classifier (good, but slower than the random forest).
from xgboost import XGBClassifier
# The string class labels are converted to integer codes before fitting.
mapping = {label: code for code, label in enumerate(["Good", "Poor", "Standard"])}
X_train, X_test, y_train, y_test = split(X, y)
y_train1, y_test1 = y_train.map(mapping), y_test.map(mapping)
XGBC = XGBClassifier(n_estimators=100, max_depth=50, random_state=42)
# Predictions come back as integer codes, so they are scored against y_test1.
y_pred = XGBC.fit(X_train, y_train1).predict(X_test)
print("Accuracy:", accuracy_score(y_test1, y_pred))
print('f1 Score: ', f1_score(y_test1, y_pred, average="macro"))

Accuracy: 0.809
f1 Score:  0.8016111289761719


In [None]:
# AdaBoost classifier with 200 boosting rounds (low accuracy on this data).
from sklearn.ensemble import AdaBoostClassifier
AdaBoost = AdaBoostClassifier(n_estimators=200, random_state=42)
X_train, X_test, y_train, y_test = split(X, y)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = AdaBoost.fit(X_train, y_train).predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print('f1 Score: ', f1_score(y_test, y_pred, average="macro"))

Accuracy: 0.6635
f1 Score:  0.6432422108618241


In [None]:
# k-nearest-neighbours classifier with k=5.
# (Good, but the random forest scores slightly higher on accuracy and F1.)
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=5)
X_train, X_test, y_train, y_test = split(X, y)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = KNN.fit(X_train, y_train).predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print('f1 Score:', f1_score(y_test, y_pred, average="macro"))

Accuracy: 0.80295
f1 Score: 0.796891766316011
