# Python AI Project: Artificial Intelligence and Predictions

### Case: Customer Credit Score

You were hired by a bank to define its customers' credit scores. You need to analyze all of the bank's customers and, based on this analysis, create a model that reads a customer's information and automatically predicts their credit score: Poor (bad), Standard (ok), or Good.

Class files: https://drive.google.com/drive/folders/1FbDqVq4XLvU85VBlVIMJ73p9oOu6u2-J?usp=drive_link

In [185]:
import pandas as pd

# Load the customer database that the models will be trained on.
table = pd.read_csv("clients.csv")
display(table)

# Inspect the schema (column names, non-null counts, dtypes).
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own: wrapping it in display() would add a stray "None" to the output.
table.info()
display(table.columns)

Unnamed: 0,client_id,month,age,profession,annual_salary,number_accounts,number_cards,interest_loan,number_loan,days_delay,...,credit_history_age,monthly_investment,payment_behavior,final_month_balance,score_credit,car_loan,house_loan,personal_loan,credit_loan,student_loan
0,3392,1,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,3.0,...,265.0,21.465380,alto_gasto_pagamento_baixos,312.494089,Good,1,1,1,1,0
1,3392,2,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,3.0,...,266.0,21.465380,baixo_gasto_pagamento_alto,284.629162,Good,1,1,1,1,0
2,3392,3,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,3.0,...,267.0,21.465380,baixo_gasto_pagamento_medio,331.209863,Good,1,1,1,1,0
3,3392,4,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,5.0,...,268.0,21.465380,baixo_gasto_pagamento_baixo,223.451310,Good,1,1,1,1,0
4,3392,5,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,6.0,...,269.0,21.465380,alto_gasto_pagamento_medio,341.489231,Good,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,37932,4,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,23.0,...,378.0,24.028477,alto_gasto_pagamento_alto,479.866228,Poor,1,0,0,0,1
99996,37932,5,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,18.0,...,379.0,24.028477,alto_gasto_pagamento_medio,496.651610,Poor,1,0,0,0,1
99997,37932,6,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,27.0,...,380.0,24.028477,alto_gasto_pagamento_alto,516.809083,Poor,1,0,0,0,1
99998,37932,7,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,20.0,...,381.0,24.028477,baixo_gasto_pagamento_alto,319.164979,Standard,1,0,0,0,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             100000 non-null  int64  
 1   month                 100000 non-null  int64  
 2   age                   100000 non-null  float64
 3   profession            100000 non-null  object 
 4   annual_salary         100000 non-null  float64
 5   number_accounts       100000 non-null  float64
 6   number_cards          100000 non-null  float64
 7   interest_loan         100000 non-null  float64
 8   number_loan           100000 non-null  float64
 9   days_delay            100000 non-null  float64
 10  number_late_payments  100000 non-null  float64
 11  number_credit_checks  100000 non-null  float64
 12  credit_mix            100000 non-null  object 
 13  total_debt            100000 non-null  float64
 14  credit_usage_rate     100000 non-null  float64
 15  c

None

Index(['client_id', 'month', 'age', 'profession', 'annual_salary',
       'number_accounts', 'number_cards', 'interest_loan', 'number_loan',
       'days_delay', 'number_late_payments', 'number_credit_checks',
       'credit_mix', 'total_debt', 'credit_usage_rate', 'credit_history_age',
       'monthly_investment', 'payment_behavior', 'final_month_balance',
       'score_credit', 'car_loan', 'house_loan', 'personal_loan',
       'credit_loan', 'student_loan'],
      dtype='object')

In [186]:
# The models can only work with numbers, so every text (object-dtype) column
# is converted to integer codes. The one exception is "score_credit": it is
# the column we want the model to predict, and leaving it as text means the
# predictions come back as readable labels instead of numbers.
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column. Refitting a single shared encoder on
# every column overwrites the previous column's mapping, which leaves no way
# to apply the same text-to-code mapping to new data or to decode the codes.
encoders = {}

# Kept bound under its original name so code that refers to `encoder` still
# runs; after the loop it is the encoder of the last text column processed.
encoder = LabelEncoder()

for column in table.columns:
    if table[column].dtype == "object" and column != "score_credit":
        encoder = LabelEncoder()
        table[column] = encoder.fit_transform(table[column])
        encoders[column] = encoder

# info() prints its own report and returns None, so it is not wrapped in
# display() (that would only add a stray "None" to the output).
table.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             100000 non-null  int64  
 1   month                 100000 non-null  int64  
 2   age                   100000 non-null  float64
 3   profession            100000 non-null  int32  
 4   annual_salary         100000 non-null  float64
 5   number_accounts       100000 non-null  float64
 6   number_cards          100000 non-null  float64
 7   interest_loan         100000 non-null  float64
 8   number_loan           100000 non-null  float64
 9   days_delay            100000 non-null  float64
 10  number_late_payments  100000 non-null  float64
 11  number_credit_checks  100000 non-null  float64
 12  credit_mix            100000 non-null  int32  
 13  total_debt            100000 non-null  float64
 14  credit_usage_rate     100000 non-null  float64
 15  c

None

In [187]:
from sklearn.model_selection import train_test_split

# y is the output: the column the model has to predict.
y = table["score_credit"]

# x is the input: every remaining column except the client identifier, which
# is not used for prediction and is dropped together with the target column.
x = table.drop(columns=["client_id", "score_credit"])

# Split the rows into a training set (used to fit the models) and a test set
# (used to check how accurate they are).
#   test_size=0.3   -> 30% of the rows are held out for testing.
#   random_state=1  -> fixes the shuffle applied before splitting, so repeated
#                      runs produce the same split; leaving it unset (None)
#                      gives a different random split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)


In [188]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# The two main candidates are two of the most widely used models:
# random forest and k-nearest neighbors.
Model_RTree = RandomForestClassifier()
Model_KNN = KNeighborsClassifier()

# Four more models are included for variety and comparison: linear support
# vector machine, logistic regression, decision tree and Gaussian naive Bayes.
Model_Lsvm = LinearSVC()
Model_LG = LogisticRegression()
Model_Dtree = DecisionTreeClassifier()
Model_GNB = GaussianNB()

# Each model also gets a pipelined twin. make_pipeline chains the steps, and
# StandardScaler rescales every feature to mean 0 and standard deviation 1
# before the data reaches the model.
pipe_RTree = make_pipeline(StandardScaler(), RandomForestClassifier())
pipe_KNN = make_pipeline(StandardScaler(), KNeighborsClassifier())
pipe_Lsvm = make_pipeline(StandardScaler(), LinearSVC())
pipe_LG = make_pipeline(StandardScaler(), LogisticRegression())
pipe_Dtree = make_pipeline(StandardScaler(), DecisionTreeClassifier())
pipe_GNB = make_pipeline(StandardScaler(), GaussianNB())

# Train the plain models on the unscaled data. The linear models struggle with
# non-standardized values: they still produce a usable model, but a poorer
# one, and fitting them here emits convergence warnings.
for estimator in (
    Model_RTree,
    Model_KNN,
    Model_Lsvm,
    Model_LG,
    Model_Dtree,
    Model_GNB,
):
    estimator.fit(x_train, y_train)

# Train the pipelined models; with standardized inputs the linear models fit
# without trouble. These stay as individual calls so that, when run as a
# notebook cell, the final call's return value is still the cell's result.
pipe_RTree.fit(x_train, y_train)
pipe_KNN.fit(x_train, y_train)
pipe_Lsvm.fit(x_train, y_train)
pipe_LG.fit(x_train, y_train)
pipe_Dtree.fit(x_train, y_train)
pipe_GNB.fit(x_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [189]:


# Measure how accurate each trained model is on the held-out test set.
from sklearn.metrics import accuracy_score

# Predictions of the plain (unscaled) models.
prediction_RTree = Model_RTree.predict(x_test)
prediction_KNN = Model_KNN.predict(x_test)
prediction_Lsvm = Model_Lsvm.predict(x_test)
prediction_Dtree = Model_Dtree.predict(x_test)
prediction_LG = Model_LG.predict(x_test)
prediction_GNB = Model_GNB.predict(x_test)

# Accuracy = share of test rows whose prediction matches the true y_test label.
for label, predicted in (
    ("RTee", prediction_RTree),
    ("KNN", prediction_KNN),
    ("Lsvm", prediction_Lsvm),
    ("LG", prediction_LG),
    ("Dtree", prediction_Dtree),
    ("GNB", prediction_GNB),
):
    print(f"{label} accuracy score = {accuracy_score(y_test, predicted)}")
print("")

# The same two-step approach (predict, then compare) also works on a pipeline.
prediction_pipe_RTree = pipe_RTree.predict(x_test)
print(f"Pipe_RTee accuracy score = {accuracy_score(y_test, prediction_pipe_RTree)}")
print("")

# Alternatively, a pipeline's .score() predicts and compares in a single call.
for label, fitted_pipe in (
    ("Pipe_Rtree", pipe_RTree),
    ("Pipe_KNN", pipe_KNN),
    ("Pipe_Lsvn", pipe_Lsvm),
    ("Pipe_LG", pipe_LG),
    ("Pipe_Dtree", pipe_Dtree),
    ("Pipe_GNB", pipe_GNB),
):
    print(f"{label} score = {fitted_pipe.score(x_test, y_test)}")

RTee accuracy score = 0.8264666666666667
KNN accuracy score = 0.7324
Lsvm accuracy score = 0.5463
LG accuracy score = 0.5452
Dtree accuracy score = 0.7334
GNB accuracy score = 0.5976

Pipe_RTee accuracy score = 0.8260666666666666

Pipe_Rtree score = 0.8260666666666666
Pipe_KNN score = 0.7458
Pipe_Lsvn score = 0.6275333333333334
Pipe_LG score = 0.6438666666666667
Pipe_Dtree score = 0.7327666666666667
Pipe_GNB score = 0.6254


In [190]:
# From the scores above, StandardScaler improved the accuracy of most models,
# mainly the linear support vector machine and logistic regression.
#
# The random forest proved to be the most accurate model, so it is the one
# used to make predictions for new clients.
from sklearn.preprocessing import LabelEncoder

new_clients = pd.read_csv("new_clients.csv")
display(new_clients)

# Text columns must be encoded with the SAME text-to-code mapping that was
# used for training. Fitting an encoder on the new file alone would number
# only the categories that happen to appear in it, so the same text value
# would get a different code than in training and the model would read it as
# a different category.
text_columns = [
    column
    for column in new_clients.columns
    if new_clients[column].dtype == "object"
]
if text_columns:
    # The in-memory training table has already been converted to numbers, so
    # the original text values are re-read from the training file.
    reference = pd.read_csv("clients.csv", usecols=text_columns)
    for column in text_columns:
        encoder = LabelEncoder().fit(reference[column])
        # transform() raises ValueError for a category that never appeared in
        # the training data, instead of silently assigning it a wrong code.
        new_clients[column] = encoder.transform(new_clients[column])

predictions = pipe_RTree.predict(new_clients)
display(predictions)

Unnamed: 0,month,age,profession,annual_salary,number_accounts,number_cards,interest_loan,number_loan,days_delay,number_late_payments,...,credit_usage_rate,credit_history_age,monthly_investment,payment_behavior,final_month_balance,car_loan,house_loan,personal_loan,credit_loan,student_loan
0,1,31.0,empresario,19300.34,6.0,7.0,17.0,5.0,52.0,19.0,...,29.934186,218.0,44.50951,baixo_gasto_pagamento_baixo,312.487689,1,1,0,0,0
1,4,32.0,advogado,12600.445,5.0,5.0,10.0,3.0,25.0,18.0,...,28.819407,12.0,0.0,baixo_gasto_pagamento_medio,300.994163,0,0,0,0,1
2,2,48.0,empresario,20787.69,8.0,6.0,14.0,7.0,24.0,14.0,...,34.235853,215.0,0.0,baixo_gasto_pagamento_alto,345.081577,0,1,0,1,0


array(['Poor', 'Good', 'Standard'], dtype=object)

In [191]:
# To understand how the model decides, and to identify which characteristics
# matter most for the credit score, we look at how much weight the model
# gives to each input.
#
# Fitted tree-based models expose this as the .feature_importances_ attribute.
# The pipeline object itself does not have that attribute, so the random
# forest is first pulled out of the pipeline by its step name.
forest = pipe_RTree.named_steps['randomforestclassifier']

data = forest.feature_importances_
display(data)

# Displaying it properly: one row per input column, expressed as a percentage.
columns = list(x_test.columns)
weight = pd.DataFrame(data=forest.feature_importances_, index=columns) * 100
display(weight)

array([0.04007021, 0.04253182, 0.03258617, 0.05095393, 0.03586105,
       0.04472241, 0.08129519, 0.03053915, 0.06069807, 0.0450967 ,
       0.04903074, 0.08584762, 0.11551649, 0.05062196, 0.07265377,
       0.048349  , 0.0237274 , 0.0545342 , 0.00719966, 0.0071876 ,
       0.00709775, 0.00692179, 0.00695731])

Unnamed: 0,0
month,4.007021
age,4.253182
profession,3.258617
annual_salary,5.095393
number_accounts,3.586105
number_cards,4.472241
interest_loan,8.129519
number_loan,3.053915
days_delay,6.069807
number_late_payments,4.50967


In [192]:
# From this, we can see that the 4 most important characteristics
# are total_debt, credit_mix, interest_loan and credit_history_age.

# This helps us better understand how the AI works and which
# characteristics matter most when deciding a client's score.