In [None]:
# write step by step

# 0. Understand the company’s challenge/goal
# 1. import database
# 2. Prepare the database for AI
# 3. CREATE AN AI MODEL -> credit score: bad, average or good
# 4. choose the best AI model
# 5. Use the model to make predictions

# tools used: pandas (database) and scikit-learn (AI)

In [None]:
# 1. Import the database
import pandas as pd

# File that holds the customer records, relative to the notebook's working directory.
ARQUIVO_CLIENTES = "clientes.csv"

# Load the CSV into a DataFrame.
tabela = pd.read_csv(ARQUIVO_CLIENTES)

# display() renders the DataFrame as a formatted table in .ipynb files.
display(tabela)

# Next: check the database before processing it (column types, missing values),
# e.g. by calling tabela.info()

In [None]:
# 2. Prepare the database for AI

from sklearn.preprocessing import LabelEncoder as le

# The models only work with numbers, so every text (object) column that is used
# as a feature must be converted to integer labels first. The prediction column
# (score_credito) is intentionally left as it is.
codificador = le()  # `le` is the short alias this notebook gives to LabelEncoder

# Text columns that have to be turned into numbers.
colunas_texto = ["profissao", "mix_credito", "comportamento_pagamento"]

for coluna in colunas_texto:
    # fit_transform learns the distinct values of the column and replaces each
    # one with an integer label. The same encoder object is re-fitted on every
    # pass, so after the loop it only holds the mapping of the last column.
    tabela[coluna] = codificador.fit_transform(tabela[coluna])

# info() prints the summary itself and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
tabela.info()


In [12]:
# Prediction with AI: split the data into training and test sets

from sklearn.model_selection import train_test_split

# The AI uses the data in the table to work out the customer's score, so the
# score_credito column is what we want to predict and must not be given to the
# model as an input.
#   x -> the columns the AI may use to make predictions
#   y -> the column the AI has to predict

# drop() removes columns that must not be features: the target itself and the
# customer id. More than one column is being removed, so they go in a list.
x = tabela.drop(columns=["score_credito", "id_cliente"])
y = tabela["score_credito"]

# train_test_split returns four parts, always in this order:
# x_train, x_test, y_train, y_test (pass x first, then y).
# With no size given, scikit-learn's default keeps the larger share for training;
# an explicit share can be set with e.g. test_size=0.3.
# random_state fixes the shuffle so that every "Restart & Run All" produces the
# same split and therefore comparable accuracy numbers.
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, random_state=42)

In [None]:
# 3. Create the AI models

# Two candidate models are trained and compared later:
#   - Decision trees -> RandomForestClassifier
#   - KNN            -> KNeighborsClassifier (nearest neighbors)

# Import the models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Create the models.
# The random forest is built with randomness; random_state fixes it so that the
# trained model (and its accuracy) is the same on every run.
modelo_arvoredecisao = RandomForestClassifier(random_state=42)
modelo_knn = KNeighborsClassifier()

# Train both models on the training split
modelo_arvoredecisao.fit(x_treino, y_treino)
modelo_knn.fit(x_treino, y_treino)


In [None]:
# 4. Choose the best AI model
from sklearn.metrics import accuracy_score

# Ask each trained model to predict the score for the held-out test rows.
previsao_arvoredecisao = modelo_arvoredecisao.predict(x_teste)
previsao_knn = modelo_knn.predict(x_teste)

# Accuracy = share of test rows whose predicted score matches the real one.
acuracia_arvoredecisao = accuracy_score(y_teste, previsao_arvoredecisao)
acuracia_knn = accuracy_score(y_teste, previsao_knn)

display(acuracia_arvoredecisao)
display(acuracia_knn)

# Results noted by the author from a previous run:
#   0.82792 -> random forest (tree)
#   0.746   -> KNN

In [None]:
# 5. Use the best model to make new predictions

from sklearn.preprocessing import LabelEncoder

# Import the new table
tabelanova = pd.read_csv("novos_clientes.csv")

# Show the new table and its data formats
display(tabelanova)

# Transform the text (object) columns into numbers.
# The new data must receive the SAME integer labels the model saw in training.
# Calling fit_transform on the new table would number its values based only on
# what appears in that table, so a given label could mean a different
# profession/mix/behaviour than it did during training and the predictions
# would be silently wrong. Instead, each encoder is fitted on the original
# training column and only transform() is applied to the new data.
colunas_texto_novas = ["profissao", "mix_credito", "comportamento_pagamento"]

# The in-memory `tabela` was already overwritten with numbers in step 2, so the
# original text values are read again from the training file.
clientes_originais = pd.read_csv("clientes.csv", usecols=colunas_texto_novas)

for coluna in colunas_texto_novas:
    codificador_coluna = LabelEncoder().fit(clientes_originais[coluna])
    # transform() raises ValueError if the new table contains a value that never
    # appeared in training, instead of inventing a label for it.
    tabelanova[coluna] = codificador_coluna.transform(tabelanova[coluna])

# New prediction with the model chosen in step 4.
# NOTE(review): assumes novos_clientes.csv contains exactly the feature columns
# used in training (no id_cliente / score_credito) - TODO confirm.
previsoes = modelo_arvoredecisao.predict(tabelanova)
display(previsoes)
