In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [40]:
# 1. Load the "Credit Approval" dataset
# The file is read with header=None, so columns get integer labels.
df = pd.read_csv("../data/raw/crx.data", header=None)
# Per-column count of cells holding the "?" placeholder string.
is_placeholder = df.isin(["?"])
is_placeholder.sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [42]:
# 2. Drop rows with missing values
# Missing entries are encoded as the string "?"; turn them into NaN so that
# dropna() can remove the affected rows. Rebinding `df` (instead of mutating it
# with inplace=True) keeps this cell idempotent when it is re-run.
df = df.replace('?', np.nan).dropna()
# Verify with isna(): looking for "?" again would always report zero after the
# replace above, even if dropna() had removed nothing.
df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [32]:
# 3. Convert the categorical attributes into dummy (one-hot) variables
# Positional indices of the columns to one-hot encode; all remaining columns
# (including the last one, the class label) are passed through unchanged and
# appended after the encoded columns.
categorical_features = [0, 3, 4, 5, 6, 8, 9, 11, 12]
# sparse_threshold=0 forces a dense array. With the default threshold the result
# is stacked as a sparse matrix whenever the overall density is low enough, and
# that path requires every column to be numeric -- which the passed-through
# string class label is not. The next cell also slices the result as an array.
transformer = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE: this rebinds `df` from a DataFrame to a 2-D array, so re-running this
# cell without first re-running the cells above would encode already-encoded data.
df = transformer.fit_transform(df.astype(str))

In [33]:
# 4. Standardize the attributes
# At this point `df` is indexed as a 2-D array: the last column is kept aside as
# the class label and every other column is standardized.
# NOTE(review): the scaler is fitted on all rows before the train/test split done
# in a later cell, so test-set statistics influence the scaling -- consider
# fitting on the training rows only.
features, labels = df[:, :-1], df[:, -1]
n_features = df.shape[1] - 1
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(features), columns=range(n_features))
scaled_df['class'] = labels
# Rebind `df` to the scaled frame (features in columns 0..n_features-1, then 'class').
df = scaled_df
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,73,74,75,76,77,78,79,80,81,class
0,-0.671648,0.671648,-0.555533,0.555533,-0.550811,0.550811,0.550811,-0.550811,0.550811,-0.550811,...,0.303271,-0.055427,-0.297209,-0.056962,-0.96144,-0.295171,-0.302596,0.128682,-0.193125,+
1,1.488875,-1.488875,-0.555533,0.555533,-0.550811,0.550811,0.550811,-0.550811,0.550811,-0.550811,...,0.303271,-0.055427,-0.297209,2.296536,-0.073565,0.236217,0.704516,-0.816802,-0.086443,+
2,1.488875,-1.488875,-0.555533,0.555533,-0.550811,0.550811,0.550811,-0.550811,0.550811,-0.550811,...,0.303271,-0.055427,-0.297209,-0.592078,-0.861903,-0.220955,-0.504019,0.592504,-0.03615,+
3,-0.671648,0.671648,-0.555533,0.555533,-0.550811,0.550811,0.550811,-0.550811,0.550811,-0.550811,...,0.303271,-0.055427,-0.297209,-0.310572,-0.654865,0.44699,0.503093,-0.477855,-0.192553,+
4,-0.671648,0.671648,-0.555533,0.555533,-0.550811,0.550811,0.550811,-0.550811,0.550811,-0.550811,...,-3.297382,-0.055427,3.364633,-0.958122,0.158358,-0.158613,-0.504019,-0.358926,-0.193125,+


In [24]:
# 5. Split the dataset into training and test sets
# Every column except the last is a feature; 'class' is the target.
# 30% of the rows are held out for testing, with a fixed seed for reproducibility.
X = df.iloc[:, :-1]
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [25]:
# 6. Run the KNN algorithm and compute the accuracy
# The classifier is built with scikit-learn's default hyper-parameters.
knn = KNeighborsClassifier()
# fit() returns the estimator itself, so prediction can be chained onto it.
y_pred = knn.fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"A acurácia do modelo KNN é de {round(acc*100,2)}%.")

A acurácia do modelo KNN é de 82.65%.
