In [1]:
from seaborn import load_dataset
from sklearn import preprocessing
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import metrics

In [2]:
label_encoder = preprocessing.LabelEncoder()

## Autor: Antonio Roberto P. de Lima Jr. 
## Git: https://github.com/AntonioJunior2222/
## Linkdn: https://www.linkedin.com/in/antonior-junior/

## Dataset Titanic do Seaborn

In [3]:
df_titanic = load_dataset('titanic')
df_titanic.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Alvo: Survived

In [4]:
df_mod = df_titanic
df_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


### _Tratamento dos Dados_

#### Usando Label Encoder para transformar 'alive' e 'class' em númericos, para assim checar se eles são iguais as colunas 'survived' e 'pclass'.

In [5]:
df_mod['alive'] = label_encoder.fit_transform(df_mod['alive'])
df_mod['class'] = label_encoder.fit_transform(df_mod['class'])

In [6]:
df_mod['class'] = np.where(df_mod['class'] == 2, 3, df_mod['class'])
df_mod['class'] = np.where(df_mod['class'] == 1, 2, df_mod['class'])
df_mod['class'] = np.where(df_mod['class'] == 0, 1, df_mod['class'])

#### Ao olhar para os valores de 'embark_town' percebi que poderiam se tratar dos mesmos valores de 'embarked', sendo assim, converti a coluna para verificar.

In [7]:
df_mod['embark_town'].value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [8]:
df_mod['embark_town'] = np.where(df_mod['embark_town'] == 'Southampton', 'S', df_mod['embark_town'])
df_mod['embark_town'] = np.where(df_mod['embark_town'] == 'Cherbourg', 'C', df_mod['embark_town'])
df_mod['embark_town'] = np.where(df_mod['embark_town'] == 'Queenstown', 'Q', df_mod['embark_town'])

In [9]:
check_if_same = df_mod['class']==df_mod['pclass']
check_if_same.value_counts()

True    891
dtype: int64

In [10]:
check_if_same = df_mod['alive']==df_mod['survived']
check_if_same.value_counts()

True    891
dtype: int64

In [11]:
check_if_same = df_mod['embark_town']==df_mod['embarked']
check_if_same.value_counts()

True     889
False      2
dtype: int64

#### De fato as 3 colunas estavam repetidas. Decidi fazer a exclusão das mesmas.

In [12]:
df_mod.drop(columns = ['alive','embark_town','class','deck','adult_male'], inplace = True)

#### Quanto as colunas 'Age' e 'Fare': Por terem ranges de valores muito grandes, optei pela normalização utilizando o MinMaxScaler.

In [13]:
df_mod['age'] = MinMaxScaler().fit_transform(np.array(df_mod['age']).reshape(-1,1))
df_mod['fare'] = MinMaxScaler().fit_transform(np.array(df_mod['fare']).reshape(-1,1))

#### Após isso decidi fazer utilizar o OneHotEncoder nas features categóricas.

In [14]:
column_transformer = make_column_transformer((OneHotEncoder(), ['sex','who','embarked']), remainder = 'passthrough')
df_mod = column_transformer.fit_transform(df_mod)
df_mod = pd.DataFrame(data=df_mod)

#### Infelizmente foi necessário excluir algumas linhas por excesso de NaN's

In [15]:
df_mod.dropna(inplace = True)

In [16]:
df_mod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       714 non-null    object
 1   1       714 non-null    object
 2   2       714 non-null    object
 3   3       714 non-null    object
 4   4       714 non-null    object
 5   5       714 non-null    object
 6   6       714 non-null    object
 7   7       714 non-null    object
 8   8       714 non-null    object
 9   9       714 non-null    object
 10  10      714 non-null    object
 11  11      714 non-null    object
 12  12      714 non-null    object
 13  13      714 non-null    object
 14  14      714 non-null    object
 15  15      714 non-null    object
dtypes: object(16)
memory usage: 94.8+ KB


In [17]:
df_mod

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,3,0.271174,1,0,0.014151,False
1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,1,0.472229,1,0,0.139136,False
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,3,0.321438,0,0,0.015469,True
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,1,0.434531,1,0,0.103644,False
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,3,0.434531,0,0,0.015713,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,3,0.484795,0,5,0.056848,False
886,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,2,0.334004,0,0,0.025374,True
887,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,1,0.233476,0,0,0.058556,True
889,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1,1,0.321438,0,0,0.058556,True


#### Separação de Target e Features

In [18]:
X_tt = df_mod.loc[:,1:]
y_tt = df_mod[0].astype(int)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train_tt, X_test_tt, y_train_tt, y_test_tt = train_test_split(X_tt, y_tt, test_size=0.2, random_state=None, stratify = y_tt) # 80% treino e 20% teste

#### Importação dos modelos escolhidos

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
model_Tree_1_tt = DecisionTreeClassifier(criterion = "entropy", max_depth = 5)
model_Tree_2_tt = DecisionTreeClassifier(criterion = "log_loss", max_depth = 15)
model_KNN_1_tt = KNeighborsClassifier(n_neighbors = 5, metric = "euclidean", algorithm = 'brute')
model_KNN_2_tt = KNeighborsClassifier(n_neighbors = 15, metric = "minkowski", algorithm = 'ball_tree')

In [22]:
model_Tree_1_tt.fit(X_train_tt, y_train_tt)
model_Tree_2_tt.fit(X_train_tt, y_train_tt)
model_KNN_1_tt.fit(X_train_tt, y_train_tt)
model_KNN_2_tt.fit(X_train_tt, y_train_tt)

In [23]:
result_tt_Tree_1 = model_Tree_1_tt.predict(X_test_tt)
result_tt_Tree_2 = model_Tree_2_tt.predict(X_test_tt)
result_tt_KNN_1 = model_KNN_1_tt.predict(X_test_tt)
result_tt_KNN_2 = model_KNN_2_tt.predict(X_test_tt)

#### Resultado Final ao aplicar os Modelos

In [24]:
precisao_tt_Tree_1 = metrics.accuracy_score(result_tt_Tree_1, y_test_tt)
show1_tt = round(precisao_tt_Tree_1 * 100)
print(f"Precisão Tree 1: {show1_tt}%")

precisao_tt_Tree_2 = metrics.accuracy_score(result_tt_Tree_2, y_test_tt)
show2_tt = round(precisao_tt_Tree_2 * 100)
print(f"Precisão Tree 2: {show2_tt}%")


precisao_tt_KNN_1 = metrics.accuracy_score(result_tt_KNN_1, y_test_tt)
show3_tt = round(precisao_tt_KNN_1 * 100)
print(f"Precisão KNN 1: {show3_tt}%")

precisao_tt_KNN_2 = metrics.accuracy_score(result_tt_KNN_2, y_test_tt)
show4_tt = round(precisao_tt_KNN_2 * 100)
print(f"Precisão KNN 2: {show4_tt}%")

Precisão Tree 1: 100%
Precisão Tree 2: 100%
Precisão KNN 1: 98%
Precisão KNN 2: 95%


## Dataset Transfusão em github tmoura

In [25]:
df_transfusao = pd.read_csv("https://raw.githubusercontent.com/tmoura/machinelearning/master/transfusion.data", header = None)

In [26]:
df_transfusao_mod = df_transfusao

In [47]:
df_transfusao_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       748 non-null    int64  
 1   1       748 non-null    float64
 2   2       748 non-null    float64
 3   3       748 non-null    float64
 4   4       748 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 29.3 KB


In [48]:
for i in range(1,5):
    df_transfusao_mod[i] = MinMaxScaler().fit_transform(np.array(df_transfusao_mod[i]).reshape(-1,1))
df_transfusao_mod

Unnamed: 0,0,1,2,3,4
0,1,0.027027,1.000000,1.000000,1.000000
1,1,0.000000,0.244898,0.244898,0.270833
2,1,0.013514,0.306122,0.306122,0.343750
3,1,0.027027,0.387755,0.387755,0.447917
4,2,0.013514,0.469388,0.469388,0.781250
...,...,...,...,...,...
743,2,0.310811,0.020408,0.020408,0.375000
744,2,0.283784,0.020408,0.020408,0.520833
745,2,0.310811,0.040816,0.040816,0.625000
746,2,0.527027,0.000000,0.000000,0.385417


In [49]:
X_trn = df_transfusao_mod.loc[:,1:]
y_trn = df_transfusao_mod[0]

In [50]:
X_trn

Unnamed: 0,1,2,3,4
0,0.027027,1.000000,1.000000,1.000000
1,0.000000,0.244898,0.244898,0.270833
2,0.013514,0.306122,0.306122,0.343750
3,0.027027,0.387755,0.387755,0.447917
4,0.013514,0.469388,0.469388,0.781250
...,...,...,...,...
743,0.310811,0.020408,0.020408,0.375000
744,0.283784,0.020408,0.020408,0.520833
745,0.310811,0.040816,0.040816,0.625000
746,0.527027,0.000000,0.000000,0.385417


In [51]:
y_trn

0      1
1      1
2      1
3      1
4      2
      ..
743    2
744    2
745    2
746    2
747    2
Name: 0, Length: 748, dtype: int64

In [52]:
X_train_trn, X_test_trn, y_train_trn, y_test_trn = train_test_split(X_trn, y_trn, test_size=0.2, random_state=None, stratify = y_trn) # 80% treino e 20% teste

In [53]:
model_Tree_1_trn = DecisionTreeClassifier(criterion = "entropy", max_depth = 5)
model_Tree_2_trn = DecisionTreeClassifier(criterion = "log_loss", max_depth = 15)
model_KNN_1_trn = KNeighborsClassifier(n_neighbors = 5, metric = "euclidean", algorithm = 'brute')
model_KNN_2_trn = KNeighborsClassifier(n_neighbors = 15, metric = "minkowski", algorithm = 'ball_tree')

In [54]:
model_Tree_1_trn.fit(X_train_trn, y_train_trn)
model_Tree_2_trn.fit(X_train_trn, y_train_trn)
model_KNN_1_trn.fit(X_train_trn, y_train_trn)
model_KNN_2_trn.fit(X_train_trn, y_train_trn)

In [55]:
result_trn_Tree_1 = model_Tree_1_trn.predict(X_test_trn)
result_trn_Tree_2 = model_Tree_2_trn.predict(X_test_trn)
result_trn_KNN_1 = model_KNN_1_trn.predict(X_test_trn)
result_trn_KNN_2 = model_KNN_2_trn.predict(X_test_trn)

In [56]:
precisao_trn_Tree_1 = metrics.accuracy_score(result_trn_Tree_1, y_test_trn)
show1_trn = round(precisao_trn_Tree_1 * 100)
print(f"Precisão Tree 1: {show1_trn}%")

precisao_trn_Tree_2 = metrics.accuracy_score(result_trn_Tree_2, y_test_trn)
show2_trn = round(precisao_trn_Tree_2 * 100)
print(f"Precisão Tree 2: {show2_trn}%")


precisao_trn_KNN_1 = metrics.accuracy_score(result_trn_KNN_1, y_test_trn)
show3_trn = round(precisao_trn_KNN_1 * 100)
print(f"Precisão KNN 1: {show3_trn}%")

precisao_trn_KNN_2 = metrics.accuracy_score(result_trn_KNN_2, y_test_trn)
show4_trn = round(precisao_trn_KNN_2 * 100)
print(f"Precisão KNN 2: {show4_trn}%")

Precisão Tree 1: 83%
Precisão Tree 2: 71%
Precisão KNN 1: 78%
Precisão KNN 2: 81%
