## Machine learning (Naive Bayes)

### Para este exemplo, está sendo usado o dataset "ADULT DATA SET" que pode ser encontrado no UCI:

https://archive.ics.uci.edu/ml/datasets/Adult

### Passo 1: Importar o dataset

In [1]:
import pandas as pd

In [2]:
names = ['AGE', 'WORKCLASS', 'FNLWGT', 'EDUCATION', 'EDUCATION_NUM', 'MARITAL_STATUS', 
         'OCCUPATION', 'RELATIONSHIP', 'RACE', 'SEX', 'CAPITAL_GAIN', 'CAPITAL_LOSS', 
         'HOURS_PER_WEEK', 'NATIVE_COUNTRY', 'TARGET']

dataset = pd.read_csv("adult.data", names = names)

print("########################################## DATASET ##############################################")
print("")
print(dataset.head())

print("")
print("")
print("##################################### ASPECTOS GERAIS ##########################################")
print("")
print(dataset.info())


########################################## DATASET ##############################################

   AGE          WORKCLASS  FNLWGT   EDUCATION  EDUCATION_NUM  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        MARITAL_STATUS          OCCUPATION    RELATIONSHIP    RACE      SEX  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   CAPITAL_GAIN  CAPITAL_LOSS  HOURS_PER_

### Passo 2.1: Pré-processamento dos dados: Quantidade de classificações em cada coluna

In [3]:
for i, columns in enumerate(names):
    
    types = dataset[columns].dtype
    
    if (types == object):
        
        count = dataset.pivot_table(index=[columns], aggfunc='size')
        
        print("")
        print(count)


WORKCLASS
 ?                    1836
 Federal-gov           960
 Local-gov            2093
 Never-worked            7
 Private             22696
 Self-emp-inc         1116
 Self-emp-not-inc     2541
 State-gov            1298
 Without-pay            14
dtype: int64

EDUCATION
 10th              933
 11th             1175
 12th              433
 1st-4th           168
 5th-6th           333
 7th-8th           646
 9th               514
 Assoc-acdm       1067
 Assoc-voc        1382
 Bachelors        5355
 Doctorate         413
 HS-grad         10501
 Masters          1723
 Preschool          51
 Prof-school       576
 Some-college     7291
dtype: int64

MARITAL_STATUS
 Divorced                  4443
 Married-AF-spouse           23
 Married-civ-spouse       14976
 Married-spouse-absent      418
 Never-married            10683
 Separated                 1025
 Widowed                    993
dtype: int64

OCCUPATION
 ?                    1843
 Adm-clerical         3770
 Armed-Forces         

### Passo 2.2: Pré-processamento dos dados: removendo espaços em branco

In [4]:
dataset.columns = dataset.columns.str.replace(' ', '')

### Passo 2.3: Pré-processamento dos dados: removendo classicações entituladas com ?

In [5]:
for i, columns in enumerate(names):
    
    types = dataset[columns].dtype
    
    if (types == object):
        
        dataset2 = dataset[(dataset[columns] != "?")]
        dataset = dataset2
        
print(dataset2.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   AGE             32561 non-null  int64 
 1   WORKCLASS       32561 non-null  object
 2   FNLWGT          32561 non-null  int64 
 3   EDUCATION       32561 non-null  object
 4   EDUCATION_NUM   32561 non-null  int64 
 5   MARITAL_STATUS  32561 non-null  object
 6   OCCUPATION      32561 non-null  object
 7   RELATIONSHIP    32561 non-null  object
 8   RACE            32561 non-null  object
 9   SEX             32561 non-null  object
 10  CAPITAL_GAIN    32561 non-null  int64 
 11  CAPITAL_LOSS    32561 non-null  int64 
 12  HOURS_PER_WEEK  32561 non-null  int64 
 13  NATIVE_COUNTRY  32561 non-null  object
 14  TARGET          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB
None


### Passo 2.4: Pré-processamento dos dados: Criando variáveis dummy

In [6]:
classificacoes = []

dataset3 = dataset2 #O dataset3 contêm os valores categóricos. O dataset2 irá conter as variáveis do tipo dummy

for columns in names:
    
    types = dataset2[columns].dtype
    
    if (types == object):
        
        categorias = pd.unique(dataset2[columns]) #Identificando as categorias únicas de cada coluna
        classificacoes.append(sorted(categorias)) #Armazenando as categorias em ordem alfabética (Caso queira olhar)
        dummy = []
        
        for i, categoria in enumerate(categorias):
            
            dummy.append(i)
            
        dataset2[columns] = dataset2[columns].replace(categorias, dummy)
        
print(dataset2.head())            

   AGE  WORKCLASS  FNLWGT  EDUCATION  EDUCATION_NUM  MARITAL_STATUS  \
0   39          0   77516          0             13               0   
1   50          1   83311          0             13               1   
2   38          2  215646          1              9               2   
3   53          2  234721          2              7               1   
4   28          2  338409          0             13               1   

   OCCUPATION  RELATIONSHIP  RACE  SEX  CAPITAL_GAIN  CAPITAL_LOSS  \
0           0             0     0    0          2174             0   
1           1             1     0    0             0             0   
2           2             0     0    0             0             0   
3           2             1     1    0             0             0   
4           3             2     1    1             0             0   

   HOURS_PER_WEEK  NATIVE_COUNTRY  TARGET  
0              40               0       0  
1              13               0       0  
2              40   

### Passo 3: Ajuste do modelo de classificação

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

target = dataset2['TARGET']

#--------------------Separando dados de treino e teste--------------------------#
X_train, X_test, Y_train, Y_test= train_test_split(dataset2.drop(columns=['TARGET', 'FNLWGT']), target,
                                                   test_size= 0.3, random_state= 0)

#---------------------------- Ajuste do modelo ---------------------------------#
gnb = GaussianNB()
modelo = gnb.fit(X_train, Y_train)

#-------------------- Predições com dados de treino -----------------------------#
previsoes_train = modelo.predict(X_train)

#-------------------- Predições com dados de teste -----------------------------#
previsoes_test = modelo.predict(X_test)

#---------------------- Acurácia nos dados de treino ----------------------------#
acuracia_train = accuracy_score(Y_train, previsoes_train)

#---------------------- Acurácia nos dados de teste ----------------------------#
acuracia_test = accuracy_score(Y_test, previsoes_test)

#---------------------- Matriz de confusão nos dados de treino ----------------------------#
matriz_train = confusion_matrix(Y_train, previsoes_train)

#---------------------- Matriz de confusão nos dados de teste ----------------------------#
matriz_test = confusion_matrix(Y_test, previsoes_test)


print("Acurácia para os dados de treino: ", acuracia_train)
print("###################### Matriz de confusão para os dados de treino #########################")
print(matriz_train)

print("")
print("Acurácia para os dados de teste: ", acuracia_test)
print("###################### Matriz de confusão para os dados de teste #########################")
print(matriz_test)



Acurácia para os dados de treino:  0.8035714285714286
###################### Matriz de confusão para os dados de treino #########################
[[16423   890]
 [ 3587  1892]]

Acurácia para os dados de teste:  0.8085781553894974
###################### Matriz de confusão para os dados de teste #########################
[[7044  363]
 [1507  855]]
