# Projeto 3 - Ciência dos Dados

# Classificação Naive-Bayes

### Alunos:
- Felipe Junqueira
- Giovana Campedelli
- Gabriela Choichit
- João Roxo



In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import preprocessing


In [3]:
#Convertendo as colunas qualitativas em quantitativas.
def dummify(data, column_name):
    """
        Converte a coluna column_name em dummies / one-hot e as adiciona ao dataframe
        retorna uma copia do  df original *sem* a coluna que foi dummified
    """
    df = data.copy()
    df2 = pd.concat([df.drop(column_name, axis=1), pd.get_dummies(data[column_name], prefix=column_name)], axis=1)
    return df2

In [4]:
x= ['top-left','top-middle','top-right','middle-left','middle-middle','middle-right','bottom-left','bottom-middle','bottom-right','class']


dados_velha = pd.read_excel('tic-tac-toe.xlsx', header=None, names=x)
dados_velha.head()


Unnamed: 0,top-left,top-middle,top-right,middle-left,middle-middle,middle-right,bottom-left,bottom-middle,bottom-right,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [5]:
#checando quantas saídas são postivas e quantas são negativas.
dados_velha["class"].value_counts()

positive    626
negative    332
Name: class, dtype: int64

In [6]:
feature_cols = ['top-left','top-middle','top-right','middle-left','middle-middle','middle-right','bottom-left','bottom-middle','bottom-right']
X = dados_velha[feature_cols] #features
y = dados_velha['class']

In [7]:
#começando a aplicação do naive bayes.
dv = X.copy()

In [8]:
for f in feature_cols:
    dv = dummify(dv, f)

In [9]:
dv.head()

Unnamed: 0,top-left_b,top-left_o,top-left_x,top-middle_b,top-middle_o,top-middle_x,top-right_b,top-right_o,top-right_x,middle-left_b,...,middle-right_x,bottom-left_b,bottom-left_o,bottom-left_x,bottom-middle_b,bottom-middle_o,bottom-middle_x,bottom-right_b,bottom-right_o,bottom-right_x
0,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,1,0
1,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
2,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
3,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,1,0,0
4,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(dv, y, test_size=0.25, random_state=1) # 75% training and 25% 

In [11]:
# Cria modelo.
model = Pipeline([
    ("classifier", MultinomialNB())
])

In [12]:
# Treina modelo.
#model = MultinomialNB()
m = model.fit(X_train, y_train)
m

Pipeline(memory=None,
         steps=[('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [13]:
v = m.predict(X_test)

In [14]:
#calculando acurácia do modelo.
acc = accuracy_score(y_test, v)
print("Acurácia: {}".format(acc))

Acurácia: 0.7583333333333333


___

In [15]:
#aplicando o modelo novamente, porém alterando a base de treinamento e teste.
X_train1, X_test1, y_train1, y_test1 = train_test_split(dv, y, test_size=0.15, random_state=1) # 85% training and 15% 

X_train2, X_test2, y_train2, y_test2 = train_test_split(dv, y, test_size=0.10, random_state=1) # 90% training and 10% 

X_train3, X_test3, y_train3, y_test3 = train_test_split(dv, y, test_size=0.05, random_state=1) # 95% training and 05% 

In [16]:
m1 = model.fit(X_train1, y_train1)
m2 = model.fit(X_train2, y_train2)
m3 = model.fit(X_train3, y_train3)

In [17]:
v1 = m1.predict(X_test1)
v2 = m2.predict(X_test2)
v3 = m3.predict(X_test3)

In [18]:
#checando a acurácia das iterações.

In [19]:
acc = accuracy_score(y_test1, v1)
print("Acurácia para 85% de treinamento: {}".format(acc))

Acurácia para 85% de treinamento: 0.7569444444444444


In [20]:
acc = accuracy_score(y_test2, v2)
print("Acurácia para 90% de treinamento: {}".format(acc))

Acurácia para 90% de treinamento: 0.7083333333333334


In [21]:
acc = accuracy_score(y_test3, v3)
print("Acurácia para 95% de treinamento: {}".format(acc))

Acurácia para 95% de treinamento: 0.75


___

In [22]:
#removendo a coluna de baixo no meio e aplicando o naive bayes.

In [23]:
bottommiddle = dv.drop(['bottom-middle_b','bottom-middle_o','bottom-middle_x'],axis='columns')

In [24]:
bottommiddle.head()


Unnamed: 0,top-left_b,top-left_o,top-left_x,top-middle_b,top-middle_o,top-middle_x,top-right_b,top-right_o,top-right_x,middle-left_b,...,middle-middle_x,middle-right_b,middle-right_o,middle-right_x,bottom-left_b,bottom-left_o,bottom-left_x,bottom-right_b,bottom-right_o,bottom-right_x
0,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
1,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
2,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,1
3,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,1,0,0
4,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,1,0,0


In [25]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(bottommiddle, y, test_size=0.25, random_state=1) # 75% training and 25% 

In [26]:
# Cria modelo.
model = Pipeline([
    ("classifier", MultinomialNB())
])

In [27]:
# Treina modelo.
#model = MultinomialNB()
m4 = model.fit(X_train4, y_train4)
m4


Pipeline(memory=None,
         steps=[('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [28]:
v4 = m4.predict(X_test4)

In [29]:
acc2 = accuracy_score(y_test4, v4)
print("Acurácia: {}".format(acc2))

Acurácia: 0.75


In [30]:
#Acuráccia permanece quase a mesma pelo fato da posição não ser muito utilizada

___

In [31]:
#removendo a coluna do meio no meio e aplicando o naive bayes.

In [32]:
middlemiddle = dv.drop(['middle-middle_x', 'middle-middle_o','middle-middle_b'],axis='columns')

In [33]:
X_train5, X_test5, y_train5, y_test5 = train_test_split(middlemiddle, y, test_size=0.25, random_state=1) # 75% training and 25%

In [34]:
# Cria modelo.
model = Pipeline([
    ("classifier", MultinomialNB())
])

In [35]:
# Treina modelo. 
#model = MultinomialNB()
m5 = model.fit(X_train5, y_train5)
m5

Pipeline(memory=None,
         steps=[('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [36]:
v5 = m5.predict(X_test5)

In [37]:
acc3 = accuracy_score(y_test5, v5)
print("Acurácia: {}".format(acc3))

Acurácia: 0.6958333333333333


In [38]:
#Acurácia diminui pelo fato da posição ser muito utilizada.

___

<img src="jogo_velha.jpg" height=100 width=200/>

In [39]:
#removendo a coluna de cima na esquerda e direita e a de baixo da esquerda, em seguida aplicando o naive bayes.

In [40]:
jogada = dv.drop(['top-left_b','top-left_o','top-left_x','bottom-left_b','bottom-left_o','bottom-left_x','top-right_b','top-right_o','top-right_x'],axis='columns')

In [41]:
X_train6, X_test6, y_train6, y_test6 = train_test_split(jogada, y, test_size=0.25, random_state=1) # 75% training and 25%

In [42]:
# Cria modelo.
model = Pipeline([
    ("classifier", MultinomialNB())
])

In [43]:
# Treina modelo.
#model = MultinomialNB()
m6 = model.fit(X_train6, y_train6)
m6


Pipeline(memory=None,
         steps=[('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [44]:
v6 = m6.predict(X_test6)

In [45]:
acc4 = accuracy_score(y_test6, v6)
print("Acurácia: {}".format(acc4))

Acurácia: 0.725


___

<img src="jogo_velha2.jpg" height=100 width=200/>

In [46]:
#removendo a coluna de cima na esquerda e direita, a de baixo da esquerda e a do meio, em seguida aplicando o naive bayes.

In [47]:
jogada2 = dv.drop(['top-left_b','top-left_o','top-left_x','bottom-left_b','bottom-left_o','bottom-left_x','top-right_b','top-right_o','top-right_x','middle-middle_b','middle-middle_o','middle-middle_x'],axis='columns')

In [48]:
X_train7, X_test7, y_train7, y_test7 = train_test_split(jogada2, y, test_size=0.25, random_state=1) # 75% training and 25%

In [49]:
# Treina modelo.
#model = MultinomialNB()
m7 = model.fit(X_train7, y_train7)
m7

Pipeline(memory=None,
         steps=[('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [50]:
v7 = m7.predict(X_test7)

In [51]:
acc5 = accuracy_score(y_test7, v7)
print("Acurácia: {}".format(acc5))

Acurácia: 0.6916666666666667


___

In [52]:
#removendo todas as colunas menos a de cima da esquerda e do meio no meio, em seguida aplicando o naive bayes.

In [53]:
bituga2= dv.drop(['bottom-right_o','bottom-right_x','top-right_b','top-right_o','top-right_x', 'top-middle_b','top-middle_o', 'top-middle_x','middle-left_b','middle-left_o','middle-left_x','middle-right_b','middle-right_o','middle-right_x', 'bottom-left_b','bottom-left_o','bottom-left_x','bottom-middle_b','bottom-middle_o','bottom-middle_x'],axis='columns')

In [54]:
X_train8, X_test8, y_train8, y_test8 = train_test_split(bituga2, y, test_size=0.25, random_state=1) # 75% training and 25%

In [55]:
# Treina modelo.
#model = MultinomialNB()
m8 = model.fit(X_train8, y_train8)
m8

Pipeline(memory=None,
         steps=[('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [56]:
v8 = m8.predict(X_test8)

In [57]:
acc6 = accuracy_score(y_test8, v8)
print("Acurácia: {}".format(acc6))

Acurácia: 0.6958333333333333
