# Avaliação dos Resultados - Oscar IMDB

In [1]:
import os
import pandas as pd
import pickle


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

In [3]:
os.chdir("../tratados")
os.getcwd()

'/home/antero/Documentos/tcc/tratados'

In [3]:
with open('oscar_imdb.pkl', 'rb') as f:
    df_imdb_oscar=pickle.load(f)

## Importando o Dataframe

In [4]:
X_imdb_oscar = df_imdb_oscar.iloc[:, 5:7]

In [5]:
Y_imdb_oscar_nominee = df_imdb_oscar.iloc[:, 7]

In [6]:
Y_imdb_oscar_winner = df_imdb_oscar.iloc[:, 8]

### Escalonamento dos valores

In [7]:
scaler = StandardScaler()
X_imdb_oscar = scaler.fit_transform(X_imdb_oscar)

### Tratamento de atributos categóricos

In [8]:
label_encoder = LabelEncoder()
Y_imdb_oscar_nominee = label_encoder.fit_transform(Y_imdb_oscar_nominee)
Y_imdb_oscar_winner = label_encoder.fit_transform(Y_imdb_oscar_winner)

In [9]:
resultados_DummyClassifier = []
resultados_NaiveBayes = []
resultados_RegressaoLogistica = []
resultados_KNN = []
resultados_RandomForest = []
resultados_MPLC = []
resultados_SGDC = []
resultados_SupportVector = []

In [10]:
for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)

    DummyCl = DummyClassifier(strategy='most_frequent')
    scores = cross_val_score(DummyCl, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_DummyClassifier.append(scores.mean())

    NaiveBayes= GaussianNB()
    scores = cross_val_score(NaiveBayes, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_NaiveBayes.append(scores.mean())

    RegressaoLogistica = LogisticRegression(dual=True, fit_intercept=True, penalty='l2', solver='liblinear')
    scores = cross_val_score(RegressaoLogistica, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_RegressaoLogistica.append(scores.mean())

    knn = KNeighborsClassifier(algorithm='auto', n_neighbors=10, p=1)
    scores = cross_val_score(RegressaoLogistica, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_KNN.append(scores.mean())

    RandomForest = RandomForestClassifier(criterion='entropy', max_features='auto', min_samples_leaf=10, min_samples_split=5, n_estimators=150)
    scores = cross_val_score(RandomForest, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_RandomForest.append(scores.mean())

    MPLC = MLPClassifier(hidden_layer_sizes=(5, 2), learning_rate='adaptive', max_iter=2000, solver='sgd')
    scores = cross_val_score(MPLC, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_MPLC.append(scores.mean())

    SGDC = SGDClassifier(loss='hinge', max_iter=3000, penalty='l1')
    scores = cross_val_score(SGDC, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_SGDC.append(scores.mean())

    SV = SVC(kernel='linear')
    scores = cross_val_score(SV, X_imdb_oscar, Y_imdb_oscar_nominee, cv = kfold)
    resultados_SupportVector.append(scores.mean())

In [11]:
resultados = pd.DataFrame({'Dummy Classifier': resultados_DummyClassifier,
                          'Naive Bayes Classifier': resultados_NaiveBayes,
                          'Regressao Logistica': resultados_RegressaoLogistica,
                          'KNN':resultados_KNN,
                          'Random Forest': resultados_RandomForest,
                          'Multi-layer Perceptron': resultados_MPLC,
                          'Stochastic Gradient Descent': resultados_SGDC,
                          'Support Vector Classifier': resultados_SupportVector})

In [8]:
resultados.describe()

Unnamed: 0,Dummy Classifier,Naive Bayes Classifier,Regressao Logistica,KNN,Random Forest,Multi-layer Perceptron,Stochastic Gradient Descent,Support Vector Classifier
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.9912902,0.988532,0.991421,0.991421,0.991601,0.991357,0.991294,0.9912902
std,5.8443e-10,7e-06,7e-06,1.2e-05,2.2e-05,4.3e-05,4.3e-05,5.8443e-10
min,0.9912902,0.98852,0.991403,0.991401,0.991536,0.991265,0.991216,0.9912902
25%,0.9912902,0.988527,0.991417,0.991414,0.991594,0.991329,0.991253,0.9912902
50%,0.9912902,0.988531,0.991422,0.991423,0.991604,0.991356,0.991309,0.9912902
75%,0.9912902,0.988537,0.991425,0.991425,0.991614,0.991382,0.991326,0.9912902
max,0.9912902,0.988545,0.991434,0.991459,0.991635,0.991441,0.991358,0.9912902


In [22]:
resultados.describe().round(8)

Unnamed: 0,Dummy Classifier,Naive Bayes Classifier,Regressao Logistica,KNN,Random Forest,Multi-layer Perceptron,Stochastic Gradient Descent,Support Vector Classifier
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.99129,0.988532,0.991421,0.991421,0.991601,0.991357,0.991294,0.99129
std,0.0,7e-06,7e-06,1.2e-05,2.2e-05,4.3e-05,4.3e-05,0.0
min,0.99129,0.98852,0.991403,0.991401,0.991536,0.991265,0.991216,0.99129
25%,0.99129,0.988527,0.991417,0.991414,0.991594,0.991329,0.991253,0.99129
50%,0.99129,0.988531,0.991422,0.991423,0.991604,0.991356,0.991309,0.99129
75%,0.99129,0.988537,0.991425,0.991425,0.991614,0.991382,0.991326,0.99129
max,0.99129,0.988545,0.991434,0.991459,0.991635,0.991441,0.991358,0.99129
