# Avaliação dos Resultados - Oscar IMDB

In [1]:
import os
import pandas as pd
import pickle


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

In [2]:
os.chdir("../tratados")
os.getcwd()

'/home/antero/Documentos/wrk/tratados'

In [3]:
with open('oscar_imdb.pkl', 'rb') as f:
    df_imdb_oscar=pickle.load(f)

## Importando o Dataframe

In [4]:
X_imdb_oscar = df_imdb_oscar.iloc[:, 5:7]

In [5]:
Y_imdb_oscar_nominee = df_imdb_oscar.iloc[:, 7]

In [6]:
Y_imdb_oscar_winner = df_imdb_oscar.iloc[:, 8]

### Escalonamento dos valores

In [7]:
scaler = StandardScaler()
X_imdb_oscar = scaler.fit_transform(X_imdb_oscar)

### Tratamento de atributos categóricos

In [8]:
label_encoder = LabelEncoder()
Y_imdb_oscar_nominee = label_encoder.fit_transform(Y_imdb_oscar_nominee)
Y_imdb_oscar_winner = label_encoder.fit_transform(Y_imdb_oscar_winner)

In [9]:
resultados_DummyClassifier = []
resultados_NaiveBayes = []
resultados_RegressaoLogistica = []
resultados_KNN = []
resultados_RandomForest = []
resultados_MPLC = []
resultados_SGDC = []
resultados_SupportVector = []

In [10]:
for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)

    DummyCl = DummyClassifier(strategy='most_frequent')
    scores = cross_val_score(DummyCl, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_DummyClassifier.append(scores.mean())

    NaiveBayes= GaussianNB()
    scores = cross_val_score(NaiveBayes, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_NaiveBayes.append(scores.mean())

    RegressaoLogistica = LogisticRegression(dual= False, fit_intercept=True, penalty='l1', solver='liblinear')
    scores = cross_val_score(RegressaoLogistica, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_RegressaoLogistica.append(scores.mean())

    knn = KNeighborsClassifier(algorithm='auto', n_neighbors=20, p=2)
    scores = cross_val_score(RegressaoLogistica, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_KNN.append(scores.mean())

    RandomForest = RandomForestClassifier(criterion='gini', max_features='sqrt', min_samples_leaf=10, min_samples_split=2, n_estimators=150)
    scores = cross_val_score(RandomForest, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_RandomForest.append(scores.mean())

    MPLC = MLPClassifier(hidden_layer_sizes=(5, 2), learning_rate='invscaling', max_iter=1500, solver='adam')
    scores = cross_val_score(MPLC, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_MPLC.append(scores.mean())

    SGDC = SGDClassifier(loss = 'squared_hinge', max_iter = 3000, penalty = 'elasticnet')
    scores = cross_val_score(SGDC, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_SGDC.append(scores.mean())

    SV = SVC(kernel = 'linear')
    scores = cross_val_score(SV, X_imdb_oscar, Y_imdb_oscar_winner, cv = kfold)
    resultados_SupportVector.append(scores.mean())

In [11]:
resultados = pd.DataFrame({'Dummy Classifier': resultados_DummyClassifier,
                          'Naive Bayes Classifier': resultados_NaiveBayes,
                          'Regressao Logistica': resultados_RegressaoLogistica,
                          'KNN':resultados_KNN,
                          'Random Forest': resultados_RandomForest,
                          'Multi-layer Perceptron': resultados_MPLC,
                          'Stochastic Gradient Descent': resultados_SGDC,
                          'Support Vector Classifier': resultados_SupportVector})

In [12]:
resultados.describe().round(8)

Unnamed: 0,Dummy Classifier,Naive Bayes Classifier,Regressao Logistica,KNN,Random Forest,Multi-layer Perceptron,Stochastic Gradient Descent,Support Vector Classifier
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.9975424,0.993938,0.997479,0.997479,0.997558,0.997529,0.997321,0.9975424
std,5.010666e-10,7e-06,4e-06,4e-06,9e-06,1.8e-05,0.000263,5.010666e-10
min,0.9975424,0.993921,0.997473,0.997473,0.997545,0.997484,0.996786,0.9975424
25%,0.9975424,0.993935,0.997477,0.997477,0.997552,0.997516,0.997149,0.9975424
50%,0.9975424,0.993939,0.997479,0.997479,0.997556,0.997532,0.997452,0.9975424
75%,0.9975424,0.993944,0.997482,0.997482,0.997562,0.997542,0.997514,0.9975424
max,0.9975424,0.993953,0.997486,0.997486,0.997578,0.997556,0.997551,0.9975424


In [4]:
resultados.describe().round(8)

Unnamed: 0,Dummy Classifier,Naive Bayes Classifier,Regressao Logistica,KNN,Random Forest,Multi-layer Perceptron,Stochastic Gradient Descent,Support Vector Classifier
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.997542,0.993938,0.997479,0.997479,0.997558,0.997529,0.997321,0.997542
std,0.0,7e-06,4e-06,4e-06,9e-06,1.8e-05,0.000263,0.0
min,0.997542,0.993921,0.997473,0.997473,0.997545,0.997484,0.996786,0.997542
25%,0.997542,0.993935,0.997477,0.997477,0.997552,0.997516,0.997149,0.997542
50%,0.997542,0.993939,0.997479,0.997479,0.997556,0.997532,0.997452,0.997542
75%,0.997542,0.993944,0.997482,0.997482,0.997562,0.997542,0.997514,0.997542
max,0.997542,0.993953,0.997486,0.997486,0.997578,0.997556,0.997551,0.997542
