In [None]:
import pandas as pd
import re
from sklearn.utils import shuffle
import random as rand
import numpy as np
import math
import sklearn.naive_bayes as nB
from sklearn.model_selection import KFold, cross_val_score, cross_validate

In [None]:
# Load the pre-cleaned games table from a CSV file in the working directory.
data = pd.read_csv('games_clean.csv')
# NOTE(review): `random` is not referenced by any cell visible here (the later
# train/test split passes its own random_state=404) — confirm it is still needed.
random = 24
# Bare expression: the notebook renders the DataFrame as this cell's output.
data

In [None]:
# Impute missing values column by column: every column except the first one is
# filled with its own mean. The rows that are NaN are printed before and after
# the fill so the effect can be checked by eye (the "After" selection should be
# empty).
# NOTE(review): assumes every column after the first is numeric, since `.mean()`
# is taken on each of them — confirm against the CSV schema.
for name in data.columns[1:]:
    print("Before", name + '\n', data.loc[data[name].isna(), name])
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the `data[name]` selection: the in-place call on a selected column is a
    # chained assignment, which recent pandas warns about and which does not
    # update `data` once Copy-on-Write is active.
    data[name] = data[name].fillna(data[name].mean())
    print("After", name + '\n', data.loc[data[name].isna(), name])

In [None]:
# Total number of missing cells in the whole frame: per-column NaN counts,
# then summed across columns.
data.isna().sum().sum()

In [None]:
# Number of rows for each distinct value of the 'Metacritic' column.
data['Metacritic'].value_counts()

In [None]:
# Distinct values found in the 'Metacritic' column; this column is used as the
# classification target further down.
Labels = data['Metacritic'].unique()
Labels

In [None]:
from scipy.stats import boxcox

# Columns that get a Box-Cox power transform.
exponential = ['Presence', 'OriginalCost', 'Achievements', 'Storage', 'RatingsBreakdown-Recommended', 'RatingsBreakdown-Meh', 'RatingsBreakdown-Exceptional', 'RatingsBreakdown-Skip']
# Fitted Box-Cox lambda for each transformed column, keyed by column name.
lambdas = {}

# NOTE: this cell overwrites the listed columns of `data`, so running it a
# second time transforms values that were already transformed.
for name in exponential:
    # boxcox requires strictly positive input, so every value is shifted by a
    # tiny constant first. The shift is applied to the whole column at once
    # instead of element by element through `.apply`.
    shifted = data[name] + 1 * 10 ** (-10)
    # With no lambda given, boxcox returns the transformed values together
    # with the lambda it fitted.
    transformed, fitted_lambda = boxcox(shifted)
    # Replace the column instead of writing through `.loc[:, name]`, so the
    # float result is never forced into an integer-typed column in place.
    data[name] = transformed
    print(name, fitted_lambda)
    lambdas[name] = fitted_lambda

In [None]:
# Disabled alternative transform: a log2 (with a 0.000001 offset) over a wider
# column list. It is wrapped in a string literal, so none of it executes; the
# cell only evaluates the string itself.
'''for name in ['Presence', 'Memory', 'ReleaseDate', 'OriginalCost', 'DiscountedCost', 'Achievements', 'Storage', 'RatingsBreakdown-Recommended', 'RatingsBreakdown-Meh', 'RatingsBreakdown-Exceptional', 'RatingsBreakdown-Skip']:
    data.loc[:,name] = data[name].apply(lambda x: np.log2(x+0.000001))'''

In [None]:
import matplotlib.pyplot as plt

# Subset of columns whose distributions are inspected visually.
data_analysis = data[['Presence',
'Memory',
'ReleaseDate',
'OriginalCost',
'DiscountedCost',
'Achievements',
'Storage',
'RatingsBreakdown-Recommended',
'RatingsBreakdown-Meh',
'RatingsBreakdown-Exceptional',
'RatingsBreakdown-Skip']]

# Overview 1: a grid of histograms, one subplot per column.
hist = data_analysis.hist(bins=50, figsize=(20,20))

# Overview 2: all columns side by side in a single box plot. It gets its own
# figure: a bare `boxplot()` call draws on pyplot's current axes, which at this
# point belong to the histogram grid created just above.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
box = data_analysis.boxplot(ax=box_ax)
# One tick label per column on a shared x axis, so the labels are rotated. The
# former follow-up call with rotation='horizontal' undid this rotation
# immediately and has been removed.
plt.xticks(rotation='vertical')

# Per-column detail: a narrow box plot next to a wider histogram.
for name in data_analysis.columns:
    fig, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9,5))
    data_analysis.boxplot(column=name, ax=axes[0])
    data_analysis.hist(column=name, ax=axes[1])
plt.show()

In [None]:
# List the distinct values of every column. Each column name is printed first,
# wrapped in ANSI escape codes that switch bold on and back off.
for name in data.columns:
    heading = '\033[1m' + name + '\033[0;0m'
    distinct_values = data[name].unique()
    print(heading)
    print(distinct_values)

In [None]:
from sklearn import preprocessing

In [None]:
# Map the distinct 'Metacritic' values collected in `Labels` to integer codes.
# `encoder` is kept so the codes can be turned back into labels later with
# inverse_transform.
encoder = preprocessing.LabelEncoder()
Labels_encoded = encoder.fit_transform(Labels)
Labels_encoded

In [None]:
# Features: every column of `data` except the 'Metacritic' target.
data_input = data.drop(columns='Metacritic')
# Target: the 'Metacritic' column as integer codes (the encoder is refitted on
# the full column here).
data_expected = encoder.fit_transform(data['Metacritic'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_input,
    data_expected,
    test_size=0.3,
    random_state=404,
)

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [None]:
# Instantiate three Naive Bayes variants. In the cells visible here `gnb` is
# fitted on the train/test split and `bnb` is used in the cross-validation
# sweeps; `mnb` is created but not referenced afterwards.
gnb = GaussianNB()
bnb = BernoulliNB(alpha=0.2)
mnb = MultinomialNB(alpha=1)

In [None]:
# Inspect the dtype of the target column.
data['Metacritic'].dtype

In [None]:
# Fit the Gaussian Naive Bayes model on the training split.
gnb.fit(X_train, y_train)

In [None]:
# Predicted (integer-encoded) classes for the held-out rows.
y_pred = gnb.predict(X_test)

In [None]:
# Decode the predictions back to the original 'Metacritic' labels and count how
# often each label was predicted.
pd.Series(encoder.inverse_transform(y_pred)).value_counts() 

In [None]:
# Turn the predictions into a single-column 2-D array. The row count is
# inferred (-1) instead of being hard-coded to 3000, so this keeps working when
# the dataset size or the test fraction changes; for 3000 predictions the
# result is the same (3000, 1) array as before.
y_pred = y_pred.reshape(-1, 1)

In [None]:
# Turn the held-out targets into a single-column 2-D array. The row count is
# inferred (-1) instead of being hard-coded to 3000, so this keeps working when
# the dataset size or the test fraction changes; for 3000 targets the result is
# the same (3000, 1) array as before.
y_test = y_test.reshape(-1, 1)

In [None]:
# Show the first five predictions next to the first five true targets.
print(y_pred[:5], y_test[:5])

In [None]:
from sklearn.metrics import log_loss, accuracy_score

In [None]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the predictions on the held-out rows.
print(classification_report(y_test, y_pred))

In [None]:
# Sweep the number of folds from 20 to 99 on the training split only.
# For each fold count: cross-validate `bnb`, report the best and the mean fold
# accuracy, record the mean in `all_accuracy_split`, then score the estimator
# fitted for the best fold against the hold-out test set.
# `bnb` is rebound to that best-fold estimator on every pass, so the object
# handed to cross_validate from the second pass on is the previous pass's pick.
all_accuracy_split = []
for n_folds in range(20, 100):
    print('')
    print('*-----------* KFOLD', n_folds, '*-----------*')
    cv = KFold(n_splits=n_folds, shuffle=False)
    cv_scores = cross_validate(bnb, X=X_train, y=y_train, cv=cv, scoring='accuracy', return_estimator=True)
    fold_scores = cv_scores['test_score']
    # (score, index) tuples compare lexicographically, so a tie on the score is
    # resolved in favour of the later fold.
    best_score, best_fold = max((score, fold) for fold, score in enumerate(fold_scores))
    mean_score = np.mean(fold_scores)
    print('Max Accuracy in train', '--->', best_score)
    print('Mean Accuracy in train', '--->', mean_score)
    all_accuracy_split.append(mean_score)
    bnb = cv_scores['estimator'][best_fold]
    print('Max Accuracy in test', '--->', accuracy_score(y_test, bnb.predict(X_test)))
    print('')

In [None]:
# Same fold-count sweep (20 to 99) as on the training split, but
# cross-validating on the full dataset; the mean fold accuracy of each pass is
# recorded in `all_accuracy`.
# NOTE(review): the figure printed as 'Max Accuracy in test' is computed on
# `data_input`/`data_expected`, i.e. the full dataset, which includes the rows
# the chosen fold estimator was fitted on — it is not a held-out score.
all_accuracy = []
for n_folds in range(20, 100):
    print('')
    print('*-----------* KFOLD', n_folds, '*-----------*')
    cv = KFold(n_splits=n_folds, shuffle=False)
    cv_scores = cross_validate(bnb, X=data_input, y=data_expected, cv=cv, scoring='accuracy', return_estimator=True)
    fold_scores = cv_scores['test_score']
    # (score, index) tuples compare lexicographically, so a tie on the score is
    # resolved in favour of the later fold.
    best_score, best_fold = max((score, fold) for fold, score in enumerate(fold_scores))
    mean_score = np.mean(fold_scores)
    print('Max Accuracy in train', '--->', best_score)
    print('Mean Accuracy in train', '--->', mean_score)
    all_accuracy.append(mean_score)
    bnb = cv_scores['estimator'][best_fold]
    print('Max Accuracy in test', '--->', accuracy_score(data_expected, bnb.predict(data_input)))
    print('')

In [None]:
# Mean cross-validation accuracy against the number of folds (20 to 99), one
# line per sweep: training split only versus the full dataset.
fold_counts = list(range(20, 100))
curves = (
    (all_accuracy_split, "cross-validation + hold out"),
    (all_accuracy, "cross-validation k > 1"),
)
for mean_scores, curve_label in curves:
    plt.plot(fold_counts, mean_scores, label=curve_label)
plt.legend()
plt.show()