In [17]:
import os  
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# --------cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# -------- classification
import sklearn
from sklearn import neighbors, tree, ensemble, naive_bayes, svm
# *** KNN
from sklearn.neighbors import KNeighborsClassifier
# *** Decision Tree; Random Forest
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# *** Naive Bayes
from sklearn.naive_bayes import GaussianNB
# *** SVM classifier
from sklearn.svm import SVC
# --------  metrics:
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn import preprocessing
from sklearn import linear_model

In [18]:
# Read the cleaned dataset from disk.
df = pd.read_csv('clean_data.csv')

# Working copy that leaves out the first CSV column (presumably a saved row
# index -- confirm); `df` itself stays untouched.
clean_df = df[df.columns[1:]].copy()

In [19]:
# Replace the values of each listed column with integer codes. Every column
# gets its own freshly fitted LabelEncoder; the fitted encoders are not kept.
for column_name in ('Game', 'Genre', 'Console', 'Publisher', 'Developer'):
    clean_df[column_name] = preprocessing.LabelEncoder().fit_transform(clean_df[column_name])
clean_df

Unnamed: 0,Game,Pos,Genre,Console,Publisher,Developer,VGChartz_Score,Critic_Score,User_Score,NA_Sales,PAL_Sales,Japan_Sales,Other_Sales,Total_Sales,Relese_Month,Relese_Year,Hit_Games
0,4124,1,0,36,848,2483,0.0,0.0,0.0,0,0,0,0,51000000,3,2005,1
1,12294,2,0,36,485,2075,0.0,0.0,0.0,0,0,0,0,47820000,6,1997,1
2,2424,3,0,36,165,524,0.0,0.0,0.0,0,0,0,0,25200000,10,2001,1
3,2992,4,0,36,485,2075,0.0,0.0,0.0,0,0,0,0,21150000,6,1997,1
4,3788,5,0,36,510,1582,0.0,0.0,0.0,0,0,0,0,20000000,10,1981,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17505,12229,461,17,31,443,1379,0.0,0.0,0.0,0,0,30000,0,30000,11,2012,0
17506,4963,462,17,32,16,53,0.0,0.0,0.0,0,0,30000,0,30000,2,2014,0
17507,9506,463,17,27,663,2017,0.0,0.0,0.0,0,0,40000,0,40000,8,2014,0
17508,9017,464,17,28,698,1534,0.0,0.0,0.0,0,0,30000,0,30000,11,2016,0


In [20]:
def split_to_train_and_test(dataset, label_column, test_ratio, rand_state):
    """Split a DataFrame into train/test feature frames and label series.

    Args:
        dataset: DataFrame holding the feature columns and the label column.
        label_column: Name of the column used as the target; every other
            column is treated as a feature.
        test_ratio: Forwarded to ``train_test_split`` as ``test_size``.
        rand_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    feature_names = dataset.columns[dataset.columns != label_column]
    features, labels = dataset[feature_names], dataset[label_column]
    return tuple(
        train_test_split(features, labels, test_size=test_ratio, random_state=rand_state)
    )

# Hold out 20% of the rows for testing, with a fixed seed for the split.
test_ratio = 0.2
rand_state = 42
# NOTE(review): every column except "Hit_Games" is used as a feature, including
# Pos and Total_Sales -- confirm the label is not derived from them, otherwise
# the scores reported further down reflect target leakage.
X_train, X_test, y_train, y_test = split_to_train_and_test(
    clean_df, "Hit_Games", test_ratio, rand_state
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


(14008, 16) (3502, 16) (14008,) (3502,)


In [21]:
def find_best_k_for_KNN(X_train, y_train):
    """Grid-search ``n_neighbors`` for a KNN classifier, scored by F1.

    The candidates are 3, 7, 9 and 11. ``GridSearchCV`` is used with its
    default cross-validation settings.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        ``(best_K, best_f1_val)``: the selected ``n_neighbors`` and the
        search's ``best_score_``.
    """
    f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True)
    search = GridSearchCV(
        KNeighborsClassifier(),
        {"n_neighbors": [3, 7, 9, 11]},
        scoring=f1_scorer,
    )
    search.fit(X_train, y_train)
    return search.best_params_['n_neighbors'], search.best_score_
# Tune K on the training split; both values are reused by later cells.
knn_search_result = find_best_k_for_KNN(X_train, y_train)
best_K, best_f1_KNN_params = knn_search_result
knn_search_result

(9, 0.9960831059926767)

In [22]:
def find_best_decision_tree_params(X_train, y_train):
    """Grid-search ``max_depth`` and ``min_samples_split`` for a decision tree.

    Depths 2, 4, 6 are combined with minimum split sizes 5, 10, 20 and each
    combination is scored by F1 using ``GridSearchCV``'s default
    cross-validation settings.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        ``(best_max_depth, best_min_samples_split, best_f1_val)``.
    """
    grid = {'max_depth': [2, 4, 6], 'min_samples_split': [5, 10, 20]}
    f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True)
    search = GridSearchCV(tree.DecisionTreeClassifier(), grid, scoring=f1_scorer)
    search.fit(X_train, y_train)
    winner = search.best_params_
    return winner['max_depth'], winner['min_samples_split'], search.best_score_
# Tune the decision tree on the training split; the values are reused later.
decision_tree_search_result = find_best_decision_tree_params(X_train, y_train)
best_max_dep, best_min_smpl_splt, best_f1_DT_params = decision_tree_search_result
decision_tree_search_result

(2, 5, 1.0)

In [23]:
def find_best_random_forest_num_estimators(X_train, y_train, random_state=42):
    """Grid-search ``n_estimators`` for a random forest, scored by F1.

    The candidates are 11, 51 and 71. ``GridSearchCV`` is used with its
    default cross-validation settings.

    Args:
        X_train: Training features.
        y_train: Training labels.
        random_state: Seed given to ``RandomForestClassifier`` so repeated
            runs select the same forest size. Pass ``None`` for an unseeded
            forest.

    Returns:
        ``(best_num_estimators, best_f1_val)``: the selected ``n_estimators``
        and the search's ``best_score_``.
    """
    search = GridSearchCV(
        # Seeded estimator: without it every re-run grows different trees and
        # the reported best value can change between executions.
        RandomForestClassifier(random_state=random_state),
        {"n_estimators": [11, 51, 71]},
        scoring=make_scorer(metrics.f1_score, greater_is_better=True),
    )
    search.fit(X_train, y_train)
    return search.best_params_['n_estimators'], search.best_score_
# Tune the forest size on the training split; the values are reused later.
random_forest_search_result = find_best_random_forest_num_estimators(X_train, y_train)
best_n_estimators, best_f1_RF_params = random_forest_search_result
random_forest_search_result

(11, 1.0)

In [24]:
def get_classifier_obj(classifier_name, params):
    """Build an unfitted classifier selected by name.

    Args:
        classifier_name: One of ``"KNN"``, ``"decision_tree"``,
            ``"random_forest"``, ``"svm"``, ``"naive_bayes"`` or
            ``"LogisticRegression"`` (case-sensitive).
        params: Optional dict of hyper-parameters. It is only read for
            ``"KNN"`` (key ``n_neighbors``), ``"decision_tree"`` (keys
            ``max_depth`` and ``min_samples_split``) and ``"random_forest"``
            (key ``n_estimators``). A falsy value (``None`` or ``{}``) means
            "use the estimator's defaults".

    Returns:
        A new, unfitted estimator instance.

    Raises:
        ValueError: If ``classifier_name`` is not one of the supported names.
        KeyError: If ``params`` is non-empty but lacks a key required by the
            selected classifier.
    """
    if classifier_name == "KNN":
        if params:
            return KNeighborsClassifier(n_neighbors=params["n_neighbors"])
        return KNeighborsClassifier()
    if classifier_name == "decision_tree":
        if params:
            return DecisionTreeClassifier(
                max_depth=params["max_depth"],
                min_samples_split=params["min_samples_split"],
            )
        return DecisionTreeClassifier()
    if classifier_name == "random_forest":
        if params:
            return RandomForestClassifier(n_estimators=params["n_estimators"])
        return RandomForestClassifier()
    if classifier_name == "svm":
        return svm.SVC()
    if classifier_name == "naive_bayes":
        return GaussianNB()
    if classifier_name == "LogisticRegression":
        return linear_model.LogisticRegression()
    # Previously an unknown name fell through to `return clf` with `clf`
    # unbound; fail with a message that names the accepted values instead.
    raise ValueError(
        f"Unknown classifier_name {classifier_name!r}; expected one of: "
        "KNN, decision_tree, random_forest, svm, naive_bayes, LogisticRegression"
    )

# Hyper-parameters chosen by the grid searches in the cells above.
params_knn = dict(n_neighbors=best_K)
params_random_forest = dict(n_estimators=best_n_estimators)
params_decision_tree = dict(max_depth=best_max_dep, min_samples_split=best_min_smpl_splt)

# Classifiers built with their default settings.
clf_naive_bayes = get_classifier_obj("naive_bayes", None)
clf_svm = get_classifier_obj("svm", None)

# Classifiers built with the tuned hyper-parameters.
clf_knn = get_classifier_obj("KNN", params_knn)
clf_random_forest = get_classifier_obj("random_forest", params_random_forest)
clf_decision_tree = get_classifier_obj("decision_tree", params_decision_tree)

# Logistic regression also uses its default settings.
clf_logistic_regression = get_classifier_obj("LogisticRegression", None)

In [31]:
def calc_evaluation_val(eval_metric, y_test, y_predicted):
    """Compute one evaluation metric for a set of predictions.

    Args:
        eval_metric: One of ``'accuracy'``, ``'precision'``, ``'recall'``,
            ``'f1'`` or ``'confusion_matrix'``.
        y_test: True labels.
        y_predicted: Predicted labels, aligned with ``y_test``.

    Returns:
        The value returned by the matching scikit-learn metric function
        (``accuracy_score``, ``precision_score``, ``recall_score``,
        ``f1_score`` or ``confusion_matrix``).

    Raises:
        ValueError: If ``eval_metric`` is not one of the supported names.
    """
    if eval_metric == 'accuracy':
        return accuracy_score(y_test, y_predicted)
    if eval_metric == 'precision':
        return precision_score(y_test, y_predicted)
    if eval_metric == 'recall':
        return recall_score(y_test, y_predicted)
    if eval_metric == 'f1':
        return f1_score(y_test, y_predicted)
    if eval_metric == 'confusion_matrix':
        return confusion_matrix(y_test, y_predicted)
    # Previously an unknown name reached `return evaluation_val` with the
    # variable unbound; report the accepted names instead.
    raise ValueError(
        f"Unsupported eval_metric {eval_metric!r}; expected one of: "
        "accuracy, precision, recall, f1, confusion_matrix"
    )

# Fit every classifier on the training split.
trained_logistic_regression = clf_logistic_regression.fit(X_train, y_train)
trained_svm = clf_svm.fit(X_train, y_train)
trained_knn = clf_knn.fit(X_train, y_train)
trained_random_forest = clf_random_forest.fit(X_train, y_train)
trained_naive_bayes = clf_naive_bayes.fit(X_train, y_train)
trained_decision_tree = clf_decision_tree.fit(X_train, y_train)

# Predict the held-out test rows with each fitted model.
predicted_logistic_regression = trained_logistic_regression.predict(X_test)
predicted_naive_bayes = trained_naive_bayes.predict(X_test)
predicted_svm = trained_svm.predict(X_test)
predicted_knn = trained_knn.predict(X_test)
predicted_random_forest = trained_random_forest.predict(X_test)
predicted_decision_tree = trained_decision_tree.predict(X_test)

# Report order follows the insertion order of this dict.
data_predicted = {"logistic_regression": predicted_logistic_regression,
                  "naive_bayes": predicted_naive_bayes,
                  "svm": predicted_svm,
                  "knn": predicted_knn,
                  "random_forest": predicted_random_forest,
                  "decision_tree": predicted_decision_tree}
evaluation = ["accuracy", "precision", "recall", "f1"]

# Print every metric for every algorithm. The loop variable is named
# `metric_name` rather than `eval` so the builtin `eval` is not shadowed in
# the notebook's global namespace.
for alg, predictions in data_predicted.items():
    print(f" \n\nthe algorithm is : {alg}")
    for metric_name in evaluation:
        print(metric_name)
        print(calc_evaluation_val(metric_name, y_test, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 

the algorithm is : logistic_regression
accuracy
0.9862935465448315
precision
0.9604612850082372
recall
0.9604612850082372
f1
0.9604612850082372
 

the algorithm is : naive_bayes
accuracy
0.9477441462021702
precision
0.8281733746130031
recall
0.8813838550247117
f1
0.853950518754988
 

the algorithm is : svm
accuracy
0.9945745288406624
precision
0.9932885906040269
recall
0.9752883031301482
f1
0.9842061512884455
 

the algorithm is : knn
accuracy
0.997430039977156
precision
0.9966777408637874
recall
0.9884678747940692
f1
0.9925558312655087
 

the algorithm is : random_forest
accuracy
1.0
precision
1.0
recall
1.0
f1
1.0
 

the algorithm is : decision_tree
accuracy
1.0
precision
1.0
recall
1.0
f1
1.0


In [74]:
def find_best_model(X_train, y_train, max_depth_val, min_samples_split_val):
    """Fit three candidate classifiers and return the one with the best recall.

    The candidates are a ``DecisionTreeClassifier`` built from the given
    ``max_depth_val`` / ``min_samples_split_val``, a ``GaussianNB`` and an
    ``svm.SVC`` with default settings. Each is fitted on the training data and
    scored with ``metrics.recall_score`` on predictions for that same data.

    NOTE(review): recall is measured on the data the models were fitted on, so
    it favours models that memorise the training set; consider scoring on
    held-out or cross-validated data instead.

    Args:
        X_train: Training features.
        y_train: Training labels.
        max_depth_val: ``max_depth`` for the decision tree candidate.
        min_samples_split_val: ``min_samples_split`` for the decision tree
            candidate.

    Returns:
        ``(best_clf, best_recall_val)``: the fitted candidate with the highest
        training recall (the earliest candidate wins ties) and that recall.
    """
    candidates = [
        DecisionTreeClassifier(max_depth=max_depth_val, min_samples_split=min_samples_split_val),
        GaussianNB(),
        svm.SVC(),
    ]
    # Start with no winner so the first candidate is always accepted. Before,
    # `best_clf` stayed unbound whenever no candidate scored above 0 and the
    # final `return` raised UnboundLocalError.
    best_clf = None
    best_recall_val = 0
    for candidate in candidates:
        candidate.fit(X_train, y_train)
        train_recall = metrics.recall_score(y_true=y_train, y_pred=candidate.predict(X_train))
        # Strict comparison keeps the earliest candidate on ties, as before.
        if best_clf is None or train_recall > best_recall_val:
            best_clf, best_recall_val = candidate, train_recall
    return best_clf, best_recall_val

# Pick the best candidate using the tuned decision-tree hyper-parameters.
best_model_result = find_best_model(X_train, y_train, best_max_dep, best_min_smpl_splt)
best_clf, best_recall_val = best_model_result
best_model_result

(DecisionTreeClassifier(max_depth=2, min_samples_split=5), 1.0)