# Lab 4

## Prep

In [1]:
# required packages
# from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import (
classification_report, precision_recall_curve, auc, make_scorer,
recall_score, accuracy_score, precision_score, confusion_matrix, f1_score
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import tree

from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression


In [2]:
import pandas as pd
import numpy as np

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
## set read/write paths for easy access
fp_path = 'data/'

### Grid Search Func

In [5]:
# require package GridSearchCV, StratifiedKFold
# required make_scorer, precision_score, recall_score,
# require accuracy_score, classification_report

def grid_search(X_train, X_test, y_train, y_test, param_grid, classifier,
                refit_score='accuracy_score', n_splits=5, random_state=None):
    """
    Fits a GridSearchCV classifier using refit_score for optimization.
    Prints classifier performance metrics.

    Parameters:
    - X_train: Training feature dataset.
    - X_test: Testing feature dataset.
    - y_train: Training labels.
    - y_test: Testing labels.
    - classifier: The scikit-learn classifier
    - param_grid: The parameter grid to search over for the classifier.
    - refit_score: The score used to refit the best model.
    - n_splits: Number of splits for cross-validation.
    - random_state: Random state for reproducibility.
    """
    if not param_grid:
        raise ValueError(
            "param_grid is empty. Please provide parameter grid to search.")

    scorers = {
        'precision_score': make_scorer(precision_score),
        'recall_score': make_scorer(recall_score),
        'accuracy_score': make_scorer(accuracy_score)
    }

    skf = StratifiedKFold(n_splits=n_splits,
                          random_state=random_state,
                          shuffle=True)

    grid_search = GridSearchCV(classifier, param_grid, scoring=scorers,
                               refit=refit_score,
                               cv=skf, return_train_score=True,
                               n_jobs=-1, # this allows parallel with CPU
                               verbose=1)

    grid_search.fit(X_train, y_train)
    print('Best score: %0.3f' % grid_search.best_score_)

    best_parameters = grid_search.best_estimator_.get_params()
    print('Best parameters set:')

    for param_name in sorted(param_grid.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    predictions = grid_search.predict(X_test)
    print(classification_report(y_test, predictions))

    return grid_search

### SVM Trainer

In [6]:
# Define parameter grid
svm_param_grid = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01],
              'kernel': ['rbf', 'poly', 'sigmoid']}

In [7]:
def run_svm_with_grid_search(X_train, X_test, y_train, y_test, param_grid, random_state):
  ## step one: grid search
  gs = grid_search(X_train, X_test, y_train, y_test, param_grid, SVC(), random_state=random_state)

  ## step two: extract best parameters
  best_params = gs.best_estimator_.get_params()
  print("\n Best parameter set found: ", best_params)
  for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_params[param_name]))

  ## step three: train
  clf = SVC(C = best_params['C'], gamma = best_params['gamma'], kernel = best_params['kernel'], random_state = random_state)
  clf.fit(X_train, y_train)

  ## step four: predict
  y_predict = clf.predict(X_test)
  accuracy = clf.score(X_test, y_test)
  print("accuracy: ", accuracy)

  return clf, y_predict

### Grad Boost Trainer

In [8]:
gbc_param_grid = {
    'learning_rate' : [0.01, 0.1, 1],
    'n_estimators' : [100, 200, 500],
    'max_features' : ['sqrt', 'log2']
    }



In [9]:
# Explore Decision Tree baseline
crossvalidation = StratifiedKFold(n_splits=10,
                                  shuffle = True,
                                  random_state = 4561)

In [10]:
## first determine optimal max_depth for the forest we'll be boosting
def train_depth(X_train, y_train, random_state):
  best_depth = None
  best_score = None

  for depth in range (1, 15):

    tree_classifier = tree.DecisionTreeClassifier(max_depth = depth,
                                                  random_state = random_state)
    if tree_classifier.fit(X_train, y_train).tree_.max_depth< depth:
      break
    score = np.mean(cross_val_score(tree_classifier, X_train, y_train,
                                    scoring = 'accuracy',
                                    cv = crossvalidation,
                                    n_jobs = -1))
    ## keep best score and depth
    if best_score:
      if score > best_score:
        best_depth = depth
        best_score = score
    else:
      best_depth = depth
      best_score = score

    # print(depth, score)

  print("\nBest Depth and Score:\n", best_depth, best_score)
  return best_depth, best_score



In [11]:
def train_leafsize(X_train, y_train, depth, random_state):
  ## find optimal leafsize
  best_leafsize = None
  best_score = None

  for leafsize in range (20, 40, 2):
    tree_classifier = tree.DecisionTreeClassifier(min_samples_leaf = leafsize,
                                                  max_depth = depth,
                                                  random_state = random_state)
    score = np.mean(cross_val_score(tree_classifier, X_train, y_train,
                                    scoring = 'accuracy',
                                    cv = crossvalidation,
                                    n_jobs = -1))
    ## keep best leafsize and depth
    if best_score:
      if score > best_score:
        best_leafsize = leafsize
        best_score = score
    else:
      best_leafsize = leafsize
      best_score = score

    # print(leafsize, score)

  print("\nBest Leafsize and Score:\n", best_leafsize, best_score)
  return best_leafsize, best_score


In [12]:
def run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, depth, leafsize, param_grid, random_state):

  ## step one: define decision tree base for grad boost
  DTC = tree.DecisionTreeClassifier(min_samples_leaf=leafsize,
  max_depth=depth,
  random_state=random_state)

  grad = GradientBoostingClassifier(init=DTC)

  ## step two: grid search
  gs = grid_search(X_train, X_test, y_train, y_test, param_grid, grad, random_state=random_state)

  ## step three: extract best parameters
  best_params = gs.best_estimator_.get_params()
  print("\n Best parameter set found: ", best_params)
  for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_params[param_name]))

  ## step four: train
  gbc = GradientBoostingClassifier(init = DTC,
                             loss='exponential',
                             n_estimators = best_params['n_estimators'],
                             max_features = best_params['max_features'],
                             learning_rate = best_params['learning_rate'],
                             random_state = random_state)
  gbc.fit(X_train, y_train)

  ## step five: predict
  y_predict = gbc.predict(X_test)
  accuracy = gbc.score(X_test, y_test)
  print("accuracy: ", accuracy)

  return gbc, y_predict

### Logistic Regression Trainer

In [13]:
# Define parameter grid
lr_param_grid = {"C":[1e-4, 1e-3, 1e-2, 0.1, 1, 10],
              "penalty":["l1","l2"]}

In [14]:
def run_lr_with_grid_search(X_train, X_test, y_train, y_test, param_grid, random_state):
  ## step one: grid search
  gs = grid_search(X_train, X_test, y_train, y_test, param_grid, LogisticRegression(solver='liblinear', max_iter=500), random_state=random_state)
    
  ## step two: extract best parameters
  best_params = gs.best_estimator_.get_params()
  print("\n Best parameter set found: ", best_params)
  for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_params[param_name]))

  ## step three: train
  clf = LogisticRegression(C = best_params['C'], penalty = best_params['penalty'], solver='liblinear', max_iter=500, random_state = random_state)
  clf.fit(X_train, y_train)

  ## step four: predict
  y_predict = clf.predict(X_test)
  accuracy = clf.score(X_test, y_test)
  print("accuracy: ", accuracy)

  return clf, y_predict

### Results Table

In [15]:
model_table = {}

In [16]:
## turn results dictionary into displayable table
def results_table(table_dict):
  results_df = pd.DataFrame(table_dict).T[['acc','recall', 'precision']] # transpose and reorder the datafram
  return results_df

## Section 1: tfidf

### Prep the Data

In [17]:
tfidf_df = pd.read_csv(f"{fp_path}/tfidf_game.csv")
tfidf_df.head()

Unnamed: 0,Appname,review_cleaned,ability,able,absolute,absolutely,access,achievement,act,action,...,world,worth,would,write,wrong,yeah,year,yes,yet,zombie
0,7 Days to Die,game bad animation sound possible like prototy...,0.002579,0.008978,0.0,0.015272,0.017722,0.014897,0.003287,0.008415,...,0.022535,0.029694,0.029455,0.008309,0.005223,0.0,0.077912,0.004544,0.004453,0.519725
1,A Total War Saga: TROY,good like mom one best total war title bad myt...,0.008647,0.007525,0.001633,0.007315,0.007922,0.003996,0.006613,0.003762,...,0.01511,0.01991,0.042322,0.007428,0.014009,0.001811,0.008707,0.007617,0.005971,0.0
2,ARC Raiders,addictive stressful time waster steam comment ...,0.005507,0.026354,0.007797,0.009316,0.0,0.0,0.007018,0.002994,...,0.019245,0.018113,0.044918,0.005913,0.002788,0.005765,0.050825,0.007275,0.00713,0.0
3,ARK: Survival Ascended,love quite easily play ase hour several map pl...,0.0,0.01437,0.019485,0.010477,0.009455,0.0,0.0,0.0,...,0.028856,0.027159,0.067353,0.0,0.00836,0.004322,0.013856,0.0,0.017818,0.009453
4,Abiotic Factor,one survival game understand fun enemy get har...,0.004972,0.002163,0.014081,0.018928,0.014236,0.002872,0.0,0.0,...,0.060823,0.018401,0.05273,0.008009,0.0,0.002603,0.020862,0.01314,0.015022,0.0


In [18]:
y_df = pd.read_csv(f"{fp_path}/y_variable.csv")
y_df["Appname"] = y_df["App Title"]
y_df = y_df[['Appname', 'long_game']]
y_df.head()
print(y_df.shape)

(247, 2)


In [19]:
tfidf_df = pd.merge(y_df, tfidf_df, on="Appname")
tfidf_df.shape

(247, 503)

In [20]:
(tfidf_df.isnull().sum()/len(tfidf_df)*100).sort_values()
## no NULL values, we can keep all the variables if we want

Appname      0.0
played       0.0
play         0.0
place        0.0
pick         0.0
            ... 
fantastic    0.0
fan          0.0
faction      0.0
find         0.0
zombie       0.0
Length: 503, dtype: float64

### Train/Test Split

In [21]:
tfidf_df.columns[0:10]

Index(['Appname', 'long_game', 'review_cleaned', 'ability', 'able', 'absolute',
       'absolutely', 'access', 'achievement', 'act'],
      dtype='object')

In [22]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

X = tfidf_df.drop(columns=['Appname', 'long_game', 'review_cleaned']) # select all features
y = tfidf_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 197
The shape of training/test feature set: (197, 500)
The length of testing: (50, 500)


### SVM

In [23]:
tfidf_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with tfidf'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.776
Best parameters set:
	C: 10
	gamma: 1
	kernel: 'sigmoid'
              precision    recall  f1-score   support

           0       0.72      0.84      0.78        25
           1       0.81      0.68      0.74        25

    accuracy                           0.76        50
   macro avg       0.77      0.76      0.76        50
weighted avg       0.77      0.76      0.76        50


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 1
	kernel: 'sigmoid'
accuracy:  0.76


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095}}

In [24]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,21,4
Actual positive,8,17


### Logistic Regression

In [25]:
tfidf_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with tfidf'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.812
Best parameters set:
	C: 10
	penalty: 'l2'
              precision    recall  f1-score   support

           0       0.78      0.84      0.81        25
           1       0.83      0.76      0.79        25

    accuracy                           0.80        50
   macro avg       0.80      0.80      0.80        50
weighted avg       0.80      0.80      0.80        50


 Best parameter set found:  {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 10
	penalty: 'l2'
accuracy:  0.8


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391}}

In [26]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,21,4
Actual positive,6,19


### Gradient Boost

#### Determine Min Leaf Size and Depth
Keep using same values for rest of lab

In [27]:
best_depth, best_score = train_depth(X_train, y_train, 4561)


Best Depth and Score:
 1 0.7044736842105264


In [28]:
best_leafsize, best_score = train_leafsize(X_train, y_train, best_depth, 4561)


Best Leafsize and Score:
 20 0.7044736842105264


In [29]:
tfidf_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with tfidf'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.787
Best parameters set:
	learning_rate: 1
	max_features: 'sqrt'
	n_estimators: 500
              precision    recall  f1-score   support

           0       0.81      0.84      0.82        25
           1       0.83      0.80      0.82        25

    accuracy                           0.82        50
   macro avg       0.82      0.82      0.82        50
weighted avg       0.82      0.82      0.82        50


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_leaf

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571}}

In [30]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,22,3
Actual positive,7,18


### Display Results

In [31]:
results_df = results_table(model_table)
results_df

Unnamed: 0,acc,recall,precision
SVM with tfidf,0.76,0.68,0.809524
LR with tfidf,0.8,0.76,0.826087
GBC with tfidf,0.8,0.72,0.857143


## Section 2: Glove Vectors

### Prep the Data

In [32]:
glove_df = pd.read_csv(f"{fp_path}/glove.csv")
# assert dv_df.shape[0] == 1075
glove_df.head()


Unnamed: 0,recommendationid,Appname,review_cleaned,GLWE1,GLWE2,GLWE3,GLWE4,GLWE5,GLWE6,GLWE7,...,GLWE91,GLWE92,GLWE93,GLWE94,GLWE95,GLWE96,GLWE97,GLWE98,GLWE99,GLWE100
0,212664845,ARC Raiders,addictive stressful time waster,0.003068,-0.495246,0.278798,-0.299187,0.370652,0.436277,0.461215,...,-0.033881,0.60776,0.147187,0.026437,-0.038797,0.221966,-0.063209,0.162469,-0.132538,-0.62062
1,212664705,ARC Raiders,steam comment section like every comment secti...,0.081705,0.075504,-0.026725,0.040429,0.044176,0.201207,0.229808,...,-0.216757,0.209657,-0.028605,-0.062276,0.069856,0.042705,-0.035662,-0.080768,0.024089,0.016808
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,-0.05273,0.095629,0.084744,-0.01263,-0.121921,0.384559,0.178765,...,-0.24461,0.192405,0.018402,-0.119586,0.005132,0.136383,0.020331,-0.123327,0.126544,-0.059477
3,212664560,ARC Raiders,well make game every time hop experience somet...,0.214481,0.091914,0.124746,-0.123716,-0.018185,0.440613,0.456368,...,-0.097712,0.352841,-0.035521,-0.250624,-0.049312,-0.129814,0.115082,0.023404,-0.025352,-0.241096
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...,-0.110131,0.164658,0.092555,-0.114117,-0.185766,0.188016,0.214819,...,-0.009167,0.074861,-0.070553,-0.004708,-0.205371,-0.006124,-0.075116,0.109899,0.2821,0.00584


In [33]:
## set up aggregate functions dictionary
aggregate_functions = {
}

for column in glove_df.columns[3:]:
  aggregate_functions[column] = 'mean' # For all the embedding columns, aggregate them by averaging

len(aggregate_functions)


100

In [34]:
## perform the aggregation
dv_df = (glove_df
          .groupby(['Appname'])
          .agg(
              aggregate_functions
              )
)

In [35]:
y_df = pd.read_csv(f"{fp_path}/y_variable.csv")
y_df["Appname"] = y_df["App Title"]
y_df = y_df[['Appname', 'long_game']]
y_df.head()
print(y_df.shape)

(247, 2)


In [36]:
dv_df = pd.merge(y_df, dv_df, on="Appname")
dv_df.shape

(247, 102)

In [37]:
(dv_df.isnull().sum()/len(dv_df)*100).sort_values()
## no NULL values, we can keep all the variables if we want

Appname    0.0
GLWE72     0.0
GLWE71     0.0
GLWE70     0.0
GLWE69     0.0
          ... 
GLWE28     0.0
GLWE27     0.0
GLWE26     0.0
GLWE36     0.0
GLWE100    0.0
Length: 102, dtype: float64

### Full Feature Set

#### Train/Test Split

In [38]:
dv_df.columns[0:10]

Index(['Appname', 'long_game', 'GLWE1', 'GLWE2', 'GLWE3', 'GLWE4', 'GLWE5',
       'GLWE6', 'GLWE7', 'GLWE8'],
      dtype='object')

In [39]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

X = dv_df.drop(columns=['Appname', 'long_game']) # select all features
y = dv_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 197
The shape of training/test feature set: (197, 100)
The length of testing: (50, 100)


#### SVM

In [40]:
dv_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with Glove Vectors'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.828
Best parameters set:
	C: 10
	gamma: 1
	kernel: 'rbf'
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        25
           1       0.88      0.88      0.88        25

    accuracy                           0.88        50
   macro avg       0.88      0.88      0.88        50
weighted avg       0.88      0.88      0.88        50


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 1
	kernel: 'rbf'
accuracy:  0.88


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88}}

In [41]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,22,3
Actual positive,3,22


#### Logistic Regression

In [42]:
dv_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with glove'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.828
Best parameters set:
	C: 10
	penalty: 'l2'
              precision    recall  f1-score   support

           0       0.86      0.72      0.78        25
           1       0.76      0.88      0.81        25

    accuracy                           0.80        50
   macro avg       0.81      0.80      0.80        50
weighted avg       0.81      0.80      0.80        50


 Best parameter set found:  {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 10
	penalty: 'l2'
accuracy:  0.8


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724}}

In [43]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,18,7
Actual positive,3,22


#### Gradient Boost

In [44]:
dv_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with Glove Vectors'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.843
Best parameters set:
	learning_rate: 0.1
	max_features: 'sqrt'
	n_estimators: 500
              precision    recall  f1-score   support

           0       0.88      0.84      0.86        25
           1       0.85      0.88      0.86        25

    accuracy                           0.86        50
   macro avg       0.86      0.86      0.86        50
weighted avg       0.86      0.86      0.86        50


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_le

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148}}

In [45]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,20,5
Actual positive,3,22


### SVD Decomp

#### Train/Test Split

In [46]:
X = dv_df.drop(columns=['Appname', 'long_game']) # select all features
X.shape
X.head(3)

Unnamed: 0,GLWE1,GLWE2,GLWE3,GLWE4,GLWE5,GLWE6,GLWE7,GLWE8,GLWE9,GLWE10,...,GLWE91,GLWE92,GLWE93,GLWE94,GLWE95,GLWE96,GLWE97,GLWE98,GLWE99,GLWE100
0,0.08176,0.076233,0.036443,-0.058441,0.058973,0.268846,0.244816,-0.060542,0.021521,0.042683,...,-0.192674,0.168326,-0.147149,-0.058265,0.146427,-0.030774,0.046256,0.060194,-0.015944,-0.093142
1,0.062359,0.090727,0.02754,-0.039486,0.010742,0.273171,0.204082,-0.032145,0.029351,0.056899,...,-0.183629,0.137912,-0.11393,-0.103522,0.079905,-0.022441,0.075371,0.054518,0.051332,-0.115636
2,0.047299,0.065563,-0.003837,-0.049936,0.030763,0.226612,0.22021,-0.047597,0.025421,0.065244,...,-0.199207,0.154211,-0.118297,-0.066827,0.099401,-0.01139,0.062823,0.029967,0.020959,-0.080667


In [47]:
var_explained = []
# Uses a range with steps initially (2 to 100 by 5)
# then larger steps (100 to 300 by 20)

n_components_range = np.hstack(
                              [np.arange(2, 100, 5), np.arange(100, 300, 20)]
                                )

n_comp_90 = None

for i in n_components_range:
 if i <= X.shape[1]:
  svd = TruncatedSVD(n_components=i)
  svd.fit(X)
  var_explained.append(svd.explained_variance_ratio_.sum())
  print(f"n_components={i}, Variance Explained={var_explained[-1]}")

  if not n_comp_90 and var_explained[-1] >= 0.9:
    n_comp_90 = i

print("Number of Components to Capture 90% of variance:", n_comp_90)

n_components=2, Variance Explained=0.2674869294265339
n_components=7, Variance Explained=0.6384031633992947
n_components=12, Variance Explained=0.7532871595290116
n_components=17, Variance Explained=0.818388179477363
n_components=22, Variance Explained=0.8607577655635047
n_components=27, Variance Explained=0.8911309088637492
n_components=32, Variance Explained=0.9138970476496503
n_components=37, Variance Explained=0.9314168358431051
n_components=42, Variance Explained=0.945676347470975
n_components=47, Variance Explained=0.9574079801187538
n_components=52, Variance Explained=0.9669256349497439
n_components=57, Variance Explained=0.9743753704165893
n_components=62, Variance Explained=0.9805976242979236
n_components=67, Variance Explained=0.9857329669865273
n_components=72, Variance Explained=0.9898539061656546
n_components=77, Variance Explained=0.9931436340680213
n_components=82, Variance Explained=0.9955708330773376
n_components=87, Variance Explained=0.9974477025814504
n_components=9

In [48]:

## truncate SVD to 90% variance
svd = TruncatedSVD(n_components=n_comp_90)
svd.fit(X)
X_svd = svd.transform(X)
X_svd.shape

(247, 32)

In [49]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

y = dv_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X_svd, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 197
The shape of training/test feature set: (197, 32)
The length of testing: (50, 32)


#### SVM

In [50]:
dv_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with Glove Vectors SVD'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.848
Best parameters set:
	C: 10
	gamma: 1
	kernel: 'rbf'
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        25
           1       0.88      0.88      0.88        25

    accuracy                           0.88        50
   macro avg       0.88      0.88      0.88        50
weighted avg       0.88      0.88      0.88        50


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 1
	kernel: 'rbf'
accuracy:  0.88


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88}}

In [51]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,22,3
Actual positive,3,22


#### Logistic Regression

In [52]:
dv_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with glove SVD'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.848
Best parameters set:
	C: 10
	penalty: 'l2'
              precision    recall  f1-score   support

           0       0.83      0.76      0.79        25
           1       0.78      0.84      0.81        25

    accuracy                           0.80        50
   macro avg       0.80      0.80      0.80        50
weighted avg       0.80      0.80      0.80        50


 Best parameter set found:  {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 10
	penalty: 'l2'
accuracy:  0.8


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778}}

In [53]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,19,6
Actual positive,4,21


#### Gradient Boost

In [54]:
dv_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with Glove Vectors SVD'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.772
Best parameters set:
	learning_rate: 0.1
	max_features: 'sqrt'
	n_estimators: 200
              precision    recall  f1-score   support

           0       0.79      0.76      0.78        25
           1       0.77      0.80      0.78        25

    accuracy                           0.78        50
   macro avg       0.78      0.78      0.78        50
weighted avg       0.78      0.78      0.78        50


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_le

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077}}

In [55]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,20,5
Actual positive,4,21


### Display Results

In [56]:
results_df = results_table(model_table)
results_df

Unnamed: 0,acc,recall,precision
SVM with tfidf,0.76,0.68,0.809524
LR with tfidf,0.8,0.76,0.826087
GBC with tfidf,0.8,0.72,0.857143
SVM with Glove Vectors,0.88,0.88,0.88
LR with glove,0.8,0.88,0.758621
GBC with Glove Vectors,0.84,0.88,0.814815
SVM with Glove Vectors SVD,0.88,0.88,0.88
LR with glove SVD,0.8,0.84,0.777778
GBC with Glove Vectors SVD,0.82,0.84,0.807692


## Section 3: longformer Vectors

### Prep the Data

In [57]:
longformer_df = pd.read_csv(f"{fp_path}/longformer.csv")
# assert dv_df.shape[0] == 1075
longformer_df.head()


Unnamed: 0,recommendationid,Appname,review_text,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,212664845,ARC Raiders,Addictive. Stressful. Time waster.,0.160998,-0.314331,0.12897,-0.002612,0.363178,-0.22401,-0.496514,...,-0.242919,0.057376,0.166702,0.129182,-0.262234,0.04228,-0.29784,-0.106902,0.149733,0.019978
1,212664705,ARC Raiders,If the Steam comments section is like every ot...,0.171792,-0.321767,0.087026,-0.013351,0.341068,-0.23547,-0.495652,...,-0.243866,0.088027,0.174401,0.12542,-0.211613,0.038621,-0.270587,-0.055174,0.198233,0.028517
2,212664692,ARC Raiders,I like the gathering and sneaking around the A...,0.172855,-0.294004,0.098145,-0.008861,0.330265,-0.219347,-0.50227,...,-0.233194,0.068008,0.172816,0.134451,-0.217051,0.045791,-0.247482,-0.063542,0.169679,0.000969
3,212664560,ARC Raiders,"Very well made game, every time I hop on I exp...",0.192783,-0.339317,0.094364,0.009529,0.344692,-0.200341,-0.490265,...,-0.261185,0.022664,0.213612,0.160738,-0.22349,0.02688,-0.2638,-0.075613,0.200625,0.042513
4,212664471,ARC Raiders,I thought this would be too sweaty for me. Hon...,0.166418,-0.311148,0.098709,-0.004635,0.343317,-0.196987,-0.491329,...,-0.209657,0.029193,0.16678,0.151393,-0.2231,0.029565,-0.26766,-0.06099,0.182768,0.028096


In [58]:
## set up aggregate functions dictionary
aggregate_functions = {
}

for column in longformer_df.columns[3:]:
  aggregate_functions[column] = 'mean' # For all the embedding columns, aggregate them by averaging

len(aggregate_functions)


768

In [59]:
## perform the aggregation
dv_df = (longformer_df
          .groupby(['Appname'])
          .agg(
              aggregate_functions
              )
)

In [60]:
y_df = pd.read_csv(f"{fp_path}/y_variable.csv")
y_df["Appname"] = y_df["App Title"]
y_df = y_df[['Appname', 'long_game']]
y_df.head()
print(y_df.shape)

(247, 2)


In [61]:
dv_df = pd.merge(y_df, dv_df, on="Appname")
dv_df.shape

(247, 770)

In [62]:
(dv_df.isnull().sum()/len(dv_df)*100).sort_values()
## no NULL values, we can keep all the variables if we want

Appname    0.0
505        0.0
506        0.0
507        0.0
508        0.0
          ... 
258        0.0
259        0.0
260        0.0
190        0.0
767        0.0
Length: 770, dtype: float64

### Full Feature Set

#### Train/Test Split

In [63]:
dv_df.columns[0:10]

Index(['Appname', 'long_game', '0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')

In [64]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

X = dv_df.drop(columns=['Appname', 'long_game']) # select all features
y = dv_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 197
The shape of training/test feature set: (197, 768)
The length of testing: (50, 768)


#### SVM

In [65]:
dv_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with longformer Vectors'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.797
Best parameters set:
	C: 10
	gamma: 0.1
	kernel: 'poly'
              precision    recall  f1-score   support

           0       0.81      0.84      0.82        25
           1       0.83      0.80      0.82        25

    accuracy                           0.82        50
   macro avg       0.82      0.82      0.82        50
weighted avg       0.82      0.82      0.82        50


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 0.1
	kernel: 'poly'
accuracy:  0.82


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334}}

In [66]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,21,4
Actual positive,5,20


#### Logistic Regression

In [67]:
dv_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with longformer'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.777
Best parameters set:
	C: 10
	penalty: 'l2'
              precision    recall  f1-score   support

           0       0.81      0.84      0.82        25
           1       0.83      0.80      0.82        25

    accuracy                           0.82        50
   macro avg       0.82      0.82      0.82        50
weighted avg       0.82      0.82      0.82        50


 Best parameter set found:  {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 10
	penalty: 'l2'
accuracy:  0.82


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334}}

In [68]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,21,4
Actual positive,5,20


#### Gradient Boost

In [69]:
dv_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with longformer Vectors'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.787
Best parameters set:
	learning_rate: 1
	max_features: 'sqrt'
	n_estimators: 500
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        25
           1       0.80      0.80      0.80        25

    accuracy                           0.80        50
   macro avg       0.80      0.80      0.80        50
weighted avg       0.80      0.80      0.80        50


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_leaf

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [70]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,20,5
Actual positive,4,21


### SVD Decomp

#### Train/Test Split

In [71]:
X = dv_df.drop(columns=['Appname', 'long_game']) # select all features
X.shape
X.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.173997,-0.296805,0.082794,0.006975,0.334605,-0.201644,-0.491259,-0.414791,-0.118877,-0.257893,...,-0.224852,0.035691,0.189252,0.135999,-0.222344,0.033356,-0.268679,-0.083682,0.17918,0.022458
1,0.169874,-0.299146,0.086429,0.007647,0.340567,-0.207801,-0.491052,-0.419101,-0.124809,-0.255853,...,-0.237307,0.043545,0.1853,0.139098,-0.222746,0.026723,-0.260892,-0.078126,0.179297,0.021012
2,0.171363,-0.295498,0.087645,0.009147,0.337713,-0.205972,-0.49159,-0.41391,-0.119851,-0.257497,...,-0.222498,0.038607,0.191247,0.137627,-0.224637,0.032388,-0.27082,-0.080635,0.179872,0.026028


In [72]:
var_explained = []
# Uses a range with steps initially (2 to 100 by 5)
# then larger steps (100 to 300 by 20)

n_components_range = np.hstack(
                              [np.arange(2, 100, 5), np.arange(100, 300, 20)]
                                )

n_comp_90 = None

for i in n_components_range:
 if i <= X.shape[1]:
  svd = TruncatedSVD(n_components=i)
  svd.fit(X)
  var_explained.append(svd.explained_variance_ratio_.sum())
  print(f"n_components={i}, Variance Explained={var_explained[-1]}")

  if not n_comp_90 and var_explained[-1] >= 0.9:
    n_comp_90 = i

print("Number of Components to Capture 90% of variance:", n_comp_90)

n_components=2, Variance Explained=0.43760807629633974
n_components=7, Variance Explained=0.6992766762379352
n_components=12, Variance Explained=0.791409582386992
n_components=17, Variance Explained=0.844080222331769
n_components=22, Variance Explained=0.8779309085943774
n_components=27, Variance Explained=0.9007931372018603
n_components=32, Variance Explained=0.9177662627671067
n_components=37, Variance Explained=0.9306668108291788
n_components=42, Variance Explained=0.9410828103249248
n_components=47, Variance Explained=0.9495272533793724
n_components=52, Variance Explained=0.9563927035868537
n_components=57, Variance Explained=0.9621695753529595
n_components=62, Variance Explained=0.9667037236191723
n_components=67, Variance Explained=0.9706839073622121
n_components=72, Variance Explained=0.9740982476012526
n_components=77, Variance Explained=0.9770795149941186
n_components=82, Variance Explained=0.979686985474712
n_components=87, Variance Explained=0.9820129156443478
n_components=9

In [73]:

## truncate SVD to 90% variance
svd = TruncatedSVD(n_components=n_comp_90)
svd.fit(X)
X_svd = svd.transform(X)
X_svd.shape

(247, 27)

In [74]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

y = dv_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X_svd, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 197
The shape of training/test feature set: (197, 27)
The length of testing: (50, 27)


#### SVM

In [75]:
dv_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with longformer Vectors SVD'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.802
Best parameters set:
	C: 10
	gamma: 0.1
	kernel: 'poly'
              precision    recall  f1-score   support

           0       0.81      0.84      0.82        25
           1       0.83      0.80      0.82        25

    accuracy                           0.82        50
   macro avg       0.82      0.82      0.82        50
weighted avg       0.82      0.82      0.82        50


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 0.1
	kernel: 'poly'
accuracy:  0.82


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [76]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,21,4
Actual positive,5,20


#### Logistic Regression

In [77]:
dv_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with longformer SVD'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.797
Best parameters set:
	C: 10
	penalty: 'l1'
              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       0.81      0.84      0.82        25

    accuracy                           0.82        50
   macro avg       0.82      0.82      0.82        50
weighted avg       0.82      0.82      0.82        50


 Best parameter set found:  {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 10
	penalty: 'l1'
accuracy:  0.82


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [78]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,20,5
Actual positive,4,21


#### Gradient Boost

In [79]:
dv_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with longformer Vectors SVD'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.822
Best parameters set:
	learning_rate: 1
	max_features: 'sqrt'
	n_estimators: 100
              precision    recall  f1-score   support

           0       0.77      0.80      0.78        25
           1       0.79      0.76      0.78        25

    accuracy                           0.78        50
   macro avg       0.78      0.78      0.78        50
weighted avg       0.78      0.78      0.78        50


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_leaf

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [80]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,20,5
Actual positive,6,19


### Display Results

In [81]:
results_df = results_table(model_table)
results_df

Unnamed: 0,acc,recall,precision
SVM with tfidf,0.76,0.68,0.809524
LR with tfidf,0.8,0.76,0.826087
GBC with tfidf,0.8,0.72,0.857143
SVM with Glove Vectors,0.88,0.88,0.88
LR with glove,0.8,0.88,0.758621
GBC with Glove Vectors,0.84,0.88,0.814815
SVM with Glove Vectors SVD,0.88,0.88,0.88
LR with glove SVD,0.8,0.84,0.777778
GBC with Glove Vectors SVD,0.82,0.84,0.807692
SVM with longformer Vectors,0.82,0.8,0.833333


## Section 4: SA

### Prep the Data

In [82]:
# red the sentiment analysis
sa_df = pd.read_csv(f"{fp_path}/review_sa.csv")
sa_df.head()

Unnamed: 0,recommendationid,Appname,polarity,subjectivity,NLTK_Compound
0,212664845,ARC Raiders,0.0,0.9,-0.5106
1,212664705,ARC Raiders,0.030422,0.493232,-0.8888
2,212664692,ARC Raiders,-0.275,0.25,-0.2498
3,212664560,ARC Raiders,-0.021212,0.384848,0.3384
4,212664471,ARC Raiders,0.252083,0.510417,0.4404


In [83]:
## set up aggregate functions dictionary
aggregate_functions = {
}

for column in sa_df.columns[2:]:
  aggregate_functions[column] = 'mean' # For all the embedding columns, aggregate them by averaging

len(aggregate_functions)


3

In [84]:
## perform the aggregation
sa_df = (sa_df
          .groupby(['Appname'])
          .agg(
              aggregate_functions
              )
)

In [85]:
y_df = pd.read_csv(f"{fp_path}/y_variable.csv")
y_df["Appname"] = y_df["App Title"]
y_df = y_df[['Appname', 'long_game']]
y_df.head()
print(y_df.shape)

(247, 2)


In [86]:
sa_df = pd.merge(y_df, sa_df, on="Appname")
sa_df.shape

(247, 5)

In [87]:
(sa_df.isnull().sum()/len(sa_df)*100).sort_values()
## no NULL values, we can keep all the variables if we want

Appname          0.0
long_game        0.0
polarity         0.0
subjectivity     0.0
NLTK_Compound    0.0
dtype: float64

### Train/Test Split

In [88]:
sa_df.columns[0:10]

Index(['Appname', 'long_game', 'polarity', 'subjectivity', 'NLTK_Compound'], dtype='object')

In [89]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

X = sa_df.drop(columns=["Appname", "long_game"])
y = sa_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 197
The shape of training/test feature set: (197, 3)
The length of testing: (50, 3)


### SVM

In [90]:
ner_sa_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with SA'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.624
Best parameters set:
	C: 10
	gamma: 1
	kernel: 'sigmoid'
              precision    recall  f1-score   support

           0       0.58      0.84      0.69        25
           1       0.71      0.40      0.51        25

    accuracy                           0.62        50
   macro avg       0.65      0.62      0.60        50
weighted avg       0.65      0.62      0.60        50


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 1
	kernel: 'sigmoid'
accuracy:  0.62


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [91]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,21,4
Actual positive,15,10


### Logistic Regression

In [92]:
sa_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with SA'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.599
Best parameters set:
	C: 10
	penalty: 'l1'
              precision    recall  f1-score   support

           0       0.62      0.60      0.61        25
           1       0.62      0.64      0.63        25

    accuracy                           0.62        50
   macro avg       0.62      0.62      0.62        50
weighted avg       0.62      0.62      0.62        50


 Best parameter set found:  {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 10
	penalty: 'l1'
accuracy:  0.62


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [93]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,15,10
Actual positive,9,16


### Gradient Boost

In [94]:
ner_sa_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with SA'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.594
Best parameters set:
	learning_rate: 0.01
	max_features: 'sqrt'
	n_estimators: 200
              precision    recall  f1-score   support

           0       0.58      0.60      0.59        25
           1       0.58      0.56      0.57        25

    accuracy                           0.58        50
   macro avg       0.58      0.58      0.58        50
weighted avg       0.58      0.58      0.58        50


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_l

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [95]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,16,9
Actual positive,10,15


### Display Results

In [96]:
results_df = results_table(model_table)
results_df

Unnamed: 0,acc,recall,precision
SVM with tfidf,0.76,0.68,0.809524
LR with tfidf,0.8,0.76,0.826087
GBC with tfidf,0.8,0.72,0.857143
SVM with Glove Vectors,0.88,0.88,0.88
LR with glove,0.8,0.88,0.758621
GBC with Glove Vectors,0.84,0.88,0.814815
SVM with Glove Vectors SVD,0.88,0.88,0.88
LR with glove SVD,0.8,0.84,0.777778
GBC with Glove Vectors SVD,0.82,0.84,0.807692
SVM with longformer Vectors,0.82,0.8,0.833333


## Section 5: knowledge based

### Prep the Data

In [97]:
meta_df = pd.read_csv(f"{fp_path}/app_metadata.csv")
meta_df["Appname"] = meta_df["name"]
meta_df = meta_df.drop(columns=['Unnamed: 0', 'type', 'steam_appid', 'name'])
# assert dv_df.shape[0] == 1075
meta_df.head()
# print(meta_df.columns)

Unnamed: 0,required_age,recommendation_count,has_dlc,Casual,Adventure,Sports,Photo Editing,Design & Illustration,Indie,RPG,...,Narrated Game Menus,PvP,Cross-Platform Multiplayer,Steam Trading Cards,Steam Cloud,Tracked Controller Support,VR Support,HDR available,Multi-player,Appname
0,0,266478.0,True,0,1,0,0,0,1,1,...,0,1,1,1,0,0,0,0,1,7 Days to Die
1,0,2987.0,True,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,A Total War Saga: TROY
2,0,163028.0,True,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,ARC Raiders
3,0,80878.0,True,0,1,0,0,0,1,1,...,0,1,1,0,0,0,0,0,1,ARK: Survival Ascended
4,0,35021.0,True,0,1,0,0,0,0,1,...,0,0,1,1,0,0,0,0,1,Abiotic Factor


In [98]:
y_df = pd.read_csv(f"{fp_path}/y_variable.csv")
y_df["Appname"] = y_df["App Title"]
y_df = y_df[['Appname', 'long_game']]
y_df.head()
print(y_df.shape)

(247, 2)


In [99]:
game_meta_df = pd.merge(y_df, meta_df, on="Appname")
game_meta_df.shape

(242, 76)

In [100]:
(game_meta_df.isnull().sum()/len(game_meta_df)*100).sort_values()

## in this case, we drop recommendation count because it as NULL values

game_meta_df = game_meta_df.drop(columns=['recommendation_count'])

### Full Feature Set

#### Train/Test Split

In [101]:
game_meta_df.columns[0:10]

Index(['Appname', 'long_game', 'required_age', 'has_dlc', 'Casual',
       'Adventure', 'Sports', 'Photo Editing', 'Design & Illustration',
       'Indie'],
      dtype='object')

In [102]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

X = game_meta_df.drop(columns=['Appname', 'long_game']) # select all features
y = game_meta_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 193
The shape of training/test feature set: (193, 73)
The length of testing: (49, 73)


#### SVM

In [103]:
dv_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with knowledge based'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.741
Best parameters set:
	C: 1
	gamma: 0.1
	kernel: 'sigmoid'
              precision    recall  f1-score   support

           0       0.85      0.71      0.77        24
           1       0.76      0.88      0.81        25

    accuracy                           0.80        49
   macro avg       0.80      0.79      0.79        49
weighted avg       0.80      0.80      0.79        49


 Best parameter set found:  {'C': 1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 1
	gamma: 0.1
	kernel: 'sigmoid'
accuracy:  0.7959183673469388


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [104]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,17,7
Actual positive,3,22


#### Logistic Regression

In [105]:
kb_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with knowledge based'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.725
Best parameters set:
	C: 0.1
	penalty: 'l2'
              precision    recall  f1-score   support

           0       0.82      0.58      0.68        24
           1       0.69      0.88      0.77        25

    accuracy                           0.73        49
   macro avg       0.76      0.73      0.73        49
weighted avg       0.75      0.73      0.73        49


 Best parameter set found:  {'C': 0.1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 0.1
	penalty: 'l2'
accuracy:  0.7346938775510204


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [106]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,14,10
Actual positive,3,22


#### Gradient Boost

In [107]:
dv_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with knowledge based'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.746
Best parameters set:
	learning_rate: 0.1
	max_features: 'sqrt'
	n_estimators: 200
              precision    recall  f1-score   support

           0       0.81      0.54      0.65        24
           1       0.67      0.88      0.76        25

    accuracy                           0.71        49
   macro avg       0.74      0.71      0.70        49
weighted avg       0.74      0.71      0.71        49


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_le

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [108]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,15,9
Actual positive,3,22


### SVD Decomp

#### Train/Test Split

In [109]:
X = game_meta_df.drop(columns=['Appname', 'long_game']) # select all features
X.shape
X.head(3)

Unnamed: 0,required_age,has_dlc,Casual,Adventure,Sports,Photo Editing,Design & Illustration,Indie,RPG,Action,...,LAN Co-op,Narrated Game Menus,PvP,Cross-Platform Multiplayer,Steam Trading Cards,Steam Cloud,Tracked Controller Support,VR Support,HDR available,Multi-player
0,0,True,0,1,0,0,0,1,1,1,...,1,0,1,1,1,0,0,0,0,1
1,0,True,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,1
2,0,True,0,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,0,1


In [110]:
var_explained = []
# Uses a range with steps initially (2 to 100 by 5)
# then larger steps (100 to 300 by 20)

n_components_range = np.hstack(
                              [np.arange(2, 100, 5), np.arange(100, 300, 20)]
                                )

n_comp_90 = None

for i in n_components_range:
 if i <= X.shape[1]:
  svd = TruncatedSVD(n_components=i)
  svd.fit(X)
  var_explained.append(svd.explained_variance_ratio_.sum())
  print(f"n_components={i}, Variance Explained={var_explained[-1]}")

  if not n_comp_90 and var_explained[-1] >= 0.9:
    n_comp_90 = i

print("Number of Components to Capture 90% of variance:", n_comp_90)

n_components=2, Variance Explained=0.8424146510618953
n_components=7, Variance Explained=0.9094250744102201
n_components=12, Variance Explained=0.9367273362921031
n_components=17, Variance Explained=0.9539397548210539
n_components=22, Variance Explained=0.965954985456995
n_components=27, Variance Explained=0.9749357867226913
n_components=32, Variance Explained=0.9820460217399563
n_components=37, Variance Explained=0.9875189084697243
n_components=42, Variance Explained=0.9917862466580433
n_components=47, Variance Explained=0.9949215263996122
n_components=52, Variance Explained=0.9972261073526073
n_components=57, Variance Explained=0.9986479843904452
n_components=62, Variance Explained=0.9995196813697165
n_components=67, Variance Explained=0.9999169321226856
n_components=72, Variance Explained=1.0000000000000002
Number of Components to Capture 90% of variance: 7


In [111]:

## truncate SVD to 90% variance
svd = TruncatedSVD(n_components=n_comp_90)
svd.fit(X)
X_svd = svd.transform(X)
X_svd.shape

(242, 7)

In [112]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

y = game_meta_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X_svd, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 193
The shape of training/test feature set: (193, 7)
The length of testing: (49, 7)


#### SVM

In [113]:
dv_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with knowledge based SVD'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.699
Best parameters set:
	C: 0.1
	gamma: 1
	kernel: 'poly'
              precision    recall  f1-score   support

           0       0.61      0.58      0.60        24
           1       0.62      0.64      0.63        25

    accuracy                           0.61        49
   macro avg       0.61      0.61      0.61        49
weighted avg       0.61      0.61      0.61        49


 Best parameter set found:  {'C': 0.1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 0.1
	gamma: 1
	kernel: 'poly'
accuracy:  0.6122448979591837


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [114]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,14,10
Actual positive,9,16


#### Logistic Regression

In [115]:
kb_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with knowledge base SVD'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.699
Best parameters set:
	C: 10
	penalty: 'l1'
              precision    recall  f1-score   support

           0       0.71      0.62      0.67        24
           1       0.68      0.76      0.72        25

    accuracy                           0.69        49
   macro avg       0.70      0.69      0.69        49
weighted avg       0.70      0.69      0.69        49


 Best parameter set found:  {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 10
	penalty: 'l1'
accuracy:  0.6938775510204082


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [116]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,15,9
Actual positive,6,19


#### Gradient Boost

In [117]:
dv_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with knowledge based SVD'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.658
Best parameters set:
	learning_rate: 0.1
	max_features: 'sqrt'
	n_estimators: 500
              precision    recall  f1-score   support

           0       0.72      0.54      0.62        24
           1       0.65      0.80      0.71        25

    accuracy                           0.67        49
   macro avg       0.68      0.67      0.67        49
weighted avg       0.68      0.67      0.67        49


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_le

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [118]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,14,10
Actual positive,3,22


### Display Results

In [119]:
results_df = results_table(model_table)
results_df

Unnamed: 0,acc,recall,precision
SVM with tfidf,0.76,0.68,0.809524
LR with tfidf,0.8,0.76,0.826087
GBC with tfidf,0.8,0.72,0.857143
SVM with Glove Vectors,0.88,0.88,0.88
LR with glove,0.8,0.88,0.758621
GBC with Glove Vectors,0.84,0.88,0.814815
SVM with Glove Vectors SVD,0.88,0.88,0.88
LR with glove SVD,0.8,0.84,0.777778
GBC with Glove Vectors SVD,0.82,0.84,0.807692
SVM with longformer Vectors,0.82,0.8,0.833333


## Section 6 hybrid: 

### Prep the Data

In [120]:
longformer_df = pd.read_csv(f"{fp_path}/longformer.csv").drop(columns=['recommendationid', 'review_text'])
# assert dv_df.shape[0] == 1075
longformer_df.head()


Unnamed: 0,Appname,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,ARC Raiders,0.160998,-0.314331,0.12897,-0.002612,0.363178,-0.22401,-0.496514,-0.436305,-0.123515,...,-0.242919,0.057376,0.166702,0.129182,-0.262234,0.04228,-0.29784,-0.106902,0.149733,0.019978
1,ARC Raiders,0.171792,-0.321767,0.087026,-0.013351,0.341068,-0.23547,-0.495652,-0.37892,-0.13272,...,-0.243866,0.088027,0.174401,0.12542,-0.211613,0.038621,-0.270587,-0.055174,0.198233,0.028517
2,ARC Raiders,0.172855,-0.294004,0.098145,-0.008861,0.330265,-0.219347,-0.50227,-0.400064,-0.099364,...,-0.233194,0.068008,0.172816,0.134451,-0.217051,0.045791,-0.247482,-0.063542,0.169679,0.000969
3,ARC Raiders,0.192783,-0.339317,0.094364,0.009529,0.344692,-0.200341,-0.490265,-0.416988,-0.106434,...,-0.261185,0.022664,0.213612,0.160738,-0.22349,0.02688,-0.2638,-0.075613,0.200625,0.042513
4,ARC Raiders,0.166418,-0.311148,0.098709,-0.004635,0.343317,-0.196987,-0.491329,-0.400324,-0.114388,...,-0.209657,0.029193,0.16678,0.151393,-0.2231,0.029565,-0.26766,-0.06099,0.182768,0.028096


In [121]:
## set up aggregate functions dictionary
aggregate_functions = {
}

for column in longformer_df.columns[1:]:
  aggregate_functions[column] = 'mean' # For all the embedding columns, aggregate them by averaging

len(aggregate_functions)


768

In [122]:
## perform the aggregation
dv_df = (longformer_df
          .groupby(['Appname'])
          .agg(
              aggregate_functions
              )
)

In [123]:
# red the sentiment analysis
sa_df = pd.read_csv(f"{fp_path}/review_sa.csv").drop(columns=['recommendationid'])
sa_df.head()

Unnamed: 0,Appname,polarity,subjectivity,NLTK_Compound
0,ARC Raiders,0.0,0.9,-0.5106
1,ARC Raiders,0.030422,0.493232,-0.8888
2,ARC Raiders,-0.275,0.25,-0.2498
3,ARC Raiders,-0.021212,0.384848,0.3384
4,ARC Raiders,0.252083,0.510417,0.4404


In [124]:
## set up aggregate functions dictionary
aggregate_functions = {
}

for column in sa_df.columns[1:]:
  aggregate_functions[column] = 'mean' # For all the embedding columns, aggregate them by averaging

len(aggregate_functions)


3

In [125]:
## perform the aggregation
dsa_df = (sa_df
          .groupby(['Appname'])
          .agg(
              aggregate_functions
              )
)

In [126]:
meta_df = pd.read_csv(f"{fp_path}/app_metadata.csv")
meta_df["Appname"] = meta_df["name"]
meta_df = meta_df.drop(columns=['Unnamed: 0', 'type', 'steam_appid', 'name'])
# assert dv_df.shape[0] == 1075
meta_df.head()
# print(meta_df.columns)

Unnamed: 0,required_age,recommendation_count,has_dlc,Casual,Adventure,Sports,Photo Editing,Design & Illustration,Indie,RPG,...,Narrated Game Menus,PvP,Cross-Platform Multiplayer,Steam Trading Cards,Steam Cloud,Tracked Controller Support,VR Support,HDR available,Multi-player,Appname
0,0,266478.0,True,0,1,0,0,0,1,1,...,0,1,1,1,0,0,0,0,1,7 Days to Die
1,0,2987.0,True,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,A Total War Saga: TROY
2,0,163028.0,True,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,ARC Raiders
3,0,80878.0,True,0,1,0,0,0,1,1,...,0,1,1,0,0,0,0,0,1,ARK: Survival Ascended
4,0,35021.0,True,0,1,0,0,0,0,1,...,0,0,1,1,0,0,0,0,1,Abiotic Factor


In [127]:
dsa_df.shape

(247, 3)

In [128]:
dv_df.shape

(247, 768)

In [129]:
## merge the longformer vectors and the SA dataframe we used in section 3
hybrid_df = pd.merge(left=dsa_df,
right=dv_df,
on='Appname')

print(hybrid_df.shape)

(247, 771)


In [130]:
## merge the longformer vectors and the SA dataframe we used in section 3
hybrid_df = pd.merge(left=hybrid_df,
right=meta_df,
on='Appname')

print(hybrid_df.shape)

(242, 846)


In [131]:
y_df = pd.read_csv(f"{fp_path}/y_variable.csv")
y_df["Appname"] = y_df["App Title"]
y_df = y_df[['Appname', 'long_game']]
y_df.head()
print(y_df.shape)

(247, 2)


In [132]:
hybrid_df = pd.merge(y_df, hybrid_df, on="Appname")
hybrid_df.shape

(242, 847)

In [133]:
(hybrid_df.isnull().sum()/len(hybrid_df)*100).sort_values()


Appname                 0.000000
552                     0.000000
553                     0.000000
554                     0.000000
555                     0.000000
                          ...   
281                     0.000000
282                     0.000000
284                     0.000000
Multi-player            0.000000
recommendation_count    3.305785
Length: 847, dtype: float64

In [134]:
## in this case, we drop recommendation count because it as NULL values

hybrid_df = hybrid_df.drop(columns=['recommendation_count'])

### Train/Test Split

In [135]:
hybrid_df.columns[0:10]

Index(['Appname', 'long_game', 'polarity', 'subjectivity', 'NLTK_Compound',
       '0', '1', '2', '3', '4'],
      dtype='object')

In [136]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

X = hybrid_df.drop(columns=['Appname', 'long_game']) # select all features
y = hybrid_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 193
The shape of training/test feature set: (193, 844)
The length of testing: (49, 844)


### SVM

In [137]:
hybrid_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with Hybrid'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.746
Best parameters set:
	C: 10
	gamma: 0.01
	kernel: 'sigmoid'
              precision    recall  f1-score   support

           0       0.77      0.71      0.74        24
           1       0.74      0.80      0.77        25

    accuracy                           0.76        49
   macro avg       0.76      0.75      0.75        49
weighted avg       0.76      0.76      0.75        49


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.01, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 0.01
	kernel: 'sigmoid'
accuracy:  0.7551020408163265


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [138]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,17,7
Actual positive,5,20


### Logistic Regression

In [139]:
hybrid_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with hybrid'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.767
Best parameters set:
	C: 1
	penalty: 'l1'
              precision    recall  f1-score   support

           0       0.71      0.62      0.67        24
           1       0.68      0.76      0.72        25

    accuracy                           0.69        49
   macro avg       0.70      0.69      0.69        49
weighted avg       0.70      0.69      0.69        49


 Best parameter set found:  {'C': 1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 1
	penalty: 'l1'
accuracy:  0.6938775510204082


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [140]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,15,9
Actual positive,6,19


### Gradient Boost

In [141]:
hybrid_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with Hybrid'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.782
Best parameters set:
	learning_rate: 1
	max_features: 'sqrt'
	n_estimators: 200
              precision    recall  f1-score   support

           0       0.88      0.96      0.92        24
           1       0.96      0.88      0.92        25

    accuracy                           0.92        49
   macro avg       0.92      0.92      0.92        49
weighted avg       0.92      0.92      0.92        49


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_leaf

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [142]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,22,2
Actual positive,2,23


### SVD Decomp

#### Train/Test Split

In [143]:
X = hybrid_df.drop(columns=['Appname', 'long_game']) # select all features
X.shape
X.head(3)

Unnamed: 0,polarity,subjectivity,NLTK_Compound,0,1,2,3,4,5,6,...,LAN Co-op,Narrated Game Menus,PvP,Cross-Platform Multiplayer,Steam Trading Cards,Steam Cloud,Tracked Controller Support,VR Support,HDR available,Multi-player
0,0.112722,0.461096,0.499505,0.173997,-0.296805,0.082794,0.006975,0.334605,-0.201644,-0.491259,...,1,0,1,1,1,0,0,0,0,1
1,0.068671,0.511913,0.233699,0.169874,-0.299146,0.086429,0.007647,0.340567,-0.207801,-0.491052,...,1,0,1,0,0,0,0,0,0,1
2,0.076805,0.465599,0.277117,0.171363,-0.295498,0.087645,0.009147,0.337713,-0.205972,-0.49159,...,0,0,1,1,0,0,0,0,0,1


In [144]:
var_explained = []
# Uses a range with steps initially (2 to 100 by 5)
# then larger steps (100 to 300 by 20)

n_components_range = np.hstack(
                              [np.arange(2, 100, 5), np.arange(100, 300, 20)]
                                )

n_comp_90 = None

for i in n_components_range:
 if i <= X.shape[1]:
  svd = TruncatedSVD(n_components=i)
  svd.fit(X)
  var_explained.append(svd.explained_variance_ratio_.sum())
  print(f"n_components={i}, Variance Explained={var_explained[-1]}")

  if not n_comp_90 and var_explained[-1] >= 0.9:
    n_comp_90 = i

print("Number of Components to Capture 90% of variance:", n_comp_90)

n_components=2, Variance Explained=0.8333526727277462
n_components=7, Variance Explained=0.9055085924345172
n_components=12, Variance Explained=0.9339394096341359
n_components=17, Variance Explained=0.9516958326119724
n_components=22, Variance Explained=0.9643198504612704
n_components=27, Variance Explained=0.9734635590578429
n_components=32, Variance Explained=0.9807110302430944
n_components=37, Variance Explained=0.9863619736969673
n_components=42, Variance Explained=0.9907767083338165
n_components=47, Variance Explained=0.994087469271765
n_components=52, Variance Explained=0.9965114658325948
n_components=57, Variance Explained=0.9980909412987053
n_components=62, Variance Explained=0.9990888584626785
n_components=67, Variance Explained=0.9996564613627732
n_components=72, Variance Explained=0.999880994118109
n_components=77, Variance Explained=0.9999478664134759
n_components=82, Variance Explained=0.9999650917964439
n_components=87, Variance Explained=0.9999738260332139
n_components=9

In [145]:

## truncate SVD to 90% variance
svd = TruncatedSVD(n_components=n_comp_90)
svd.fit(X)
X_svd = svd.transform(X)
X_svd.shape

(242, 7)

In [146]:
from sklearn.model_selection import train_test_split

# Split the df into train and test

y = hybrid_df['long_game'] # Target variable
# Perform the train/test split with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
X_svd, y, test_size=0.2, random_state=42, stratify=y)

print("The length of training set:", len(X_train))
print("The shape of training/test feature set:", X_train.shape)
print("The length of testing:", X_test.shape)

The length of training set: 193
The shape of training/test feature set: (193, 7)
The length of testing: (49, 7)


#### SVM

In [147]:
dv_svm, y_predict = run_svm_with_grid_search(X_train, X_test, y_train, y_test, svm_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['SVM with hybrid SVD'] = score
model_table

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best score: 0.658
Best parameters set:
	C: 10
	gamma: 1
	kernel: 'poly'
              precision    recall  f1-score   support

           0       0.67      0.58      0.62        24
           1       0.64      0.72      0.68        25

    accuracy                           0.65        49
   macro avg       0.65      0.65      0.65        49
weighted avg       0.65      0.65      0.65        49


 Best parameter set found:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	C: 10
	gamma: 1
	kernel: 'poly'
accuracy:  0.6530612244897959


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [148]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,14,10
Actual positive,7,18


#### Logistic Regression

In [149]:
kb_lr, y_predict = run_lr_with_grid_search(X_train, X_test, y_train, y_test, lr_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['LR with hybrid SVD'] = score
model_table

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.617
Best parameters set:
	C: 0.001
	penalty: 'l2'
              precision    recall  f1-score   support

           0       0.75      0.75      0.75        24
           1       0.76      0.76      0.76        25

    accuracy                           0.76        49
   macro avg       0.76      0.76      0.76        49
weighted avg       0.76      0.76      0.76        49


 Best parameter set found:  {'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
	C: 0.001
	penalty: 'l2'
accuracy:  0.7551020408163265


{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [150]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()

Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,18,6
Actual positive,6,19


#### Gradient Boost

In [151]:
dv_gbc, y_predict = run_grad_boost_with_grid_search(X_train, X_test, y_train, y_test, best_depth, best_leafsize, gbc_param_grid, 4561)

score = {"acc": accuracy_score(y_test, y_predict),
         "recall": recall_score(y_test, y_predict),
         "precision": precision_score(y_test, y_predict)}

model_table['GBC with hybrid SVD'] = score
model_table

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.652
Best parameters set:
	learning_rate: 1
	max_features: 'log2'
	n_estimators: 100
              precision    recall  f1-score   support

           0       0.60      0.62      0.61        24
           1       0.62      0.60      0.61        25

    accuracy                           0.61        49
   macro avg       0.61      0.61      0.61        49
weighted avg       0.61      0.61      0.61        49


 Best parameter set found:  {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init__ccp_alpha': 0.0, 'init__class_weight': None, 'init__criterion': 'gini', 'init__max_depth': 1, 'init__max_features': None, 'init__max_leaf_nodes': None, 'init__min_impurity_decrease': 0.0, 'init__min_samples_leaf': 20, 'init__min_samples_split': 2, 'init__min_weight_fraction_leaf': 0.0, 'init__monotonic_cst': None, 'init__random_state': 4561, 'init__splitter': 'best', 'init': DecisionTreeClassifier(max_depth=1, min_samples_leaf

{'SVM with tfidf': {'acc': 0.76,
  'recall': 0.68,
  'precision': 0.8095238095238095},
 'LR with tfidf': {'acc': 0.8,
  'recall': 0.76,
  'precision': 0.8260869565217391},
 'GBC with tfidf': {'acc': 0.8,
  'recall': 0.72,
  'precision': 0.8571428571428571},
 'SVM with Glove Vectors': {'acc': 0.88, 'recall': 0.88, 'precision': 0.88},
 'LR with glove': {'acc': 0.8,
  'recall': 0.88,
  'precision': 0.7586206896551724},
 'GBC with Glove Vectors': {'acc': 0.84,
  'recall': 0.88,
  'precision': 0.8148148148148148},
 'SVM with Glove Vectors SVD': {'acc': 0.88,
  'recall': 0.88,
  'precision': 0.88},
 'LR with glove SVD': {'acc': 0.8,
  'recall': 0.84,
  'precision': 0.7777777777777778},
 'GBC with Glove Vectors SVD': {'acc': 0.82,
  'recall': 0.84,
  'precision': 0.8076923076923077},
 'SVM with longformer Vectors': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'LR with longformer': {'acc': 0.82,
  'recall': 0.8,
  'precision': 0.8333333333333334},
 'GBC with longformer V

In [152]:
pd.DataFrame(confusion_matrix(y_test, y_predict),
             columns=["Predicted Not Good", "Predicted Good"],
             index = ['Actual negative', 'Actual positive']
).head()


Unnamed: 0,Predicted Not Good,Predicted Good
Actual negative,13,11
Actual positive,8,17


### Display Results

In [153]:
results_df = results_table(model_table)
results_df

Unnamed: 0,acc,recall,precision
SVM with tfidf,0.76,0.68,0.809524
LR with tfidf,0.8,0.76,0.826087
GBC with tfidf,0.8,0.72,0.857143
SVM with Glove Vectors,0.88,0.88,0.88
LR with glove,0.8,0.88,0.758621
GBC with Glove Vectors,0.84,0.88,0.814815
SVM with Glove Vectors SVD,0.88,0.88,0.88
LR with glove SVD,0.8,0.84,0.777778
GBC with Glove Vectors SVD,0.82,0.84,0.807692
SVM with longformer Vectors,0.82,0.8,0.833333


## Section 6: Reflection

### Display Results

In [154]:
results_df = results_table(model_table)
results_df.sort_values("acc", ascending=False)

Unnamed: 0,acc,recall,precision
GBC with Hybrid,0.918367,0.92,0.92
SVM with Glove Vectors,0.88,0.88,0.88
SVM with Glove Vectors SVD,0.88,0.88,0.88
GBC with Glove Vectors,0.84,0.88,0.814815
SVM with longformer Vectors SVD,0.82,0.8,0.833333
GBC with Glove Vectors SVD,0.82,0.84,0.807692
SVM with longformer Vectors,0.82,0.8,0.833333
LR with longformer,0.82,0.8,0.833333
GBC with longformer Vectors,0.82,0.84,0.807692
LR with longformer SVD,0.82,0.84,0.807692
