# Minecraft ReadMe Modeling

- [Modeling Begins Here](#Modeling-Begins-Here)
- [Module Functions](#Modules)

In [62]:
# import personal modules
import prepare as prep
#import acquire as ac


#import datascience libraries
import pandas as pd
import numpy as np


# import vizualization libraries
import matplotlib.pyplot as plt


# Sklearn modules including classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier  # Gradient Boosting Classifier
from sklearn.ensemble import HistGradientBoostingClassifier # Sklearn version of LGBM Classifier
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier


# Sklearn testing, evaluating, and managing model
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, f_regression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score


# additional, advanced classifiers
from xgboost import XGBClassifier as xgb  # XG Boost Classifier
from xgboost import DMatrix  # used to transform series to matrix for xg classifier
from lightgbm import LGBMClassifier # Light Gradient Boost Classifier
from catboost import CatBoostClassifier # Cat boost classifier


# import module from standard library
from time import time
from pprint import pprint # pretty print


# NLP related modules / libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk #Natural Language Tool Kit
import re   #Regular Expressions

np.random.seed(7)

## Modules

In [46]:
def get_xy():
    df = pd.read_csv('clean_scraped_data.csv', index_col=[0])
    df = prep.map_other_languages(df)
    x = df['lemmatized']
    y = df['language']

    cv = CountVectorizer()
    x_vectorized = cv.fit_transform(x)

    
    
    return x_vectorized, y

In [55]:
def get_split_data():
    df = pd.read_csv('clean_scraped_data.csv', index_col=[0])
    df = prep.map_other_languages(df)
    x = df['lemmatized']
    y = df['language']

    cv = CountVectorizer()
    #x_vectorized = cv.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 7)
    
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)
    
    
    return x_train, y_train, x_test, y_test

In [88]:
#########################################################################
           ############       Random Forest       ##############     
  ######  Creates N number of trees using random starting values  ######
########################################################################

def random_forest_model(x, y):
    
    rf_classifier = RandomForestClassifier(
        min_samples_leaf=10,
        n_estimators=200,
        max_depth=5, 
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        max_features='auto'
    )

    rf_classifier.fit(x, y)

    y_preds = rf_classifier.predict(x)
    
    return y_preds


#############################################################################
    ############       Gradient Boosting Classifier       ##############     
######  Creates a random forest where each tree learns from the last  ######
############################################################################

def gradient_booster_model(x_train, y_train, x_test = 0, y_test = 0, test = False):

    gradient_booster = GradientBoostingClassifier(
                            learning_rate=0.1,
                            max_depth = 5,
                            n_estimators=200)
    if test == False:
    
        gradient_booster.fit(x_train, y_train)
        y_preds = gradient_booster.predict(x_train)
        
        return y_preds

    if test == True:
        gradient_booster.fit(x_train, y_train)
        y_preds = gradient_booster.predict(x_test)

        return y_preds

#################################################################
############         XG Boosting Classifier       ##############     
    #######       Uses XG Boosting Algorthm       #######
#################################################################

def xgboost_model(x_train, y_train, x_test = 0, y_test = 0, test = False):
    
    xgb_params = {'max_depth'       : 3,
                  'eta'             : 0.01,
                  'silent'          : 0,
                  'eval_metric'     : 'auc',
                  'subsample'       : 0.8,
                  'colsample_bytree': 0.8,
                  'objective'       : 'binary:logistic'}

    xgboost = xgb(params = xgb_params,
                 num_boost_round = 2000,
                 verbose_eval = 50,
                 early_stopping_rounds = 500,
                 #feval = f1_score_cust,
                 #evals = evals,
                 maximize = True)

    if test == False:
        xgboost.fit(x_train, y_train)
        y_preds = xgboost.predict(x_train)

        return y_preds

    if test == True:
        xgboost.fit(x_train, y_train)
        y_preds = xgboost.predict(x_test)

    return y_preds
    

#################################################################
#########         LightGMB Boosting Classifier       ###########     
#######       Uses Light Gradient Boosting Algorthm       #######
#################################################################

def lgmboost_model(x, y):
    
    lgmboost = LGBMClassifier(
                learning_rate=0.1,
                max_depth = 5,
                n_estimators=200)

    lgmboost.fit(x, y)
    
    y_preds = lgmboost.predict(x)
    
    return y_preds


#################################################################
#########       HistGradientBoosting Classifier      ###########     
#######    Inspired by Light Gradient Boosting Algorthm    ######
#################################################################

def histgradientboost_model(x_train, y_train, x_test = 0, y_test = 0, test = False):
    
    HGboost = HistGradientBoostingClassifier(
                                            learning_rate=0.1,
                                            max_depth = 5)
   
    HGboost.fit(x_train, y_train)
    
    if test == False:
        y_preds = HGBoost.predict(x_train)
        
        return y_preds
        
    if test == True:
        y_preds = HGBoost.predict(x_test)
    
        return y_preds


##########################################################
#########         Cat Boost Classifier       ###########     
#######       Cat Boost Gradient Boosting Algorthm       ##
##########################################################

def catboost_model(x, y):
    
    catboost_params = {'loss_function' : 'Logloss',
                        'eval_metric' : 'AUC',
                        'verbose' : 200}
                      
    catboost = CatBoostClassifier()

    catboost.fit(x, y, use_best_model = True, plot = True)
    
    y_preds = lgmboost.predict(x)
    
    return y_preds

####################################################################
#########         Multinomial Naive Bayes Classifier     ###########     
#######     Uses Naive Bayes as Classification Algorithm     #######
####################################################################

def nb_model(x_train, y_train, x_test = 0, y_test = 0, test = False):
    
    naive_bayes = MultinomialNB()
    
    if test == False:
        naive_bayes.fit(x_train, y_train)
        y_preds = naive_bayes.predict(x_train)

        return y_preds
    
    if test == True:
        naive_bayes.fit(x_train, y_train)
        y_preds = naive_bayes.predict(x_test)

        return y_preds

## Modeling Begins Here

In [63]:
%%time
x_train, y_train, x_test, y_test = get_split_data()

x,y = get_xy()

CPU times: user 544 ms, sys: 44.6 ms, total: 589 ms
Wall time: 594 ms


<div class = 'alert alert-block alert-info'>

## Testing Bayes Model



## How does classifier perform using train / test split? 

In [64]:
%%time
NB_y_preds_train = nb_model(x_train, y_train)
report = classification_report(y_train, NB_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       0.95      1.00      0.97       276
  JavaScript       0.97      0.59      0.73        61
       Other       0.89      0.95      0.92       230
      Python       0.95      0.82      0.88        51

    accuracy                           0.92       618
   macro avg       0.94      0.84      0.88       618
weighted avg       0.93      0.92      0.92       618

CPU times: user 19.2 ms, sys: 2.92 ms, total: 22.2 ms
Wall time: 20.4 ms


In [65]:
%%time
NB_y_preds_test = nb_model(x_train, y_train, x_test, y_test, test=True)
report = classification_report(y_test, NB_y_preds_test)
print(report)

              precision    recall  f1-score   support

        Java       0.80      0.83      0.81       126
  JavaScript       0.60      0.12      0.20        25
       Other       0.54      0.76      0.63        90
      Python       1.00      0.12      0.21        25

    accuracy                           0.67       266
   macro avg       0.73      0.46      0.46       266
weighted avg       0.71      0.67      0.64       266

CPU times: user 11.8 ms, sys: 1.73 ms, total: 13.6 ms
Wall time: 12 ms


## How does classifier perform using kfold cross validation?

In [66]:
%%time
x,y = get_xy()

model = MultinomialNB()


testing_df = pd.DataFrame(columns = ['speed', 'accuracy'])

start = time()
cv = RepeatedStratifiedKFold(n_splits = 4, n_repeats = 2, random_state = 7)
score = cross_val_score(model, x, y, scoring='accuracy', cv = cv, n_jobs=-1)

speed = np.round(time() - start, 3)
accuracy = np.mean(score).round(3)

testing_df.loc[0] = (speed, accuracy)
                          
print(f"Mean Accuracy: {testing_df['accuracy']} \nStd: {np.std(score): .3f} \nRun time: {testing_df['speed']}")

Mean Accuracy: 0    0.646
Name: accuracy, dtype: float64 
Std:  0.027 
Run time: 0    4.033
Name: speed, dtype: float64
CPU times: user 382 ms, sys: 152 ms, total: 534 ms
Wall time: 4.36 s


<div class = 'alert alert-block alert-info'>
    
## testing sklearn's gradient booster

In [67]:
%%time
x_train, y_train, x_test, y_test = get_split_data()
x,y = get_xy()

CPU times: user 739 ms, sys: 68.6 ms, total: 807 ms
Wall time: 843 ms


In [68]:
%%time
gb_y_preds_train = gradient_booster_model(x_train, y_train)
report = classification_report(y_train, gb_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       1.00      0.99      1.00       276
  JavaScript       1.00      1.00      1.00        61
       Other       0.99      1.00      1.00       230
      Python       1.00      1.00      1.00        51

    accuracy                           1.00       618
   macro avg       1.00      1.00      1.00       618
weighted avg       1.00      1.00      1.00       618

CPU times: user 20.7 s, sys: 124 ms, total: 20.8 s
Wall time: 21 s


In [81]:
%%time
gb_y_preds_test = gradient_booster_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, gb_y_preds_test)
print(report)

              precision    recall  f1-score   support

        Java       0.84      0.83      0.83       126
  JavaScript       0.78      0.56      0.65        25
       Other       0.64      0.72      0.68        90
      Python       0.78      0.72      0.75        25

    accuracy                           0.76       266
   macro avg       0.76      0.71      0.73       266
weighted avg       0.76      0.76      0.76       266

CPU times: user 20.4 s, sys: 68.5 ms, total: 20.4 s
Wall time: 20.6 s


<div class = 'alert alert-block alert-info'>
    
## testing hist gradient booster (sklearn's version of Light GMB classifier)

In [93]:
%%time
x_train, y_train, x_test, y_test = get_split_data()
x,y = get_xy()
x_train.toarray()

CPU times: user 529 ms, sys: 61.5 ms, total: 591 ms
Wall time: 592 ms


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [94]:
%%time
hgb_y_preds_train = histgradientboost_model(x_train, y_train)
report = classification_report(y_train, hgb_y_preds_train)
print(report)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
%%time
hgb_y_preds_test = histgradientboost_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, hgb_y_preds_test)
print(report)

<div class = 'alert alert-block alert-info'>
    
## testing xg gradient booster

In [86]:
df = pd.read_csv('clean_scraped_data.csv', index_col=[0])
df = prep.map_other_languages(df)
df['language'] = df['language'].map({'Python': 3, 'Other': 2, 'Java' : 0, 'JavaScript' : 1})

x = df['lemmatized']
y = df['language']

cv = CountVectorizer()
#x_vectorized = cv.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 7)

x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

In [None]:
%%time                                     
                                     
xgb_preds_train = xgboost_model(x_train, y_train)
report = classification_report(y_train, xgb_preds_train)
print(report)

In [73]:
x,y = get_xy()

xgb_params = {'max_depth'       : 3,
                      'eta'             : 0.01,
                      'silent'          : 0,
                      'eval_metric'     : 'auc',
                      'subsample'       : 0.8,
                      'colsample_bytree': 0.8,
                      'objective'       : 'binary:logistic'}
    
    
#dtrain = xgb.DMatrix(x_train, y_train, feature_names = x_train.columns.values)
#dtest  = xgb.DMatrix(x_test, y_test, feature_names = x_test.columns.values)


xgboost = xgb(params = xgb_params,
                     num_boost_round = 2000,
                     verbose_eval = 50,
                     early_stopping_rounds = 500,
                     #feval = f1_score_cust,
                     #evals = evals,
                     maximize = True)


In [None]:
def histgradientboost_model(x, y):
    
    HGboost = HistGradientBoostingClassifier(
                                            learning_rate=0.1,
                                            max_depth = 5)

    HGboost.fit(x, y)
    
    y_preds = lgmboost.predict(x)
    
    return y_preds

In [None]:
model = [MultinomialNB(), 

In [30]:
%%time
x,y = get_xy()

model = MultinomialNB()


testing_df = pd.DataFrame(columns = ['speed', 'accuracy'])

start = time()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 7)
score = cross_val_score(model, x, y, scoring='accuracy', cv = cv, n_jobs=-1)

speed = np.round(time() - start, 3)
accuracy = np.mean(score).round(3)

testing_df.loc[0] = (speed, accuracy)
                          
print(f"Mean Accuracy: {testing_df['accuracy']} \nStd: {np.std(score): .3f} \nRun time: {testing_df['speed']}")

ValueError: For evaluating multiple scores, use sklearn.model_selection.cross_validate instead. ['accuracy', 'f1'] was passed.

In [17]:
np.mean(score)

0.6532742681047765

In [24]:
testing_df = pd.DataFrame(columns = ['speed', 'accuracy'])

In [29]:
testing_df.head()

Unnamed: 0,speed,accuracy
0,0.747,0.653
