# Minecraft ReadMe Modeling

- [Modeling Begins Here](#Modeling-Begins-Here)
- [Module Functions](#Modules)

In [4]:
# import personal modules
import prepare as prep
#import acquire as ac


# import modules from libraries
#from prepare import basic_clean, lemmatize
from pprint import pprint

#import datascience libraries
import pandas as pd
import numpy as np

# import vizualization libraries
import matplotlib.pyplot as plt

# Sklearn modules including classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier  # Gradient Boosting Classifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier


# Sklearn testing, evaluating, and managing model
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, f_regression
from sklearn.multioutput import MultiOutputClassifier as MOC 
from sklearn.pipeline import Pipeline as pipeline 

# more classifiers
from xgboost import XGBClassifier  # XG Boost Classifier
from lightgbm import LGBMClassifier # Light Gradient Boost Classifier


import nltk #Natural Language Tool Kit
import re   #Regular Expressions

# NLP related modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Modules

In [79]:
#########################################################################
           ############       Random Forest       ##############     
  ######  Creates N number of trees using random starting values  ######
########################################################################

def random_forest_model(x, y):
    
    rf_classifier = RandomForestClassifier(
        min_samples_leaf=10,
        n_estimators=200,
        max_depth=5, 
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        max_features='auto'
    )

    rf_classifier.fit(x, y)

    y_preds = rf_classifier.predict(x)
    
    return y_preds


#############################################################################
    ############       Gradient Boosting Classifier       ##############     
######  Creates a random forest where each tree learns from the last  ######
############################################################################

def gradient_booster_model(x_train, y_train, x_test = 0, y_test = 0, test = False):

    gradient_booster = GradientBoostingClassifier(
                            learning_rate=0.1,
                            max_depth = 5,
                            n_estimators=200)
    if test == False:
    
        gradient_booster.fit(x_train, y_train)
        y_preds = gradient_booster.predict(x_train)
        
        return y_preds

    if test == True:
        gradient_booster.fit(x_train, y_train)
        y_preds = gradient_booster.predict(x_test)

        return y_preds

#################################################################
############         XG Boosting Classifier       ##############     
    #######       Uses XG Boosting Algorthm       #######
#################################################################

def xgboost_model(x, y):
    
    xgboost = XGBClassifier(
                        base_score=None,
                        booster=None,
                        n_estimators=200,
                        learning_rate=0.1,
                        max_depth = 5
                        )

    xgboost.fit(x, y)
    
    y_preds = xgboost.predict(x)
    
    return y_preds


#################################################################
#########         LightGMB Boosting Classifier       ###########     
#######       Uses Light Gradient Boosting Algorthm       #######
#################################################################

def lgmboost_model(x, y):
    
    lgmboost = LGBMClassifier(
                learning_rate=0.1,
                max_depth = 5,
                n_estimators=200)

    lgmboost.fit(x, y)
    
    y_preds = lgmboost.predict(x)
    
    return y_preds


####################################################################
#########         Multinomial Naive Bayes Classifier     ###########     
#######     Uses Naive Bayes as Classification Algorithm     #######
####################################################################

def nb_model(x_train, y_train, x_test = 0, y_test = 0, test = False):
    
    naive_bayes = MultinomialNB()
    
    if test == False:
        naive_bayes.fit(x_train, y_train)
        y_preds = naive_bayes.predict(x_train)

        return y_preds
    
    if test == True:
        naive_bayes.fit(x_train, y_train)
        y_preds = naive_bayes.predict(x_test)

        return y_preds

## Modeling Begins Here

In [53]:
df = pd.read_csv('clean_scraped_data.csv', index_col=[0])
df = prep.map_other_languages(df)
#f = df[(df['language'] == 'Python') | (df['language']== 'Java') | (df['language'] == 'JavaScript')]
df.head()

Unnamed: 0,repo,language,readme_contents,clean,lemmatized
1,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...,minecraft simple minecraftinspired demo writte...,minecraft simple minecraftinspired demo writte...
2,itzg/docker-minecraft-server,Other,[![Docker Pulls](https://img.shields.io/docker...,docker pullshttpsimgshieldsiodockerpullsitzgmi...,docker pullshttpsimgshieldsiodockerpullsitzgmi...
3,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...,minecraft overviewer build status andrew brown...,minecraft overviewer build status andrew brown...
4,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...,minecraftbyexample 1164 purpose minecraftbyexa...,minecraftbyexample 1164 purpose minecraftbyexa...
5,minecraft-dev/MinecraftDev,Other,"<p align=""center""><a href=""https://minecraftde...",p aligncentera hrefhttpsminecraftdevorgimg src...,p aligncentera hrefhttpsminecraftdevorgimg src...


In [51]:
x = df['lemmatized']
y = df['language']

cv = CountVectorizer()
x_vectorized = cv.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, test_size = 0.3, random_state = 7)

In [56]:
df.language.value_counts()

Java          402
Other         320
JavaScript     86
Python         76
Name: language, dtype: int64

<div class = 'alert alert-block alert-info'>

## Testing Bayes Model



- How does classifier perform with only Java, JavaScript, and Python observations? 

In [52]:
%%time
NB_y_preds_train = nb_model(x_train, y_train)
report = classification_report(y_train, NB_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       0.96      1.00      0.98       280
  JavaScript       1.00      0.80      0.89        61
      Python       0.98      0.96      0.97        53

    accuracy                           0.96       394
   macro avg       0.98      0.92      0.95       394
weighted avg       0.97      0.96      0.96       394

CPU times: user 12.8 ms, sys: 1.71 ms, total: 14.5 ms
Wall time: 12.8 ms


- How does classifier perform with every language left intact?

In [39]:
%%time
NB_y_preds_train = nb_model(x_train, y_train)
report = classification_report(y_train, NB_y_preds_train)
print(report)

                  precision    recall  f1-score   support

        Assembly       0.00      0.00      0.00         1
       Batchfile       1.00      0.33      0.50         3
               C       1.00      0.10      0.18        10
              C#       0.93      0.45      0.61        31
             C++       0.91      0.72      0.81        29
           CMake       0.00      0.00      0.00         1
             CSS       1.00      0.33      0.50         3
         Clojure       0.00      0.00      0.00         1
    CoffeeScript       0.00      0.00      0.00         2
      Dockerfile       0.00      0.00      0.00         4
          Elixir       0.00      0.00      0.00         1
        GDScript       0.00      0.00      0.00         1
            GLSL       0.00      0.00      0.00         4
              Go       1.00      0.31      0.48        16
             HCL       0.00      0.00      0.00         3
            HTML       0.00      0.00      0.00         3
         Hask

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- How does model perform when all but our top three languages are classified as 'other'?

In [25]:
%%time
NB_y_preds_train = nb_model(x_train, y_train)
report = classification_report(y_train, NB_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       0.93      1.00      0.96       276
  JavaScript       0.97      0.51      0.67        61
       Other       0.86      0.95      0.90       230
      Python       0.97      0.71      0.82        51

    accuracy                           0.91       618
   macro avg       0.93      0.79      0.84       618
weighted avg       0.91      0.91      0.90       618

CPU times: user 18.6 ms, sys: 1.62 ms, total: 20.2 ms
Wall time: 18.7 ms


In [81]:
%%time
NB_y_preds_test = nb_model(x_train, y_train, x_test, y_test, test=True)
report = classification_report(y_test, NB_y_preds_test)
print(report)

              precision    recall  f1-score   support

        Java       0.83      0.93      0.88       122
  JavaScript       0.62      0.20      0.30        25
      Python       0.54      0.61      0.57        23

    accuracy                           0.78       170
   macro avg       0.66      0.58      0.58       170
weighted avg       0.76      0.78      0.75       170

CPU times: user 9.51 ms, sys: 1.43 ms, total: 10.9 ms
Wall time: 9.29 ms


<div class = 'alert alert-block alert-info'>
    
## testing gradient booster

- using our earlier agreed upon idea that we should classify all languages but our top three as 'other'

In [54]:
%%time
gb_y_preds_train = gradient_booster_model(x_train, y_train)
report = classification_report(y_train, gb_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       1.00      1.00      1.00       280
  JavaScript       1.00      1.00      1.00        61
      Python       1.00      1.00      1.00        53

    accuracy                           1.00       394
   macro avg       1.00      1.00      1.00       394
weighted avg       1.00      1.00      1.00       394

CPU times: user 12.9 s, sys: 64.1 ms, total: 12.9 s
Wall time: 13 s


In [76]:
%%time
gb_train_preds = gradient_booster_model(x_train, y_train)
report = classification_report(y_train, gb_train_preds)
print(report)

              precision    recall  f1-score   support

        Java       1.00      1.00      1.00       280
  JavaScript       1.00      1.00      1.00        61
      Python       1.00      1.00      1.00        53

    accuracy                           1.00       394
   macro avg       1.00      1.00      1.00       394
weighted avg       1.00      1.00      1.00       394

CPU times: user 12.4 s, sys: 24 ms, total: 12.4 s
Wall time: 12.4 s


In [75]:
%%time
gb_test_preds = gradient_booster_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, gb_test_preds)
print(report)

              precision    recall  f1-score   support

        Java       0.88      0.98      0.93       122
  JavaScript       0.87      0.52      0.65        25
      Python       0.75      0.65      0.70        23

    accuracy                           0.86       170
   macro avg       0.83      0.72      0.76       170
weighted avg       0.86      0.86      0.85       170

CPU times: user 13.5 s, sys: 87.9 ms, total: 13.6 s
Wall time: 13.7 s
