# Minecraft ReadMe Modeling

### Table of Contents
- [Modeling Section Beginning](#Modeling-Begins-Here)
- [Module Functions](#Modules)

In [1]:
# import personal modules
import prepare as prep
#import acquire as ac


#import datascience libraries
import pandas as pd
import numpy as np

# Sklearn modules including classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier  # Gradient Boosting Classifier
from sklearn.ensemble import HistGradientBoostingClassifier # Sklearn version of LGBM Classifier
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier


# Sklearn testing, evaluating, and managing model
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, f_regression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score


# additional, advanced classifiers
from xgboost import XGBClassifier as xgb  # XG Boost Classifier
from xgboost import DMatrix  # used to transform series to matrix for xg classifier
from lightgbm import LGBMClassifier as lgb # Light Gradient Boost Classifier
from catboost import CatBoostClassifier # Cat boost classifier
import lightgbm as lgb

# import modules from standard library
from time import time
from pprint import pprint # pretty print
from importlib import reload
import os


# NLP related modules / libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk #Natural Language Tool Kit
import re   #Regular Expressions

np.random.seed(7)

In [2]:
reload(prep)

<module 'prepare' from '/Users/CryanRedrose/codeup-data-science/CodeUp/MinecraftNLP/prepare.py'>

## Modules

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 884 entries, fogleman/Minecraft to deathcap/node-minecraft-ping
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   language         884 non-null    object
 1   readme_contents  884 non-null    object
 2   clean            884 non-null    object
 3   lemmatized       884 non-null    object
dtypes: object(4)
memory usage: 34.5+ KB


In [3]:
def get_df():
    
    if os.path.isfile('prepared_data.csv'):
        return pd.read_csv('prepared_data.csv', index_col=[0])
    else:
        df = pd.read_csv('clean_scraped_data.csv', index_col=[0])
        df = prep.map_other_languages(df)
        
        df.to_csv('prepared_data.csv', index=False)
        
        return df

In [4]:
def get_xy():
    df = get_df()
        
    x = df['lemmatized']
    y = df['language']

    cv = CountVectorizer()
    x_vectorized = cv.fit_transform(x)

    
    
    return x_vectorized, y

In [5]:
def get_split_data():
    
    df = get_df()
    
    x = df['lemmatized']
    y = df['language']

    cv = CountVectorizer()
    #x_vectorized = cv.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 7)
    
    x_train = cv.fit_transform(x_train)
    x_test = cv.transform(x_test)
    
    
    return x_train, y_train, x_test, y_test

In [21]:
#########################################################################
           ############       Random Forest       ##############     
  ######  Creates N number of trees using random starting values  ######
########################################################################

def random_forest_model(x, y):
    
    rf_classifier = RandomForestClassifier(
        min_samples_leaf=10,
        n_estimators=200,
        max_depth=5, 
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        max_features='auto'
    )

    rf_classifier.fit(x, y)

    y_preds = rf_classifier.predict(x)
    
    return y_preds


#############################################################################
    ############       Gradient Boosting Classifier       ##############     
######  Creates a random forest where each tree learns from the last  ######
############################################################################

def gradient_booster_model(x_train, y_train, x_test = 0, y_test = 0, test = False):

    gradient_booster = GradientBoostingClassifier(
                            learning_rate=0.1,
                            max_depth = 5,
                            n_estimators=200)
    if test == False:
    
        gradient_booster.fit(x_train, y_train)
        y_preds = gradient_booster.predict(x_train)
        
        return y_preds

    if test == True:
        gradient_booster.fit(x_train, y_train)
        y_preds = gradient_booster.predict(x_test)

        return y_preds

#################################################################
############         XG Boosting Classifier       ##############     
    #######       Uses XG Boosting Algorthm       #######
#################################################################

def xgboost_model(x_train, y_train, x_test = 0, y_test = 0, test = False):

    xgb_params = {'max_depth'       : 3,
                  'eta'             : 0.01,
                  'silent'          : 0,
                  'eval_metric'     : 'auc',
                  'subsample'       : 0.8,
                  'colsample_bytree': 0.8,
                  'objective'       : 'binary:logistic'}

    
    xgboost = xgb(params = xgb_params,
                 num_boost_round = 2000,
                 verbose_eval = 50,
                 #early_stopping_rounds = 500,
                 #feval = f1_score_cust,
                 #evals = evals,
                 maximize = True)
    xgboost.fit(x_train, y_train)
    
    
    if test == False:
        y_preds = xgboost.predict(x_train)

        return y_preds

    if test == True:
        y_preds = xgboost.predict(x_test)

        return y_preds
    

#################################################################
#########         LightGMB Boosting Classifier       ###########     
#######       Uses Light Gradient Boosting Algorthm       #######
#################################################################

def lgmboost_model(x, y):
    
    lgmboost = LGBMClassifier(
                learning_rate=0.1,
                max_depth = 5,
                n_estimators=200)

    lgmboost.fit(x, y)
    
    y_preds = lgmboost.predict(x)
    
    return y_preds


#################################################################
#########       HistGradientBoosting Classifier      ###########     
#######    Inspired by Light Gradient Boosting Algorthm    ######
#################################################################

def histgradientboost_model(x_train, y_train, x_test = 0, y_test = 0, test = False):
    
    HGboost = HistGradientBoostingClassifier(
                                            learning_rate=0.1,
                                            max_depth = 5,
                                            n_estimators=200)
   
    HGboost.fit(x_train, y_train)
    
    if test == False:
        y_preds = HGBoost.predict(x_train)
        
        return y_preds
        
    if test == True:
        y_preds = HGBoost.predict(x_test)
    
        return y_preds


##########################################################
#########          Cat Boost Classifier        ###########     
#######      Cat Boost Gradient Boosting Algorthm       ##
##########################################################

def catboost_model(x_train, y_train, x_test = 0, test = False):
    
    catboost_params = {'loss_function' : 'Logloss',
                        'eval_metric' : 'AUC',
                        'verbose' : 200}
                      
    catboost = CatBoostClassifier(params = catboost_params)

    catboost.fit(x_train, y_train, use_best_model = True)#, plot = True)
    
    if test == False:
        y_preds = catboost.predict(x_train)        
        return y_preds

    if test == True:
        y_preds = catboost.predict(x_test)
        return y_preds

####################################################################
#########         Multinomial Naive Bayes Classifier     ###########     
#######     Uses Naive Bayes as Classification Algorithm     #######
####################################################################

def nb_model(x_train, y_train, x_test = 0, y_test = 0, test = False):
    
    naive_bayes = MultinomialNB()
    
    if test == False:
        naive_bayes.fit(x_train, y_train)
        y_preds = naive_bayes.predict(x_train)

        return y_preds
    
    if test == True:
        naive_bayes.fit(x_train, y_train)
        y_preds = naive_bayes.predict(x_test)

        return y_preds

## Modeling Begins Here

[Back to top](#Table-of-Contents)

<div class = 'alert alert-block alert-info'>

## Testing Bayes Model



In [24]:
%%time
x_train, y_train, x_test, y_test = get_split_data()

x,y = get_xy()

CPU times: user 517 ms, sys: 44.9 ms, total: 562 ms
Wall time: 565 ms


## How does classifier perform using train / test split? 

In [25]:
%%time
NB_y_preds_train = nb_model(x_train, y_train)
report = classification_report(y_train, NB_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       0.95      1.00      0.97       276
  JavaScript       0.97      0.59      0.73        61
       Other       0.89      0.95      0.92       230
      Python       0.95      0.82      0.88        51

    accuracy                           0.92       618
   macro avg       0.94      0.84      0.88       618
weighted avg       0.93      0.92      0.92       618

CPU times: user 18.9 ms, sys: 2.44 ms, total: 21.3 ms
Wall time: 20 ms


In [26]:
%%time
NB_y_preds_test = nb_model(x_train, y_train, x_test, y_test, test=True)
report = classification_report(y_test, NB_y_preds_test)
print(report)

              precision    recall  f1-score   support

        Java       0.80      0.83      0.81       126
  JavaScript       0.60      0.12      0.20        25
       Other       0.54      0.76      0.63        90
      Python       1.00      0.12      0.21        25

    accuracy                           0.67       266
   macro avg       0.73      0.46      0.46       266
weighted avg       0.71      0.67      0.64       266

CPU times: user 12.8 ms, sys: 1.72 ms, total: 14.5 ms
Wall time: 12.8 ms


## How does classifier perform using kfold cross validation?

In [27]:
%%time
x,y = get_xy()

model = MultinomialNB()


testing_df = pd.DataFrame(columns = ['speed', 'accuracy'])

start = time()
cv = RepeatedStratifiedKFold(n_splits = 4, n_repeats = 2, random_state = 7)
score = cross_val_score(model, x, y, scoring='accuracy', cv = cv, n_jobs=-1)

speed = np.round(time() - start, 3)
accuracy = np.mean(score).round(3)

testing_df.loc[0] = (speed, accuracy)
                          
print(f"Mean Accuracy: {testing_df['accuracy']} \nStd: {np.std(score): .3f} \nRun time: {testing_df['speed']}")

Mean Accuracy: 0    0.646
Name: accuracy, dtype: float64 
Std:  0.027 
Run time: 0    3.713
Name: speed, dtype: float64
CPU times: user 407 ms, sys: 213 ms, total: 621 ms
Wall time: 4.06 s


<div class = 'alert alert-block alert-info'>
    
## testing sklearn's gradient booster
    
[Back to top](#Table-of-Contents)

### Cross Validation

In [15]:
%%time
x,y = get_xy()

model = GradientBoostingClassifier(
                            learning_rate=0.1,
                            max_depth = 5,
                            n_estimators=200)


testing_df = pd.DataFrame(columns = ['speed', 'accuracy'])

start = time()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 7)
score = cross_val_score(model, x, y, scoring='accuracy', cv = cv, n_jobs=-1)

speed = np.round(time() - start, 3)
accuracy = np.mean(score).round(3)

testing_df.loc[0] = (speed, accuracy)
                          
print(f"SKLearn's Gradient Booster \nMean Accuracy: {testing_df['accuracy']} \nStd: {np.std(score): .3f} \nRun time: {testing_df['speed']}")

SKLearn's Gradient Booster 
Mean Accuracy: 0    0.738
Name: accuracy, dtype: float64 
Std:  0.028 
Run time: 0    92.518
Name: speed, dtype: float64
CPU times: user 340 ms, sys: 39.9 ms, total: 380 ms
Wall time: 1min 32s


In [105]:
%%time
'''
prep work based on cross validation testing
'''
x_train, y_train, x_test, y_test = get_split_data()
x,y = get_xy()

CPU times: user 514 ms, sys: 38.7 ms, total: 552 ms
Wall time: 555 ms


In [102]:
df = pd.read_csv('clean_scraped_data.csv', index_col=[0])
#df = prep.map_other_languages(df)
#df['language'] = df['language'].map({'Python': 3, 'Other': 2, 'Java' : 0, 'JavaScript' : 1})

x = df['lemmatized']
y = df['language']

cv = CountVectorizer()
#x_vectorized = cv.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 7)

x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

In [106]:
%%time
gb_y_preds_train = gradient_booster_model(x_train, y_train)
report = classification_report(y_train, gb_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       1.00      0.99      1.00       276
  JavaScript       1.00      1.00      1.00        61
       Other       0.99      1.00      1.00       230
      Python       1.00      1.00      1.00        51

    accuracy                           1.00       618
   macro avg       1.00      1.00      1.00       618
weighted avg       1.00      1.00      1.00       618

CPU times: user 20.2 s, sys: 70.8 ms, total: 20.3 s
Wall time: 20.4 s


In [107]:
%%time
gb_y_preds_test = gradient_booster_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, gb_y_preds_test)
print(report)

              precision    recall  f1-score   support

        Java       0.84      0.82      0.83       126
  JavaScript       0.78      0.56      0.65        25
       Other       0.63      0.72      0.67        90
      Python       0.74      0.68      0.71        25

    accuracy                           0.75       266
   macro avg       0.75      0.69      0.72       266
weighted avg       0.76      0.75      0.75       266

CPU times: user 20 s, sys: 46.5 ms, total: 20.1 s
Wall time: 20.1 s


## Tried gradient_booster_model without using 'other' option

In [103]:
%%time
gb_y_preds_train = gradient_booster_model(x_train, y_train)
report = classification_report(y_train, gb_y_preds_train)
print(report)

                  precision    recall  f1-score   support

        Assembly       1.00      1.00      1.00         1
       Batchfile       1.00      1.00      1.00         3
               C       1.00      1.00      1.00        10
              C#       1.00      1.00      1.00        31
             C++       1.00      1.00      1.00        29
           CMake       1.00      1.00      1.00         1
             CSS       1.00      1.00      1.00         3
         Clojure       1.00      1.00      1.00         1
    CoffeeScript       1.00      1.00      1.00         2
      Dockerfile       1.00      1.00      1.00         4
          Elixir       1.00      1.00      1.00         1
        GDScript       1.00      1.00      1.00         1
            GLSL       1.00      1.00      1.00         4
              Go       1.00      1.00      1.00        16
             HCL       1.00      1.00      1.00         3
            HTML       1.00      1.00      1.00         3
         Hask

In [104]:
%%time
gb_y_preds_test = gradient_booster_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, gb_y_preds_test)
print(report)

                   precision    recall  f1-score   support

       AutoHotkey       0.00      0.00      0.00         1
                C       0.50      0.17      0.25         6
               C#       0.60      0.50      0.55        12
              C++       0.75      0.43      0.55         7
              CSS       0.00      0.00      0.00         1
     CoffeeScript       0.00      0.00      0.00         1
           Elixir       0.00      0.00      0.00         1
               Go       1.00      0.33      0.50         3
             HTML       0.00      0.00      0.00         3
             Java       0.68      0.94      0.79       126
       JavaScript       0.67      0.56      0.61        25
 Jupyter Notebook       0.00      0.00      0.00         2
           Kotlin       1.00      0.14      0.25         7
              Lua       0.00      0.00      0.00         1
         Mustache       0.00      0.00      0.00         1
              PHP       0.60      0.60      0.60       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Trying new data frame's cleaning

In [None]:
%%time
''' 
after new prep changes were created
'''
x_train, y_train, x_test, y_test = get_split_data()

In [12]:
%%time
gb_y_preds_train = gradient_booster_model(x_train, y_train)
report = classification_report(y_train, gb_y_preds_train)
print(report)

              precision    recall  f1-score   support

        Java       1.00      0.99      1.00       276
  JavaScript       1.00      1.00      1.00        61
       Other       0.99      1.00      1.00       230
      Python       1.00      1.00      1.00        51

    accuracy                           1.00       618
   macro avg       1.00      1.00      1.00       618
weighted avg       1.00      1.00      1.00       618

CPU times: user 21 s, sys: 129 ms, total: 21.1 s
Wall time: 21.2 s


In [13]:
%%time
gb_y_preds_test = gradient_booster_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, gb_y_preds_test)
print(report)

              precision    recall  f1-score   support

        Java       0.84      0.82      0.83       126
  JavaScript       0.79      0.60      0.68        25
       Other       0.65      0.73      0.69        90
      Python       0.78      0.72      0.75        25

    accuracy                           0.76       266
   macro avg       0.77      0.72      0.74       266
weighted avg       0.77      0.76      0.76       266

CPU times: user 20.8 s, sys: 107 ms, total: 20.9 s
Wall time: 21 s


<div class = 'alert alert-block alert-info'>
    
## testing hist gradient booster (sklearn's version of Light GMB classifier)
    
# NO luck. Did not work for this project with this much time to learn it
    
[Back to top](#Table-of-Contents)

In [17]:
%%time
x,y = get_xy()

model = HistGradientBoostingClassifier(learning_rate=0.1,
                                        max_depth = 5
                                        )


testing_df = pd.DataFrame(columns = ['speed', 'accuracy'])

start = time()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 7)
score = cross_val_score(model, x, y, scoring='accuracy', cv = cv, n_jobs=-1)

speed = np.round(time() - start, 3)
accuracy = np.mean(score).round(3)

testing_df.loc[0] = (speed, accuracy)
                          
print(f"SKLearn's Hist Gradient Booster \nMean Accuracy: {testing_df['accuracy']} \nStd: {np.std(score): .3f} \nRun time: {testing_df['speed']}")


SKLearn's Hist Gradient Booster 
Mean Accuracy: 0   NaN
Name: accuracy, dtype: float64 
Std:  nan 
Run time: 0    0.108
Name: speed, dtype: float64
CPU times: user 330 ms, sys: 58.4 ms, total: 388 ms
Wall time: 424 ms


10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py", line 233, in fit
    X, y = self._validate_data(X, y, dtype=[X_DTYPE], force_all_finite=False)
  File "/usr/local/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/usr/local/anaconda3/lib/python3.9/site-

In [96]:
%%time
x_train, y_train, x_test, y_test = get_split_data()
x,y = get_xy()
x_train = x_train.toarray()

CPU times: user 534 ms, sys: 61.9 ms, total: 596 ms
Wall time: 599 ms


In [98]:
type(x_train)

numpy.ndarray

In [99]:
%%time
hgb_y_preds_train = histgradientboost_model(x_train, y_train)
report = classification_report(y_train, hgb_y_preds_train)
print(report)

NameError: name 'HGBoost' is not defined

In [None]:
%%time
hgb_y_preds_test = histgradientboost_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, hgb_y_preds_test)
print(report)

<div class = 'alert alert-block alert-info'>
    
## testing xg gradient booster
    
[Back to top](#Table-of-Contents)

In [21]:
%%time
x,y = get_xy()

df['language'] = df['language'].map({'Python': 3, 'Other': 2, 'Java' : 0, 'JavaScript' : 1})

x = df['lemmatized']
y = df['language']

'''
xgb_params = {'max_depth'       : 3,
              'eta'             : 0.01,
              'silent'          : 0,
              'eval_metric'     : 'auc',
              'subsample'       : 0.8,
              'colsample_bytree': 0.8,
              'objective'       : 'binary:logistic'}

model = xgb(params = xgb_params,
                 num_boost_round = 2000,
                 verbose_eval = 50,
                 #early_stopping_rounds = 500,
                 #feval = f1_score_cust,
                 #evals = evals,
                 maximize = True)
'''
model = xgb()

testing_df = pd.DataFrame(columns = ['speed', 'accuracy'])

start = time()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 7)
score = cross_val_score(model, x, y, scoring='accuracy', cv = cv, n_jobs=-1)

speed = np.round(time() - start, 3)
accuracy = np.mean(score).round(3)

testing_df.loc[0] = (speed, accuracy)
                          
print(f"XG Boost \nMean Accuracy: {testing_df['accuracy']} \nStd: {np.std(score): .3f} \nRun time: {testing_df['speed']}")


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [16]:
df = get_df()
df['language'] = df['language'].map({'Python': 3, 'Other': 2, 'Java' : 0, 'JavaScript' : 1})

x = df['lemmatized']
y = df['language']

cv = CountVectorizer()
#x_vectorized = cv.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 7)

x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

In [22]:
%%time                                     
                                     
xgb_preds_train = xgboost_model(x_train, y_train)
report = classification_report(y_train, xgb_preds_train)
print(report)

Parameters: { "maximize", "num_boost_round", "params", "verbose_eval" } are not used.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       276
           1       1.00      0.97      0.98        61
           2       1.00      1.00      1.00       230
           3       1.00      1.00      1.00        51

    accuracy                           1.00       618
   macro avg       1.00      0.99      0.99       618
weighted avg       1.00      1.00      1.00       618

CPU times: user 15.1 s, sys: 181 ms, total: 15.3 s
Wall time: 1.37 s


In [23]:
%%time                                     
                                     
xgb_preds_test = xgboost_model(x_train, y_train, x_test, y_test, test = True)
report = classification_report(y_test, xgb_preds_test)
print(report)

Parameters: { "maximize", "num_boost_round", "params", "verbose_eval" } are not used.

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       126
           1       0.75      0.60      0.67        25
           2       0.65      0.68      0.66        90
           3       0.83      0.76      0.79        25

    accuracy                           0.75       266
   macro avg       0.76      0.72      0.74       266
weighted avg       0.75      0.75      0.75       266

CPU times: user 15 s, sys: 171 ms, total: 15.1 s
Wall time: 1.35 s


In [73]:
x,y = get_xy()

xgb_params = {'max_depth'       : 3,
                      'eta'             : 0.01,
                      'silent'          : 0,
                      'eval_metric'     : 'auc',
                      'subsample'       : 0.8,
                      'colsample_bytree': 0.8,
                      'objective'       : 'binary:logistic'}
    
    
#dtrain = xgb.DMatrix(x_train, y_train, feature_names = x_train.columns.values)
#dtest  = xgb.DMatrix(x_test, y_test, feature_names = x_test.columns.values)


xgboost = xgb(params = xgb_params,
                     num_boost_round = 2000,
                     verbose_eval = 50,
                     early_stopping_rounds = 500,
                     #feval = f1_score_cust,
                     #evals = evals,
                     maximize = True)


<div class = 'alert alert-block alert-info'>

# Testing with Catboost

[Back to top](#Table-of-Contents)

In [50]:
x_train, y_train, x_test, y_test = get_split_data()
model = CatBoostClassifier(loss_function = 'Logloss',
                        eval_metric = 'AUC',
                        learning_rate = 0.03)

model.fit(x_train, y_train, verbose = False, plot = True, use_best_model = True)

predictions = model.predict(x_train)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CatBoostError: catboost/private/libs/target/target_converter.cpp:379: Target with classes must contain only 2 unique values for binary classification

In [47]:
def catboost_model(x_train, y_train, x_test = 0, test = False):
    
    
    catboost_params = {'loss_function' : 'Logloss',
                        'eval_metric' : 'AUC',
                        'verbose' : 200,
                        'plot' : True}

    model = CatBoostClassifier(#loss_function = 'Logloss',
                        #eval_metric = 'AUC',
                        learning_rate = 0.03)

    model.fit(x_train, y_train, verbose = False, plot = True, use_best_model = True)

    if test == False:
        y_preds = catboost.predict(x_train)        
        return y_preds

    if test == True:
        y_preds = catboost.predict(x_test)
        return y_preds

In [48]:
######  Catboost Model Train ######
def Catboost_training():
    
    x_train, y_train, x_test, y_test = get_split_data()
    

    catboost_model(x_train, y_train)
    #report = classification_report(y_train, Catboost_y_preds_train)
    #print('Catboost train')
    #print(report)

######  Catboost Model Test  ######
def Catboost_test():
    
    x_train, y_train, x_test, y_test = get_split_data()
    
    Catboost_y_preds_test = catboost_model(x_train, y_train, x_test, test = True)
    report = classification_report(y_test, Catboost_y_preds_test)
    print('Catboost test')
    print(report)

In [49]:
Catboost_training()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

You should provide test set for use best model. use_best_model parameter has been switched to false value.


NameError: name 'catboost' is not defined

In [18]:
Catboost_test()

You should provide test set for use best model. use_best_model parameter has been switched to false value.


Learning rate set to 0.077208
0:	learn: 1.3423420	total: 191ms	remaining: 3m 10s
1:	learn: 1.3161455	total: 282ms	remaining: 2m 20s
2:	learn: 1.2753405	total: 358ms	remaining: 1m 58s
3:	learn: 1.2375012	total: 439ms	remaining: 1m 49s
4:	learn: 1.2117020	total: 513ms	remaining: 1m 41s
5:	learn: 1.1800652	total: 584ms	remaining: 1m 36s
6:	learn: 1.1487127	total: 657ms	remaining: 1m 33s
7:	learn: 1.1284531	total: 732ms	remaining: 1m 30s
8:	learn: 1.1094248	total: 805ms	remaining: 1m 28s
9:	learn: 1.0906197	total: 878ms	remaining: 1m 26s
10:	learn: 1.0710945	total: 952ms	remaining: 1m 25s
11:	learn: 1.0554719	total: 1.02s	remaining: 1m 24s
12:	learn: 1.0381624	total: 1.13s	remaining: 1m 25s
13:	learn: 1.0293388	total: 1.21s	remaining: 1m 25s
14:	learn: 1.0127369	total: 1.28s	remaining: 1m 24s
15:	learn: 1.0027754	total: 1.36s	remaining: 1m 23s
16:	learn: 0.9972500	total: 1.44s	remaining: 1m 23s
17:	learn: 0.9914397	total: 1.51s	remaining: 1m 22s
18:	learn: 0.9830598	total: 1.58s	remaining: