In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Gradient boosting
## Load the data

In [2]:
# Dataframe
path_df = "Pickles/df.pickle"
with open(path_df, 'rb') as data:
    df = pickle.load(data)

# features_train
path_features_train = "Pickles/features_train.pickle"
with open(path_features_train, 'rb') as data:
    features_train = pickle.load(data)

# labels_train
path_labels_train = "Pickles/labels_train.pickle"
with open(path_labels_train, 'rb') as data:
    labels_train = pickle.load(data)

# features_test
path_features_test = "Pickles/features_test.pickle"
with open(path_features_test, 'rb') as data:
    features_test = pickle.load(data)

# labels_test
path_labels_test = "Pickles/labels_test.pickle"
with open(path_labels_test, 'rb') as data:
    labels_test = pickle.load(data)

### Cross-Validation for Hyperparameter tuning

Hyperparameters of the model:


In [3]:
gbc = GradientBoostingClassifier()

print('Parameters currently in use:\n')
print(gbc.get_params())

Parameters currently in use:

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


#### Gradient boosting takes a lot of time, investigate only max_depth

In [4]:
max_depth = [3,4,5]

param_grid = {'max_depth': max_depth}

# Definition of the search
grid_search = GridSearchCV(estimator = gbc,
                           param_grid = param_grid,
                           scoring = 'accuracy',
                           cv = 3,
                           verbose = 2)

# Fit the random search model
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ........................................max_depth=3; total time= 3.7min
[CV] END ........................................max_depth=3; total time= 3.7min
[CV] END ........................................max_depth=3; total time= 3.6min
[CV] END ........................................max_depth=4; total time= 4.8min
[CV] END ........................................max_depth=4; total time= 4.8min
[CV] END ........................................max_depth=4; total time= 4.7min
[CV] END ........................................max_depth=5; total time= 5.4min
[CV] END ........................................max_depth=5; total time= 5.5min
[CV] END ........................................max_depth=5; total time= 5.4min


GridSearchCV(cv=3, estimator=GradientBoostingClassifier(),
             param_grid={'max_depth': [3, 4, 5]}, scoring='accuracy',
             verbose=2)

In [5]:
print("The best hyperparameters from Random Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

The best hyperparameters from Random Search are:
{'max_depth': 3}

The mean accuracy of a model with these hyperparameters is:
0.8070551549405574


In [6]:
best_gbc = grid_search.best_estimator_

### Model fit and performance

In [8]:
# fit
best_gbc.fit(features_train, labels_train)

GradientBoostingClassifier()

In [9]:
# predict
gbc_pred = best_gbc.predict(features_test)

In [10]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_gbc.predict(features_train)))

# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, gbc_pred))

The training accuracy is: 
0.8957967907490417
The test accuracy is: 
0.8076448828606658


In [11]:
# Classification report
print("Classification report")
print(classification_report(labels_test,gbc_pred))

Classification report
              precision    recall  f1-score   support

           C       0.89      0.87      0.88        54
         C++       0.97      0.89      0.93        44
     Fortran       0.92      0.95      0.93        37
          Go       0.98      0.88      0.93        50
     Haskell       0.90      0.67      0.77        67
        Java       1.00      0.92      0.96        49
  JavaScript       0.85      0.72      0.78        54
       Julia       0.90      0.56      0.69        32
      Kotlin       0.97      0.97      0.97        32
      MATLAB       0.56      0.37      0.44        27
 Mathematica       0.36      0.85      0.51        46
         PHP       0.74      0.71      0.72        24
        Perl       0.91      0.91      0.91        57
      Python       0.78      0.86      0.82        72
           R       0.68      0.70      0.69        30
        Ruby       0.88      0.82      0.85        55
        Rust       1.00      0.90      0.95        21
     

In [1]:
conf_matrix = confusion_matrix(labels_test, gbc_pred, normalize = 'true')
plt.figure(figsize=(12.8,6))

sns.heatmap(conf_matrix, 
            cmap="YlGn",
            annot = True,
   #         fmt = '.2f',
            xticklabels = df['language'].unique(),
            yticklabels = df['language'].unique())
plt.ylabel('Predicted')
plt.xlabel('Actual')
plt.title('Confusion matrix, normalised')
plt.show()

NameError: name 'confusion_matrix' is not defined

In [13]:
d = {
     'Model': 'Gradient Boosting',
     'Training Set Accuracy': accuracy_score(labels_train, best_gbc.predict(features_train)),
     'Test Set Accuracy': accuracy_score(labels_test, gbc_pred)
}

df_models_gbc = pd.DataFrame(d, index=[0])

In [14]:
df_models_gbc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Gradient Boosting,0.895797,0.807645


In [15]:
with open('Models/best_gbc.pickle', 'wb') as output:
    pickle.dump(best_gbc, output)
    
with open('Models/df_models_gbc.pickle', 'wb') as output:
    pickle.dump(df_models_gbc, output)