# Classification with Decision Trees and Random Forests

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from classification_utils import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.datasets import make_blobs
import statistics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.decomposition import PCA
import pickle
from sklearn.metrics import RocCurveDisplay

The only preprocessing step applied is the one-hot encoding of the **Lang** feature.

## Decision trees

Run a cross-validated grid search over the sklearn Decision Tree implementation. The parameters over which the grid search runs are:
- **ccp_alpha**: Complexity parameter used for Minimal Cost-Complexity Pruning.
- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.
- **min_samples_split**: The minimum number of samples required to split an internal node.
- **min_samples_leaf**: The minimum number of samples required to be at a leaf node.
- **criterion**: The function to measure the quality of a split.

In [None]:
# Load the cleaned user profiles and append one indicator column per distinct
# value of "lang"; the raw "lang" column is still present after the join.
data = pd.read_csv("dataset/cleaned_user_profiles.csv", index_col=0)
lang_dummies = pd.get_dummies(data["lang"])
data = data.join(lang_dummies)

# prepare_data is not defined in this notebook (presumably it comes from
# classification_utils — confirm); it returns train/test features and labels.
(train_set, test_set,
 train_label, test_label) = prepare_data(data)

In [None]:
# Hyper-parameter grid for the decision tree: 10 * 10 * 3 * 4 * 2 = 2400
# combinations, i.e. 7200 fits with 3-fold cross-validation.
param_grid = {
              'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
              'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
              'min_samples_split': [2, 3, 4],
              'min_samples_leaf': [1, 2, 3, 4],
              'criterion': ['gini', 'entropy']
             }

# Fixed seed (same value the randomized search in this notebook uses) so that
# a Restart & Run All selects the same tree every time.
dt = tree.DecisionTreeClassifier(random_state=42)

grid = GridSearchCV(
    dt,
    param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=False,
    verbose=1,
    n_jobs=-1,  # candidates are independent: evaluate them on all cores
)

# Fit on the raw feature matrix and report the winning combination together
# with its mean cross-validated accuracy.
grid.fit(train_set.values, train_label)
print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)


In [None]:
# Keep the estimator selected by the grid search; the bare name on the last
# line makes the notebook display it.
dt = grid.best_estimator_
dt

## Print Metrics

In [None]:
# Predict labels for both the training and the test features with the selected tree.
train_pred = dt.predict(train_set.values)
test_pred = dt.predict(test_set.values)

In [None]:
# print_metrics is not defined in this notebook (presumably provided by
# classification_utils and printing train vs. test scores — confirm).
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# report_scores is not defined in this notebook (presumably a per-class report
# from classification_utils — confirm); it receives the test labels and predictions.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the selected tree on the test split.
predictions = dt.predict(test_set.values)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)

# Render the matrix as a heat-map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the selected tree, computed on the test split.
RocCurveDisplay.from_estimator(
    estimator=dt,
    X=test_set.values,
    y=test_label,
)
plt.show()

### PCA blobs

In [None]:
# Project the test features onto two principal components for 2-D plotting.
# NOTE(review): the PCA is fitted on the test split itself and on unscaled
# values — acceptable for a visualisation, but confirm this is intended.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(test_set.values)

In [None]:
# True labels: test points in the 2-D PCA projection, coloured by test_label.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Predicted labels: same projection, coloured by the tree's test predictions.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_pred, s=25);

### Save model

In [None]:
# Serialise the selected decision tree (trained with the one-hot "lang" columns).
with open("models_checkpoints/decision_tree_lang.bin", "wb") as checkpoint:
    pickle.dump(dt, checkpoint)

### Removed "lang" attribute for classification

Since we are not sure about the usefulness of the **Lang** feature, we remove it to see how the accuracy changes. We also tried removing other features, but in those cases the model lost some accuracy; we will see that this is not the case for Lang.

In [None]:
# Reload the cleaned profiles without appending the one-hot "lang" columns.
data = pd.read_csv("dataset/cleaned_user_profiles.csv", index_col=0)
train_set, test_set, train_label, test_label = prepare_data(data)

# The original cell called `data.drop(columns=['lang'])` and discarded the
# result; DataFrame.drop returns a new frame, so nothing was ever removed.
# The column is dropped from the splits instead of from `data`, because
# prepare_data (not visible in this notebook) may still reference it.
# errors='ignore' keeps this a no-op when the helper already removed "lang".
# Assumes train_set / test_set are DataFrames (they are used via .values below).
train_set = train_set.drop(columns=['lang'], errors='ignore')
test_set = test_set.drop(columns=['lang'], errors='ignore')

In [None]:
# Hyper-parameter grid for the decision tree trained without "lang":
# 10 * 10 * 3 * 4 * 2 = 2400 combinations, 7200 fits with 3-fold CV.
param_grid = {
              'ccp_alpha': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
              'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.5, 0.2],
              'min_samples_split': [2, 3, 4],
              'min_samples_leaf': [1, 2, 3, 4],
              'criterion': ['gini', 'entropy']
             }

# Fixed seed (same value the randomized search in this notebook uses) so the
# comparison against the "with lang" model is reproducible across runs.
dt = tree.DecisionTreeClassifier(random_state=42)

grid = GridSearchCV(
    dt,
    param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=False,
    verbose=1,
    n_jobs=-1,  # candidates are independent: evaluate them on all cores
)

# Fit on the raw feature matrix and report the winning combination together
# with its mean cross-validated accuracy.
grid.fit(train_set.values, train_label)
print(
    "The best parameters are %s with a score of %0.5f"
    % (grid.best_params_, grid.best_score_)
)


In [None]:
# Keep the estimator selected by the "no lang" grid search; the bare name on
# the last line makes the notebook display it.
dt = grid.best_estimator_
dt

## Print Metrics

In [None]:
# Predict labels for the training and the test features with the "no lang" tree.
train_pred = dt.predict(train_set.values)
test_pred = dt.predict(test_set.values)

In [None]:
# print_metrics is not defined in this notebook (presumably provided by
# classification_utils and printing train vs. test scores — confirm).
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# report_scores is not defined in this notebook (presumably a per-class report
# from classification_utils — confirm); it receives the test labels and predictions.
report_scores(test_label,test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the "no lang" tree on the test split.
predictions = dt.predict(test_set.values)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)

# Render the matrix as a heat-map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the "no lang" tree, computed on the test split.
RocCurveDisplay.from_estimator(
    estimator=dt,
    X=test_set.values,
    y=test_label,
)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for 2-D plotting.
# NOTE(review): the PCA is fitted on the test split itself and on unscaled
# values — acceptable for a visualisation, but confirm this is intended.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(test_set.values)

In [None]:
# True labels: test points in the 2-D PCA projection, coloured by test_label.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Predicted labels: same projection, coloured by the tree's test predictions.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_pred, s=25);

### Save model

In [None]:
# Serialise the decision tree selected in the "no lang" experiment.
with open("models_checkpoints/decision_tree_no_lang.bin", "wb") as checkpoint:
    pickle.dump(dt, checkpoint)

## Random forest

A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

Run a cross-validated grid search over the sklearn RandomForestClassifier implementation. The parameters over which the grid search runs are:
- **ccp_alpha**: Complexity parameter used for Minimal Cost-Complexity Pruning.
- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.
- **min_samples_split**: The minimum number of samples required to split an internal node.
- **min_samples_leaf**: The minimum number of samples required to be at a leaf node.
- **n_estimators**: The number of trees in the forest.

In [None]:
# Reload the cleaned user profiles and append one indicator column per distinct
# value of "lang"; the raw "lang" column is still present after the join.
data = pd.read_csv("dataset/cleaned_user_profiles.csv", index_col=0)
lang_dummies = pd.get_dummies(data["lang"])
data = data.join(lang_dummies)

# prepare_data is not defined in this notebook (presumably it comes from
# classification_utils — confirm); it returns train/test features and labels.
(train_set, test_set,
 train_label, test_label) = prepare_data(data)

In [None]:
# Hyper-parameter grid for the random forest: 3 * 3 * 4 * 4 * 7 = 1008
# combinations, i.e. 3024 forest fits with 3-fold cross-validation.
param_grid = {
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4],
    'ccp_alpha': [0.0, 0.1, 0.05, 0.001],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.001],
    'n_estimators': np.arange(80, 150, 10),  # 80, 90, ..., 140
}

# Fixed seed (same value the randomized search in this notebook uses): the
# forest's bootstrap and feature sampling are random, so without it each run
# could select a different model.
rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=False,
    verbose=1,
    n_jobs=-1,  # candidates are independent: evaluate them on all cores
)

# Left as the last expression so the notebook displays the fitted search object.
grid.fit(train_set.values, train_label)

In [None]:
# Keep the forest selected by the grid search; the bare name on the last line
# makes the notebook display it.
rf = grid.best_estimator_
rf

## Print Metrics

In [None]:
# Predict labels for both the training and the test features with the selected forest.
train_pred = rf.predict(train_set.values)
test_pred = rf.predict(test_set.values)

In [None]:
# print_metrics is not defined in this notebook (presumably provided by
# classification_utils and printing train vs. test scores — confirm).
print_metrics(train_label, train_pred, test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the selected forest on the test split.
predictions = rf.predict(test_set.values)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)

# Render the matrix as a heat-map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the selected forest, computed on the test split.
RocCurveDisplay.from_estimator(
    estimator=rf,
    X=test_set.values,
    y=test_label,
)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for 2-D plotting.
# NOTE(review): the PCA is fitted on the test split itself and on unscaled
# values — acceptable for a visualisation, but confirm this is intended.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(test_set.values)

In [None]:
# True labels: test points in the 2-D PCA projection, coloured by test_label.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Predicted labels: same projection, coloured by the forest's test predictions.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_pred, s=25);

### Save model

In [None]:
# Serialise the selected random forest (trained with the one-hot "lang" columns).
with open("models_checkpoints/random_forest_lang.bin", "wb") as checkpoint:
    pickle.dump(rf, checkpoint)

### Removed "lang" attribute for classification

In [None]:
# Reload the cleaned profiles without appending the one-hot "lang" columns.
data = pd.read_csv("dataset/cleaned_user_profiles.csv", index_col=0)
train_set, test_set, train_label, test_label = prepare_data(data)

# This section is meant to train without "lang", but nothing in the original
# cell removed the column; it relied entirely on prepare_data (not visible in
# this notebook). Enforce it on the splits: errors='ignore' makes this a no-op
# when the helper already removed the column.
# Assumes train_set / test_set are DataFrames (they are used via .values below).
train_set = train_set.drop(columns=['lang'], errors='ignore')
test_set = test_set.drop(columns=['lang'], errors='ignore')

In [None]:
# Hyper-parameter grid for the random forest trained without "lang":
# 3 * 3 * 4 * 4 * 7 = 1008 combinations, 3024 forest fits with 3-fold CV.
param_grid = {
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4],
    'ccp_alpha': [0.0, 0.1, 0.05, 0.001],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.05, 0.001],
    'n_estimators': np.arange(80, 150, 10),  # 80, 90, ..., 140
}

# Fixed seed (same value the randomized search in this notebook uses) so the
# comparison against the "with lang" forest is reproducible across runs.
rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=False,
    # verbose=4 printed one line per fit (3024 lines); use the same level as
    # the other grid searches in this notebook.
    verbose=1,
    n_jobs=-1,  # candidates are independent: evaluate them on all cores
)

# Left as the last expression so the notebook displays the fitted search object.
grid.fit(train_set.values, train_label)

In [None]:
# Keep the forest selected by the "no lang" grid search; the bare name on the
# last line makes the notebook display it.
rf = grid.best_estimator_
rf

## Print Metrics

In [None]:
# Predict labels for the training and the test features with the "no lang" forest.
train_pred = rf.predict(train_set.values)
test_pred = rf.predict(test_set.values)

In [None]:
# print_metrics is not defined in this notebook (presumably provided by
# classification_utils and printing train vs. test scores — confirm).
print_metrics(train_label, train_pred, test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the "no lang" forest on the test split.
predictions = rf.predict(test_set.values)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)

# Render the matrix as a heat-map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the "no lang" forest, computed on the test split.
RocCurveDisplay.from_estimator(
    estimator=rf,
    X=test_set.values,
    y=test_label,
)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for 2-D plotting.
# NOTE(review): the PCA is fitted on the test split itself and on unscaled
# values — acceptable for a visualisation, but confirm this is intended.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(test_set.values)

In [None]:
# True labels: test points in the 2-D PCA projection, coloured by test_label.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Predicted labels: same projection, coloured by the forest's test predictions.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_pred, s=25);

### Save model

In [None]:
# Serialise the random forest selected in the "no lang" grid search.
with open("models_checkpoints/random_forest_no_lang.bin", "wb") as checkpoint:
    pickle.dump(rf, checkpoint)

## Random Hyperparameter Grid Search

Try the RandomizedSearchCV from Sklearn, over the dataset without the **Lang** feature.

In contrast to GridSearchCV, not all parameter values are tried out; instead, a fixed number of parameter settings is sampled from the specified distributions.

In [None]:
# Candidate values for the randomized search.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by current
# scikit-learn releases, so the original list could not be used there and held a
# single effective choice. Use the two explicit, supported strategies instead.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None lets the trees grow without a depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seed the forest as well as the sampler below; otherwise only the choice of
# the 100 candidates is reproducible, not the models fitted for them.
rf = RandomForestClassifier(random_state=42)

# Sample 100 parameter settings from random_grid and score each with 3-fold CV.
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 100,
    cv = 3,
    verbose=4,
    random_state=42,
    n_jobs = -1
)

# Uses the train split left in scope by the preceding "no lang" cells.
rf_random.fit(train_set.values, train_label)
print(
    "The best parameters are %s with a score of %0.5f"
    % (rf_random.best_params_, rf_random.best_score_)
)

In [None]:
# Keep the forest selected by the randomized search.
rf = rf_random.best_estimator_

## Print Metrics

In [None]:
# Predict labels for the training and the test features with the forest from the randomized search.
train_pred = rf.predict(train_set.values)
test_pred = rf.predict(test_set.values)

In [None]:
# print_metrics is not defined in this notebook (presumably provided by
# classification_utils and printing train vs. test scores — confirm).
print_metrics(train_label, train_pred, test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the randomized-search forest on the test split.
predictions = rf.predict(test_set.values)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)

# Render the matrix as a heat-map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the randomized-search forest, computed on the test split.
RocCurveDisplay.from_estimator(
    estimator=rf,
    X=test_set.values,
    y=test_label,
)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for 2-D plotting.
# NOTE(review): the PCA is fitted on the test split itself and on unscaled
# values — acceptable for a visualisation, but confirm this is intended.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(test_set.values)

In [None]:
# True labels: test points in the 2-D PCA projection, coloured by test_label.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Predicted labels: same projection, coloured by the forest's test predictions.
plt.scatter(test_set_reduced[:, 0], test_set_reduced[:, 1], c=test_pred, s=25);

### Save Model

In [None]:
# Serialise the random forest selected by the randomized search.
with open("models_checkpoints/random_forest_2_no_lang.bin", "wb") as checkpoint:
    pickle.dump(rf, checkpoint)