In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from utils import pipeline, calculate_metrics

In [2]:
# Seed handed to every estimator below so that reruns give the same models.
random_state = 42

# Directory holding the reviews that are passed to the data pipeline.
fp_data = "./op_spam_v1.4/negative_polarity/"

# Vocabulary filtering:
#   min_df     - minimum document frequency a word needs to be kept in the vocabulary
#   stop_words - English stop words from NLTK, stored as a set
min_df = 0.05
stop_words = set(stopwords.words("english"))

# Size of the validation set as a percentage of the training set; None disables it.
val_size = None

## Using a classification tree and random forests on uni-grams
We first train and tune both the classification tree and the random forest on the uni-grams of the data.

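For the classification tree we tune the cost-complexity pruning parameter (`ccp_alpha`), the maximum depth (`max_depth`) and the minimum number of samples per leaf (`min_samples_leaf`). For the random forest we tune the number of trees (`n_estimators`), `max_depth` and `max_features`.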



In [3]:
# Vectorizer settings for the uni-gram experiment.
max_features = None  # maximum vocabulary size
ngram_range = (1, 1)  # range of n-grams to include in the vocabulary

# Load the data through the shared pipeline helper, which hands back the
# train/test split together with the vectorizer.
X_train, X_test, y_train, y_test, vectorizer = pipeline(
    fp_data,
    max_features,
    ngram_range,
    min_df,
    stop_words=stop_words,
    val_size=val_size,
)

# Sanity check: sizes of the train and test sets and of the vocabulary.
# (No validation set is reported here; only train and test are unpacked above.)
print(f"X_train: {X_train.shape}; y_train: {len(y_train)}")
print(f"X_test: {X_test.shape}; y_test: {len(y_test)}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

X_train: (640, 326); y_train: 640
X_test: (160, 326); y_test: 160
Vocabulary size: 326


In [4]:
# Hyperparameter grid searched for the classification tree.
class_tree_params = {
    "max_depth": [None, 10, 20, 30],
    "ccp_alpha": [0.0, 0.05, 0.1, 0.2],
    "min_samples_leaf": [1, 2, 4, 6],
}

# Hyperparameter grid searched for the random forest.
rand_forest_params = {
    "n_estimators": [50, 100, 150],
    "max_depth": [None, 10, 20, 30],
    "max_features": [None, "sqrt", 0.25],
}

def tune_model(model, params, x, y, cv=5, scoring=None, n_jobs=None):
    """Grid-search `params` for `model` and return the best combination.

    Args:
        model: Unfitted estimator handed to ``GridSearchCV``.
        params: Parameter grid (dict of parameter name -> list of candidate values).
        x: Training features.
        y: Training labels.
        cv: Cross-validation setting forwarded to ``GridSearchCV``; defaults to
            the 5 folds this notebook has always used.
        scoring: Optional scoring setting forwarded to ``GridSearchCV``; ``None``
            keeps the estimator's default scorer.
        n_jobs: Optional number of parallel jobs forwarded to ``GridSearchCV``.

    Returns:
        The ``best_params_`` dict of the fitted search.
    """
    search = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=n_jobs)
    search.fit(x, y)
    return search.best_params_

def print_metrics(y_true, y_pred):
    """Format accuracy, precision, recall and F1 score for a set of predictions.

    Despite the name, nothing is printed: the formatted line is returned so the
    caller can embed it in its own output.

    Args:
        y_true: Ground-truth labels the predictions are scored against.
        y_pred: Predicted labels, in the same order as `y_true`.

    Returns:
        A string of the form
        ``accuracy: ...; precision: ...; recall: ...; f1_score: ...``.
    """
    # Score against the labels that were passed in, not the notebook-level
    # `y_test`, so the function also works for other label sets.
    acc, pre, rec, f1 = calculate_metrics(np.asarray(y_true), np.asarray(y_pred))
    return f'accuracy: {acc}; precision: {pre}; recall: {rec}; f1_score: {f1}'

In [5]:
# Uni-gram decision tree: search the hyperparameter grid on the training split.
tree = DecisionTreeClassifier(random_state=random_state)
best_params = tune_model(tree, class_tree_params, X_train, y_train)

# Build a fresh tree with the best parameters, fit it on the training split
# and evaluate it on the test split.
bp_tree = DecisionTreeClassifier(random_state=random_state, **best_params)
bp_tree.fit(X_train, y_train)
y_pred = bp_tree.predict(X_test)
print(f"best parameters decision tree (uni-gram):\n{print_metrics(y_test, y_pred)}\nbest params found: {best_params}")

best parameters decision tree (uni-gram):
accuracy: 0.64375; precision: 0.6419753086419753; recall: 0.65; f1_score: 0.6459627329192547
best params found: {'ccp_alpha': 0.05, 'max_depth': None, 'min_samples_leaf': 1}


In [6]:
# Uni-gram random forest: search the hyperparameter grid on the training split.
rd_forest = RandomForestClassifier(random_state=random_state)
best_params = tune_model(rd_forest, rand_forest_params, X_train, y_train)

# Build a fresh forest with the best parameters, fit it on the training split
# and evaluate it on the test split.
bp_forest = RandomForestClassifier(random_state=random_state, **best_params)
bp_forest.fit(X_train, y_train)
y_pred = bp_forest.predict(X_test)
print(f"best parameters random forest (uni-gram):\n{print_metrics(y_test, y_pred)}\nbest params found: {best_params}")

best parameters random forest (uni-gram):
accuracy: 0.76875; precision: 0.7471264367816092; recall: 0.8125; f1_score: 0.778443113772455
best params found: {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 100}


In [7]:
# visualizing


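## Using a classification tree and random forests on uni-grams and bi-grams
We repeat the experiment with `ngram_range = (1, 2)`, so the vocabulary contains bi-grams in addition to the uni-grams.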

In [8]:
# Widen the n-gram range to (1, 2) for this experiment.
ngram_range = (1, 2)  # range of n-grams to include in the vocabulary

# Reload the data through the shared pipeline helper with the new n-gram range.
X_train, X_test, y_train, y_test, vectorizer = pipeline(
    fp_data,
    max_features,
    ngram_range,
    min_df,
    stop_words=stop_words,
    val_size=val_size,
)

# Sanity check: sizes of the train and test sets and of the vocabulary.
# (No validation set is reported here; only train and test are unpacked above.)
print(f"X_train: {X_train.shape}; y_train: {len(y_train)}")
print(f"X_test: {X_test.shape}; y_test: {len(y_test)}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

X_train: (640, 340); y_train: 640
X_test: (160, 340); y_test: 160
Vocabulary size: 340


In [9]:
# Bi-gram decision tree: search the hyperparameter grid on the training split.
tree = DecisionTreeClassifier(random_state=random_state)
best_params = tune_model(tree, class_tree_params, X_train, y_train)

# Build a fresh tree with the best parameters, fit it on the training split
# and evaluate it on the test split.
bp_tree = DecisionTreeClassifier(random_state=random_state, **best_params)
bp_tree.fit(X_train, y_train)
y_pred = bp_tree.predict(X_test)
print(f"best parameters decision tree (bi-gram):\n{print_metrics(y_test, y_pred)}\nbest params found: {best_params}")

best parameters decision tree (bi-gram):
accuracy: 0.64375; precision: 0.6419753086419753; recall: 0.65; f1_score: 0.6459627329192547
best params found: {'ccp_alpha': 0.05, 'max_depth': None, 'min_samples_leaf': 1}


In [10]:
# Bi-gram random forest: search the hyperparameter grid on the training split.
rd_forest = RandomForestClassifier(random_state=random_state)
best_params = tune_model(rd_forest, rand_forest_params, X_train, y_train)

# Build a fresh forest with the best parameters, fit it on the training split
# and evaluate it on the test split.
bp_forest = RandomForestClassifier(random_state=random_state, **best_params)
bp_forest.fit(X_train, y_train)
y_pred = bp_forest.predict(X_test)
print(f"best parameters random forest (bi-gram):\n{print_metrics(y_test, y_pred)}\nbest params found: {best_params}")

best parameters random forest (bi-gram):
accuracy: 0.725; precision: 0.7142857142857143; recall: 0.75; f1_score: 0.7317073170731706
best params found: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100}
