In [6]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from utils import pipeline, calculate_metrics


In [3]:
# Experiment configuration. Every name bound in this cell is a notebook-level
# global that the helper functions and later cells read directly.
fp_data = "./op_spam_v1.4/negative_polarity/"  # Data location handed to pipeline()
max_features = None  # Maximum vocab size
ngram_range = (1, 1)  # Range of n-grams to include in the vocabulary
min_df = 0.05  # Minimal document frequency of a word to be included in the vocabulary
stop_words = set(stopwords.words("english"))  # NLTK's English stop-word list, as a set
val_size = 0.2  # Size of the validation set as a fraction of the training set, set to None to disable

# pipeline() is a project helper (utils); it returns the three feature/label
# splits plus the fitted vectorizer, unpacked here in that order.
X_train, X_val, X_test, y_train, y_val, y_test, vectorizer = pipeline(fp_data, max_features, ngram_range, min_df,
                                                                        stop_words=stop_words, val_size=val_size)

# Sanity check: report the size of every split and of the learned vocabulary.
print(f"X_train: {X_train.shape}; y_train: {len(y_train)}")
print(f"X_val: {X_val.shape}; y_val: {len(y_val)}")
print(f"X_test: {X_test.shape}; y_test: {len(y_test)}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")


X_train: (512, 326); y_train: 512
X_val: (128, 326); y_val: 128
X_test: (160, 326); y_test: 160
Vocabulary size: 326


In [9]:
def tuning(model, values: list, param: str, X=None, y=None, cv=10):
    """Grid-search a single hyper-parameter and return its best value.

    Parameters
    ----------
    model
        Unfitted estimator handed to ``GridSearchCV``.
    values : list
        Candidate values to try for ``param``.
    param : str
        Name of the estimator parameter to tune (e.g. ``'alpha'`` or ``'C'``).
    X, y : optional
        Features and labels to run the search on. When omitted, the
        notebook-level validation split ``X_val`` / ``y_val`` is used; the
        globals are looked up at call time, so they reflect whatever the most
        recently executed data-loading cell produced.
    cv : optional
        Cross-validation setting forwarded to ``GridSearchCV`` (default 10).

    Returns
    -------
    The entry for ``param`` in the search's ``best_params_``. The value is
    also printed.
    """
    # Fall back to the notebook's validation split when no data is supplied.
    if X is None:
        X = X_val
    if y is None:
        y = y_val

    # Named `search` rather than `tuning` so the local does not shadow this function.
    search = GridSearchCV(model, {param: values}, cv=cv)
    search.fit(X, y)

    best_param = search.best_params_[param]
    print('tuned parameter ', best_param)
    return best_param

def model_creation(model):
    """Train ``model`` on the training split and print its test-split scores.

    The estimator is fitted in place on the notebook-level ``X_train`` /
    ``y_train`` and then used to predict ``X_test``. The four values returned
    by ``calculate_metrics`` are printed as accuracy, precision, recall and
    F1. Nothing is returned.
    """
    model.fit(X_train, y_train)

    # calculate_metrics receives both label vectors as NumPy arrays,
    # true labels first and predictions second.
    y_pred = np.asarray(model.predict(X_test))
    y_true = np.asarray(y_test)
    accuracy, precision, recall, f1_score = calculate_metrics(y_true, y_pred)

    print('accuracy: ', accuracy, 'precision:', precision, 'recall: ', recall,  'f1_score:', f1_score)
    

In [11]:
# Multinomial Naive Bayes: grid-search `alpha`, then score the tuned
# estimator and an all-defaults estimator with the same helper.
values = [0.01, 0.1, 1, 10, 100]
tuned_param = tuning(MultinomialNB(), values, 'alpha')

for label, estimator in (
    ('tuned model', MultinomialNB(alpha=tuned_param)),
    ('default model', MultinomialNB()),
):
    print(label)
    model_creation(estimator)



tuned parameter  1
tuned model
accuracy:  0.84375 precision: 0.8571428571428571 recall:  0.825 f1_score: 0.8407643312101911
default model
accuracy:  0.84375 precision: 0.8571428571428571 recall:  0.825 f1_score: 0.8407643312101911


In [12]:
# L1-penalised logistic regression: grid-search `C`, then score the tuned
# estimator and a default-`C` estimator with the same helper.
#
# The liblinear solver shuffles the data using `random_state`; left unset,
# the scores can differ from run to run. Pinning it makes the cell
# reproducible. The shared kwargs also keep all three estimators identically
# configured apart from `C`.
lr_params = {'penalty': 'l1', 'solver': 'liblinear', 'random_state': 42}

values = [0.01, 0.1, 1, 10, 100]
tuned_param = tuning(LogisticRegression(**lr_params), values, 'C')

print('tuned model')
model_creation(LogisticRegression(**lr_params, C=tuned_param))
print('default model')
model_creation(LogisticRegression(**lr_params))

tuned parameter  10
tuned model
accuracy:  0.7875 precision: 0.7804878048780488 recall:  0.8 f1_score: 0.7901234567901235
default model
accuracy:  0.8 precision: 0.8157894736842105 recall:  0.775 f1_score: 0.7948717948717949


In [15]:
# Repeat the Naive Bayes experiment with unigrams + bigrams.
# NOTE: this rebinds the notebook-level splits (X_train, X_val, X_test, ...)
# that tuning() and model_creation() read as globals, so any earlier cell
# re-run after this one will silently use the bigram features.
ngram_range = (1, 2)
(X_train, X_val, X_test,
 y_train, y_val, y_test,
 vectorizer) = pipeline(
    fp_data, max_features, ngram_range, min_df,
    stop_words=stop_words, val_size=val_size,
)

values = [0.01, 0.1, 1, 10, 100]
tuned_param = tuning(MultinomialNB(), values, 'alpha')

for label, estimator in (
    ('tuned model', MultinomialNB(alpha=tuned_param)),
    ('default model', MultinomialNB()),
):
    print(label)
    model_creation(estimator)


tuned parameter  10
tuned model
accuracy:  0.825 precision: 0.8421052631578947 recall:  0.8 f1_score: 0.8205128205128205
default model
accuracy:  0.8375 precision: 0.8375 recall:  0.8375 f1_score: 0.8375


In [16]:
# L1-penalised logistic regression on the splits currently bound in the
# notebook: grid-search `C`, then score the tuned estimator and a
# default-`C` estimator with the same helper.
#
# The liblinear solver shuffles the data using `random_state`; left unset,
# the scores can differ from run to run. Pinning it makes the cell
# reproducible. The shared kwargs also keep all three estimators identically
# configured apart from `C`.
lr_params = {'penalty': 'l1', 'solver': 'liblinear', 'random_state': 42}

values = [0.01, 0.1, 1, 10, 100]
tuned_param = tuning(LogisticRegression(**lr_params), values, 'C')

print('tuned model')
model_creation(LogisticRegression(**lr_params, C=tuned_param))
print('default model')
model_creation(LogisticRegression(**lr_params))

tuned parameter  10
tuned model
accuracy:  0.76875 precision: 0.7721518987341772 recall:  0.7625 f1_score: 0.7672955974842767
default model
accuracy:  0.8125 precision: 0.8289473684210527 recall:  0.7875 f1_score: 0.8076923076923076
