In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from utils import pipeline, calculate_metrics


In [2]:
# Hyper-parameter grids handed to the grid search.
# Logistic regression: penalty type crossed with the regularisation parameter C.
log_regr_params = dict(
    penalty=["l1", "l2"],
    C=[0.01, 0.1, 1, 10, 100],
)

# Multinomial Naive Bayes: candidate values for alpha.
bayes_params = dict(alpha=[0.01, 0.1, 1, 10, 100])

def tune_model(model, params, x, y, cv=10, scoring=None):
    """Grid-search ``params`` for ``model`` with cross-validation.

    Args:
        model: Unfitted estimator handed to ``GridSearchCV``.
        params: Parameter grid (mapping of parameter name to candidate values).
        x: Features the search is fitted on.
        y: Labels matching ``x``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Defaults
            to 10, the value this notebook used when it was hard-coded.
        scoring: Scoring setting forwarded to ``GridSearchCV``. ``None``
            (the default) keeps the estimator's own default scorer.

    Returns:
        dict: ``best_params_`` of the fitted search, ready to be unpacked
        into the estimator's constructor.
    """
    search = GridSearchCV(model, params, cv=cv, scoring=scoring)
    search.fit(x, y)
    print('tuned parameters: ', search.best_params_)
    return search.best_params_

def print_metrics(y_true, y_pred):
    """Format accuracy, precision, recall and F1 for a set of predictions.

    Despite the name, nothing is printed: the summary string is returned, so
    it is displayed when the call is the last expression of a cell.

    Args:
        y_true: Ground-truth labels of the evaluated samples.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        str: One-line summary of the four values returned by
        ``calculate_metrics``.
    """
    # Score against the labels passed in by the caller rather than a
    # notebook-level variable, so the result is right for any split and the
    # function does not depend on hidden kernel state.
    acc, pre, rec, f1 = calculate_metrics(np.asarray(y_true), np.asarray(y_pred))
    return f'accuracy: {acc}; precision: {pre}; recall: {rec}; f1_score: {f1}'

In [3]:
# ---- Data location ----
fp_data = "./op_spam_v1.4/negative_polarity/"

# ---- Vocabulary settings ----
# Maximum vocabulary size.
max_features = None
# Range of n-grams to include in the vocabulary.
ngram_range = (1, 1)
# Minimal document frequency a word needs to be included in the vocabulary.
min_df = 0.05
stop_words = set(stopwords.words("english"))

# ---- Split settings ----
# Size of the validation set relative to the training set; None disables it.
val_size = 0.2
# NOTE(review): random_state is not passed to pipeline() in this cell --
# confirm whether the split itself is seeded elsewhere.
random_state = 42

(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = pipeline(
    fp_data, max_features, ngram_range, min_df,
    stop_words=stop_words, val_size=val_size,
)

# Report the size of each split and of the fitted vocabulary.
print(f"X_train: {X_train.shape}; y_train: {len(y_train)}")
print(f"X_val: {X_val.shape}; y_val: {len(y_val)}")
print(f"X_test: {X_test.shape}; y_test: {len(y_test)}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")


X_train: (512, 326); y_train: 512
X_val: (128, 326); y_val: 128
X_test: (160, 326); y_test: 160
Vocabulary size: 326


In [4]:
# def tuning(model, values: list, param: str):
#     model_tune = model
#     tuning = GridSearchCV(model_tune, {param:values}, cv=10)
#     tuning.fit(X_val, y_val)
#     best_param = tuning.best_params_[param]
#     print('tuned parameter ', best_param)
#     return best_param

# def model_creation(model):
    
#     model.fit(X_train,y_train)
#     pred = model.predict(X_test)
#     acc, pre, rec, f1 = calculate_metrics(np.asarray(y_test), np.asarray(pred))
#     print('accuracy: ', acc, 'precision:', pre, 'recall: ', rec,  'f1_score:', f1)
    

In [5]:
# Grid-search the Naive Bayes grid on the training split.
tuned_params = tune_model(MultinomialNB(), bayes_params, X_train, y_train)

print('tuned model')
# Refit with the selected parameters, then score on the test split.
model = MultinomialNB(**tuned_params)
model.fit(X_train, y_train)
print_metrics(y_test, model.predict(X_test))

tuned parameters:  {'alpha': 10}
tuned model


'accuracy: 0.83125; precision: 0.8533333333333334; recall: 0.8; f1_score: 0.8258064516129033'

In [6]:
# Grid-search the logistic regression grid on the training split.
tuned_params = tune_model(
    LogisticRegression(random_state=random_state, solver='liblinear'),
    log_regr_params,
    X_train,
    y_train,
)

print('tuned model')
# Refit with the selected parameters, then score on the test split.
model = LogisticRegression(
    random_state=random_state,
    solver='liblinear',
    **tuned_params,
)
model.fit(X_train, y_train)
print_metrics(y_test, model.predict(X_test))

tuned parameters:  {'C': 0.01, 'penalty': 'l2'}
tuned model


'accuracy: 0.8; precision: 0.8243243243243243; recall: 0.7625; f1_score: 0.7922077922077922'

In [11]:
# Widen the n-gram range and re-run the pipeline; this rebinds the splits and
# the vectorizer that the cells below rely on.
ngram_range = (1, 2)
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    vectorizer,
) = pipeline(
    fp_data, max_features, ngram_range, min_df,
    stop_words=stop_words, val_size=val_size,
)

# Grid-search the Naive Bayes grid on the new training split.
tuned_params = tune_model(MultinomialNB(), bayes_params, X_train, y_train)

print('tuned model')
# Refit with the selected parameters, then score on the test split.
bay_model = MultinomialNB(**tuned_params)
bay_model.fit(X_train, y_train)
print_metrics(y_test, bay_model.predict(X_test))

tuned parameters:  {'alpha': 1}
tuned model


'accuracy: 0.8375; precision: 0.8375; recall: 0.8375; f1_score: 0.8375'

In [12]:
# Grid-search the logistic regression grid on the current training split.
tuned_params = tune_model(
    LogisticRegression(random_state=random_state, solver='liblinear'),
    log_regr_params,
    X_train,
    y_train,
)

print('tuned model')
# Refit with the selected parameters, then score on the test split.
log_model = LogisticRegression(
    random_state=random_state,
    solver='liblinear',
    **tuned_params,
)
log_model.fit(X_train, y_train)
print_metrics(y_test, log_model.predict(X_test))

tuned parameters:  {'C': 0.01, 'penalty': 'l2'}
tuned model


'accuracy: 0.79375; precision: 0.8133333333333334; recall: 0.7625; f1_score: 0.7870967741935484'

## Getting feature importance

In [24]:
# First row of the fitted logistic regression's coefficient matrix, paired
# with the vectorizer's feature names.
coef = log_model.coef_[0]
feature_names = vectorizer.get_feature_names_out()

weighted_features = list(zip(feature_names, coef))


def _weight(pair):
    """Sort key: the coefficient of a (feature, coefficient) pair."""
    return pair[1]


# Five largest and five smallest coefficients.
top_positive = sorted(weighted_features, key=_weight, reverse=True)[:5]
top_negative = sorted(weighted_features, key=_weight, reverse=False)[:5]

# NOTE(review): the labels below assume positive coefficients point to genuine
# reviews and negative ones to fake reviews -- confirm against the label
# encoding produced by pipeline().
print("Genuine review: ", top_positive)
print("\nFake review: ", top_negative)

Genuine review:  [('location', 0.16240093926252808), ('great', 0.15540009295116428), ('bed', 0.15063790151163126), ('elevator', 0.1438176541048017), ('door', 0.12343650334327459)]

Fake review:  [('chicago', -0.2989570972857869), ('seemed', -0.11526175366992326), ('experience', -0.1133431415094073), ('finally', -0.11317218026981682), ('luxury', -0.11218933057781014)]
