In [56]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier


df = pd.read_pickle("untracked_data/data_prepro_train_01.pkl")

df_test = pd.read_pickle("untracked_data/data_prepro_test_01.pkl")

X_train = df['text']
y_train = df['label']

X_test = df_test['text']
y_test = df_test['label']




Multinomial Naive Bayes (Unigram)

In [57]:
vectorizer  = CountVectorizer(ngram_range=(1, 1), min_df=5)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = MultinomialNB(alpha=0.2)
clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.86875
Precision: 0.8692692849319356
Recall: 0.86875
F1 Score: 0.8687038411941698


Multinomial Naive Bayes (Bigram)

In [58]:
vectorizer  = CountVectorizer(ngram_range=(1, 2), min_df=2)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = MultinomialNB(alpha=0.05)
clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.825
Precision: 0.8268384663733499
Recall: 0.825
F1 Score: 0.8247535596933186


Logistic Regression (Unigram)

In [59]:
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=1)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(C=1/0.001, solver='liblinear', random_state=42)
clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.78125
Precision: 0.783419933868682
Recall: 0.78125
F1 Score: 0.7808304958710031


Logistic Regression (Bigram)

In [60]:
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(C=1/0.01, solver='liblinear', random_state=42)
clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.83125
Precision: 0.8354961228042411
Recall: 0.83125
F1 Score: 0.8307143696853323


Single classification tree unigram

In [67]:
CountVectorizer(ngram_range=(1, 1), min_df=10)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = DecisionTreeClassifier(

        max_depth=10,

        min_samples_leaf=5,

        ccp_alpha=0.001,

        random_state=42

)
clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.6125
Precision: 0.6125703564727955
Recall: 0.6125
F1 Score: 0.6124394436630723


Single classification tree bigram

In [68]:
CountVectorizer(ngram_range=(1, 2), min_df=2)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = DecisionTreeClassifier(

        max_depth=None,

        min_samples_leaf=5,

        ccp_alpha=0.0,

        random_state=42

)

clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.6125
Precision: 0.6125703564727955
Recall: 0.6125
F1 Score: 0.6124394436630723


Random forest unigram

In [69]:
CountVectorizer(ngram_range=(1, 1), min_df=5)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = RandomForestClassifier(

        n_estimators=500,

        max_features='log2',

        max_depth=20,

        random_state=42,

        n_jobs=-1

)


clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.79375
Precision: 0.7960163753739569
Recall: 0.79375
F1 Score: 0.7933544675355172


Random forest bigram

In [70]:
CountVectorizer(ngram_range=(1, 2), min_df=5)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = RandomForestClassifier(

        n_estimators=500,

        max_features='log2',

        max_depth=None,

        random_state=42,

        n_jobs=-1

)



clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.81875
Precision: 0.82
Recall: 0.81875
F1 Score: 0.818572825024438


Gradient boosting unigram

In [65]:
CountVectorizer(ngram_range=(1, 1), min_df=10)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = GradientBoostingClassifier(

        learning_rate=0.2,

        n_estimators=300,

        max_depth=5,

        random_state=42

)


clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.79375
Precision: 0.8017172203498635
Recall: 0.79375
F1 Score: 0.7923793794974637


Gradient boosting (bigram)

In [66]:
CountVectorizer(ngram_range=(1, 2), min_df=10)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = GradientBoostingClassifier(

        learning_rate=0.2,

        n_estimators=300,

        max_depth=2,

        random_state=42

)

clf.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.8
Precision: 0.8030303030303031
Recall: 0.8
F1 Score: 0.7994987468671679
