# Setup

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing cell.
# Recent NLTK releases load the "punkt_tab" tables in `word_tokenize`, while
# older releases load "punkt"; both are requested so either version works.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("omw-1.4")

In [None]:
from sklearn.ensemble import AdaBoostClassifier as AdaBoost, RandomForestClassifier as RandomForest, GradientBoostingClassifier as XGBoost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Preprocessing

In [None]:
# Load the train and test sets from CSV files in the working directory.
# Later cells read the "text" and "label" columns of df_train and the
# "text" and "Id" columns of df_test.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [None]:
# English stopwords as a set (constant-time membership tests) and a shared
# WordNet lemmatizer instance, both used by preprocess_str.
stopword_list = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_str(input: str) -> str:
    """
    Tokenize, remove stopwords, lemmatize, and then reassemble into one string.

    String type output required for easier ingestion by sklearn TfidfVectorizer

    Args:
        input: Raw text of a single document.

    Returns:
        The surviving, lemmatized tokens joined by single spaces.
    """

    tokens = word_tokenize(input)
    # The stopword list is all lowercase, so compare case-insensitively;
    # otherwise capitalised stopwords such as a sentence-initial "The" are kept.
    tokens = [
        token
        for token in tokens
        if token not in (".", ",") and token.lower() not in stopword_list
    ]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return " ".join(tokens)

In [None]:
# Store the preprocessed version of every training document in a new column.
df_train["processed"] = df_train["text"].apply(preprocess_str)

In [None]:
# Apply the same preprocessing to the test documents.
df_test["processed"] = df_test["text"].apply(preprocess_str)

In [None]:
# Build two independent TF-IDF feature spaces: one from the preprocessed text
# and one from the raw text. Each has its own vectorizer so the vocabulary
# fitted on the preprocessed text is not thrown away by a second fit, and new
# preprocessed documents can still be transformed later.
processed_vectorizer = TfidfVectorizer(strip_accents='ascii')
train_processed_tfidf = processed_vectorizer.fit_transform(df_train["processed"])
test_processed_tfidf = processed_vectorizer.transform(df_test["processed"])

# `vectorizer` is the raw-text model. The test matrices are produced with
# transform() only, so they share the columns learned from the training data.
vectorizer = TfidfVectorizer(strip_accents='ascii')
train_tfidf = vectorizer.fit_transform(df_train["text"])
test_tfidf = vectorizer.transform(df_test["text"])

# Model Training

## SVM

In [None]:
# Support-vector classifier tuned over two sub-grids: linear and RBF kernels
# across a range of C values, and polynomial kernels of degree 2 through 5.
SVM = SVC(random_state=42)
SVM_search = GridSearchCV(
    SVM,
    param_grid=[
        {"kernel": ["linear", "rbf"], "C": [0.1, 0.3, 1, 3, 10]},
        {"kernel": ["poly"], "degree": [2, 3, 4, 5]},
    ],
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the SVM grid search on the TF-IDF features of the raw text.
SVM_search.fit(train_tfidf, df_train["label"])

In [None]:
# Save the raw-text results now: the same search object is refit on the
# preprocessed features in a later cell, which overwrites these attributes.
svm_param = SVM_search.best_params_
svm_score = SVM_search.best_score_
print(svm_param, svm_score)

In [None]:
# Repeat the SVM grid search on the TF-IDF features of the preprocessed text.
SVM_search.fit(train_processed_tfidf, df_train["label"])

In [None]:
# Best parameters and best cross-validated score for the preprocessed text.
svm_processed_param = SVM_search.best_params_
svm_processed_score = SVM_search.best_score_
print(svm_processed_param, svm_processed_score)

## Naive Bayes

In [None]:
# Multinomial Naive Bayes with default settings, scored with 5-fold
# cross-validation on the raw-text and the preprocessed-text features.
NB = MultinomialNB()
NB_score = cross_val_score(NB, train_tfidf, df_train["label"], cv=5, n_jobs=-1)
NB_processed_score = cross_val_score(NB, train_processed_tfidf, df_train["label"], cv=5, n_jobs=-1)

In [None]:
# Report the mean of the five fold scores for each feature set.
print("Original text: ", NB_score.mean())
print("Processed text: ", NB_processed_score.mean())

## Logistic Regression

In [None]:
# Logistic regression that selects its own C from the listed candidates,
# wrapped in an outer 5-fold cross-validation for each feature set.
LR = LogisticRegressionCV(
    Cs=[0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000],
    fit_intercept=False,
    n_jobs=-1,
    random_state=42,
)
LR_score = cross_val_score(
    LR, train_tfidf, df_train["label"], cv=5, n_jobs=-1
)
LR_processed_score = cross_val_score(
    LR, train_processed_tfidf, df_train["label"], cv=5, n_jobs=-1
)

In [None]:
# Report the mean of the five outer-fold scores for each feature set.
print("Original text: ", LR_score.mean())
print("Processed text: ", LR_processed_score.mean())

## AdaBoost

In [None]:
# AdaBoost tuned over the number of estimators (10 to 100 in steps of 10)
# and the learning rate.
Ada = AdaBoost(random_state=42)
Ada_search = GridSearchCV(
    Ada,
    param_grid={
        "n_estimators": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        "learning_rate": [0.1, 0.3, 1, 3, 10],
    },
    n_jobs=-1,
)

In [None]:
# Run the AdaBoost grid search on the raw-text TF-IDF features.
Ada_search.fit(train_tfidf, df_train["label"])

In [None]:
# Save the raw-text results before the search object is refit on the
# preprocessed features.
Ada_param = Ada_search.best_params_
Ada_score = Ada_search.best_score_
print(Ada_param, Ada_score)

In [None]:
# Repeat the AdaBoost grid search on the preprocessed-text TF-IDF features.
Ada_search.fit(train_processed_tfidf, df_train["label"])

In [None]:
# Best parameters and best cross-validated score for the preprocessed text.
Ada_processed_param = Ada_search.best_params_
Ada_processed_score = Ada_search.best_score_
print(Ada_processed_param, Ada_processed_score)

## Random Forest

In [None]:
# Random forest tuned over the number of trees (50 to 300 in steps of 50)
# with 5-fold cross-validation.
RF = RandomForest(n_jobs=-1, random_state=42)
RF_Search = GridSearchCV(
    RF,
    param_grid={"n_estimators": [50, 100, 150, 200, 250, 300]},
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the random-forest grid search on the raw-text TF-IDF features.
RF_Search.fit(train_tfidf, df_train["label"])

In [None]:
# Save the raw-text results before the search object is refit on the
# preprocessed features.
RF_param = RF_Search.best_params_
RF_score = RF_Search.best_score_
print(RF_param, RF_score)

In [None]:
# Repeat the random-forest grid search on the preprocessed-text TF-IDF features.
RF_Search.fit(train_processed_tfidf, df_train["label"])

In [None]:
# Best parameters and best cross-validated score for the preprocessed text.
RF_processed_param = RF_Search.best_params_
RF_processed_score = RF_Search.best_score_
print(RF_processed_param, RF_processed_score)

## Gradient Boosting (scikit-learn `GradientBoostingClassifier`, imported under the alias `XGBoost`)

In [None]:
# Gradient-boosting classifier (the name `XGBoost` is an import alias) tuned
# over learning rate, number of estimators and subsample ratio with 5-fold CV.
XG = XGBoost(random_state=42)
XG_Search = GridSearchCV(
    XG,
    param_grid={
        "learning_rate": [0.01, 0.03, 0.1, 0.3, 1],
        "n_estimators": [50, 100, 150, 200, 250, 300],
        "subsample": [0.5, 1],
    },
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the gradient-boosting grid search on the raw-text TF-IDF features.
XG_Search.fit(train_tfidf, df_train["label"])

In [None]:
# Save the raw-text results before the search object is refit on the
# preprocessed features.
XG_param = XG_Search.best_params_
XG_score = XG_Search.best_score_
print(XG_param, XG_score)

In [None]:
# Repeat the gradient-boosting grid search on the preprocessed-text TF-IDF features.
XG_Search.fit(train_processed_tfidf, df_train["label"])

In [None]:
# Best parameters and best cross-validated score for the preprocessed text.
XG_processed_param = XG_Search.best_params_
XG_processed_score = XG_Search.best_score_
print(XG_processed_param, XG_processed_score)

## KNN

In [None]:
# k-nearest-neighbours classifier tuned over neighbourhood size and vote
# weighting with 5-fold cross-validation.
KNN = KNeighborsClassifier(n_jobs=-1)
KNN_Search = GridSearchCV(
    KNN,
    param_grid={
        "n_neighbors": [2, 4, 6, 8, 10],
        "weights": ["uniform", "distance"],
    },
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the KNN grid search on the raw-text TF-IDF features.
KNN_Search.fit(train_tfidf, df_train["label"])

In [None]:
# Save the raw-text results before the search object is refit on the
# preprocessed features.
KNN_param = KNN_Search.best_params_
KNN_score = KNN_Search.best_score_
print(KNN_param, KNN_score)

In [None]:
# Repeat the KNN grid search on the preprocessed-text TF-IDF features.
KNN_Search.fit(train_processed_tfidf, df_train["label"])

In [None]:
# Best parameters and best cross-validated score for the preprocessed text.
KNN_processed_param = KNN_Search.best_params_
KNN_processed_score = KNN_Search.best_score_
print(KNN_processed_param, KNN_processed_score)

## Hugging Face

In [None]:
# Build a zero-shot-classification pipeline from the
# facebook/bart-large-mnli checkpoint.
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [None]:
def predict(sequence, labels, clf):
    """Score `sequence` against each candidate label with a zero-shot classifier.

    NOTE: a later cell defines a different `predict(clf, embedding)`; once that
    cell has run, this cell must be re-run before calling this version again.

    Args:
        sequence: Text to classify.
        labels: Candidate labels, passed through to `clf` unchanged.
        clf: Callable invoked as `clf(sequence, labels)` that returns a mapping
            with a "scores" entry and, normally, a parallel "labels" entry
            (e.g. a Hugging Face zero-shot-classification pipeline).

    Returns:
        The scores as a list ordered like `labels`. If `labels` is a single
        string, or the output has no "labels" entry, the scores are returned
        in the order the classifier produced them.
    """
    output = clf(sequence, labels)
    scores = output["scores"]
    ranked_labels = output.get("labels")
    if isinstance(labels, str) or ranked_labels is None:
        return scores

    # The pipeline reports its results sorted by descending score, so the raw
    # "scores" list does not line up with `labels`. Re-map it by label name.
    score_by_label = dict(zip(ranked_labels, scores))
    return [score_by_label[label] for label in labels]

In [None]:
# Candidate labels handed to the zero-shot classifier.
labels = ["false", "real"]

# Spot-check one test document (positional row 500).
predict(df_test.iloc[500]["text"], labels, classifier)
# Disabled: scoring every test document and storing the result in an "HF" column.
# df_test["HF"] = df_test["text"].apply(lambda sequence: predict(sequence, labels, classifier))

# Prediction

In [None]:
def predict(clf, embedding):
    """Predict a category for every test row and pair it with the row's Id.

    NOTE: this reuses the name of the zero-shot `predict(sequence, labels, clf)`
    defined in an earlier cell and replaces it once this cell has run.

    Args:
        clf: Fitted estimator exposing `predict(embedding)`.
        embedding: Feature matrix with one row per row of the global `df_test`,
            in the same order.

    Returns:
        A DataFrame with columns "Id" (from `df_test`) and "Category" (the
        predictions), indexed like `df_test`.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in `df_test` (raised by pandas when building the Series).
    """
    ids = df_test["Id"]
    # Give the predictions the same index as the Ids. With a fresh RangeIndex
    # the DataFrame constructor would align on index labels, producing
    # NaN-padded, mismatched rows whenever df_test is not indexed 0..n-1.
    pred = pd.Series(clf.predict(embedding), index=ids.index)

    return pd.DataFrame({'Id': ids, 'Category': pred})

In [None]:
# Fit the final SVM on the raw-text TF-IDF features and predict the test set.
# NOTE(review): C=10 and kernel="rbf" are hard-coded, presumably copied from
# the grid-search result (`svm_param`) — confirm they still match.
SVM_pred = SVC(C=10, kernel="rbf", random_state=42)
SVM_pred.fit(train_tfidf, df_train["label"])

df_SVM = predict(SVM_pred, test_tfidf)

In [None]:
# Write the SVM predictions (Id, Category) to disk without the index column.
df_SVM.to_csv("SVM.csv", index=False)

In [None]:
# Fit the final random forest on the raw-text TF-IDF features, predict the
# test set and write the predictions to disk.
# The seed is 42, the same as the random forest scored in the grid search, so
# the exported model matches the one whose cross-validated score was reported.
# NOTE(review): n_estimators=300 is hard-coded, presumably copied from the
# grid-search result (`RF_param`) — confirm it still matches.
RF_pred = RandomForest(n_estimators=300, n_jobs=-1, random_state=42)
RF_pred.fit(train_tfidf, df_train["label"])
df_RF = predict(RF_pred, test_tfidf)
df_RF.to_csv("RandomForest.csv", index=False)