# The Task

## Project Overview

In the file dataset/data.csv, you will find a dataset containing news articles with the following columns:

label: 0 if the news is fake, 1 if the news is real.
title: The headline of the news article.
text: The full content of the article.
subject: The category or topic of the news.
date: The publication date of the article.
Your goal is to build a classifier that is able to distinguish between the two.

Once you have a classifier built, then use it to predict the labels for dataset/validation_data.csv. Generate a new file where the label 2 has been replaced by 0 (fake) or 1 (real) according to your model. Please respect the original file format, do not include extra columns, and respect the column separator.

Please ensure to split the data.csv into training and test datasets before using it for model training or evaluation.

Guidance
Like in a real life scenario, you are able to make your own choices and text treatment. Use the techniques you have learned and the common packages to process this data and classify the text.

Deliverables
Python Code: Provide well-documented Python code that conducts the analysis.
Predictions: A csv file in the same format as validation_data.csv but with the predicted labels (0 or 1)
Accuracy estimation: Provide the teacher with your estimation of how your model will perform.
Presentation: You will present your model in a 10-minute presentation. Your teacher will provide further instructions.

# Import

In [None]:
import re
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sb
from nltk import pos_tag
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from nltk.stem.snowball import SnowballStemmer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, auc, roc_curve

# Loading the Data

In [None]:
# Load the tab-separated training file; it is read without a header row, so
# the two columns are named explicitly.
data = pd.read_csv(
    '../dataset/training_data_lowercase.csv',
    names=['labels', 'text'],
    sep='\t',
)

In [None]:
# Display the loaded frame (notebook renders the last expression of a cell).
data

# Understanding the Data

In [None]:
# Column dtypes, non-null counts and memory usage of the raw frame.
data.info()

In [None]:
# Distinct label values followed by summary statistics of the label column.
print('Its unique values are ',data['labels'].unique())
# describe() is printed once; the former nested print() also emitted "None".
print(data['labels'].describe())

In [None]:
# Class balance check: histogram of the label column.
plt.hist(data['labels'], color='red')
plt.show()

In [None]:
# First rows of the raw frame.
data.head()

In [None]:
# Last rows of the raw frame.
data.tail()

# Preprocessing the Data

## Basic cleaning 

In [None]:
# Ordered (pattern, replacement) rewrite rules. Order matters: every rule runs
# on the output of the previous one.
_CLEANING_RULES = (
    (r"(?is)<script.*?>.*?</script>", " "),  # inline JavaScript
    (r"(?is)<style.*?>.*?</style>", " "),    # inline CSS
    (r"(?s)<!--.*?-->", " "),                # HTML comments
    (r"(?s)<[^>]+>", " "),                   # any remaining tag
    (r"^\s*b[\"'](.+?)[\"']\s*$", r"\1"),    # unwrap a b'...' / b"..." wrapper
    (r"\s*[\[\(][^\]\)]+[\]\)]\s*$", ""),    # trailing [...] or (...) group
    (r"\b\\t", " "),                         # literal backslash-t after a word character
    (r"^\\t", " "),                          # literal backslash-t at the very start
    (r"[^A-Za-z\s]", " "),                   # everything except letters and whitespace
    (r"\b[A-Za-z]\b", " "),                  # stand-alone single letters
    (r"^[A-Za-z]\s+", " "),                  # single letter at the very start
)


def clean_html_text(text: str) -> str:
    """Return *text* stripped of markup and noise, as lowercase letters and spaces.

    The value is converted with ``str()`` and passed through the rewrite rules
    in ``_CLEANING_RULES`` in order; whitespace runs are then collapsed to one
    space, the ends are trimmed and the result is lowercased.

    Args:
        text: Raw headline/article text; ``None`` is accepted.

    Returns:
        The cleaned string, or ``''`` when *text* is ``None``.
    """
    if text is None:
        return ''
    cleaned = str(text)
    for pattern, replacement in _CLEANING_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Collapse whitespace last so the blanks inserted above do not pile up.
    return re.sub(r"\s+", " ", cleaned).strip().lower()

# Regex character class matching any single ASCII punctuation character.
punct_pattern = "[" + re.escape(string.punctuation) + "]"

In [None]:
# Cleaning pipeline: strip markup/noise, drop any leftover punctuation, then
# split every document into NLTK word tokens.
data['pre_text'] = (
    data['text'].astype(str)
    .apply(clean_html_text)
    .astype(str)
    .apply(lambda doc: re.sub(punct_pattern, "", doc))
    .astype(str)
    .apply(word_tokenize)
)
data.head()

## Removing stop words

In [None]:
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def _without_stop_words(tokens):
    """Return the tokens of one document that are not English stop words."""
    return [tok for tok in tokens if tok not in stop_words]


data['pre_text'] = data['pre_text'].apply(_without_stop_words)

In [None]:
# Count how often each token occurs across all documents in 'pre_text'.
# The former `bag_of_words == 0` branch compared a dict with an int and could
# never run; dict.get() covers first-seen tokens directly.
bag_of_words = {}

for lista in data['pre_text']:
    for word in lista:
        bag_of_words[word] = bag_of_words.get(word, 0) + 1

# Show the 100 most frequent tokens to spot noise words.
print(sorted(bag_of_words.items(), key=lambda x: -x[1])[:100])

In [None]:
# Hand-picked noise tokens to drop from the cleaned text.
words_to_filter = [
    'video',
    'says',
    'tweets',
    'tells',
    'screenshots',
    'details',
    'fck',
    'btch',
    'images',
    'cck',
    'image',
    'videos',
    'ahole',
]

In [None]:
# Drop the hand-picked noise words. Membership is tested against a set built
# once instead of scanning the list again for every token.
_noise_words = set(words_to_filter)
data['pre_text_filter'] = data['pre_text'].apply(
    lambda tokens: [word for word in tokens if word not in _noise_words]
)

In [None]:
# Recount token frequencies after noise-word removal ('pre_text_filter').
# The former `bag_of_words == 0` branch compared a dict with an int and could
# never run; dict.get() covers first-seen tokens directly.
bag_of_words = {}

for lista in data['pre_text_filter']:
    for word in lista:
        bag_of_words[word] = bag_of_words.get(word, 0) + 1

# Show the 100 most frequent remaining tokens.
print(sorted(bag_of_words.items(), key=lambda x: -x[1])[:100])

### Using Stemmer

#### Snowball

In [None]:
# English Snowball stemmer instance, reused for every token.
snowball = SnowballStemmer('english')

In [None]:
# Snowball-stem every token of each document in 'pre_text'.
data['snow_text'] = data['pre_text'].apply(
    lambda toks: list(map(snowball.stem, toks))
)

#### Porter

In [None]:
# Porter stemmer instance, reused for every token.
porter = PorterStemmer()

In [None]:
# Porter-stem every token of each document in 'pre_text'.
data['porter_text'] = data['pre_text'].apply(
    lambda toks: list(map(porter.stem, toks))
)

### Using Lemmatizer

In [None]:
# WordNet lemmatizer instance, reused for every token.
lemm = WordNetLemmatizer()

In [None]:
# Lemmatize every token of each document in 'pre_text' (no POS tag is passed,
# so the lemmatizer's default part of speech applies).
data['lemm_text'] = data['pre_text'].apply(
    lambda toks: list(map(lemm.lemmatize, toks))
)

In [None]:
# Re-inspect the frame now that the derived token columns have been added.
data.info()

# Splitting the data into Training and Test

In [None]:
# Features: every column from position 2 onward, i.e. everything after the
# two columns named at load time ('labels', 'text').
X = data.iloc[:,2:]

In [None]:
# Target: the first column, named 'labels' at load time.
y = data.iloc[:,0]

In [None]:
# Sanity check: X and y should have the same number of rows.
print(X.shape, y.shape)

## Using only the preprocessed text

In [None]:
# 80/20 hold-out split of the cleaned tokens, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X['pre_text'],
    y,
    random_state=42,
    test_size=0.2,
)

## Using the preprocessed text + snow stemmer

In [None]:
# 80/20 hold-out split of the Snowball-stemmed tokens, with a fixed seed.
X_train_snow, X_test_snow, y_train_snow, y_test_snow = train_test_split(
    X['snow_text'],
    y,
    random_state=42,
    test_size=0.2,
)

## Using the preprocessed text + porter stemmer

In [None]:
# 80/20 hold-out split of the Porter-stemmed tokens, with a fixed seed.
X_train_porter, X_test_porter, y_train_porter, y_test_porter = train_test_split(
    X['porter_text'],
    y,
    random_state=42,
    test_size=0.2,
)

## Using the preprocessed text + noise removal

In [None]:
# 80/20 hold-out split of the noise-filtered tokens, with a fixed seed.
X_train_filt, X_test_filt, y_train_filt, y_test_filt = train_test_split(
    X['pre_text_filter'],
    y,
    random_state=42,
    test_size=0.2,
)

## Using the preprocessed text + lemmatizer

In [None]:
# 80/20 hold-out split of the lemmatized tokens, with a fixed seed.
X_train_lemm, X_test_lemm, y_train_lemm, y_test_lemm = train_test_split(
    X['lemm_text'],
    y,
    random_state=42,
    test_size=0.2,
)

# Training some classifiers

### Only preprocessed text - Best (TF-IDF Passive Aggressive Classifier - Acc: 91.27 %, Gini: 94.29 %)

#### TF-IDF - Best (Passive Aggressive Classifier - Acc: 91.27 %, Gini: 94.29 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

##### Decision Tree metrics

In [None]:
# Decision tree on TF-IDF features (cleaned text); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_tfidf, y_train)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on TF-IDF features (cleaned text); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_tfidf, y_train)
y_hat = log_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = log_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on TF-IDF features (cleaned text); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_tfidf, y_train)
y_hat = nb_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = nb_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on TF-IDF features (cleaned text); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_tfidf, y_train)
y_hat = rf_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = rf_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on TF-IDF features (cleaned text);
# decision_function margins serve as the ROC score for the Gini coefficient.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_tfidf, y_train)
y_hat = pac.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

y_proba = pac.decision_function(X_test_tfidf)
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2 * auc(fpr, tpr) - 1)

#### BoW - Best (Naive Bayes - Acc: 93.01 %, Gini: 85.98 % )

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = CountVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

##### Decision Tree metrics

In [None]:
# Decision tree on bag-of-words counts (cleaned text); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_bow, y_train)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on bag-of-words counts (cleaned text); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_bow, y_train)
y_hat = log_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = log_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on bag-of-words counts (cleaned text); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_bow, y_train)
y_hat = nb_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = nb_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on bag-of-words counts (cleaned text); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_bow, y_train)
y_hat = rf_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = rf_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on bag-of-words counts (cleaned text).
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1).fit(X_bow, y_train)
# Predict on the BoW test matrix: the model is fitted on BoW counts, and the
# TF-IDF test matrix used previously fed it differently scaled features.
y_hat = pac.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

# decision_function margins serve as the ROC score for the Gini coefficient.
y_proba = pac.decision_function(X_test_bow)
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

### Preprocessed text + noise removal - Best (TF-IDF Passive Aggressive Classifier - Acc: 91.21 %, Gini: 93.88 %)

#### TF-IDF - Best (Passive Aggressive Classifier - Acc: 91.21 %, Gini: 93.88 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_tfidf = vectorizer.fit_transform(X_train_filt)
X_test_tfidf = vectorizer.transform(X_test_filt)

##### Decision Tree metrics

In [None]:
# Decision tree on TF-IDF features (noise-filtered text); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_tfidf, y_train_filt)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on TF-IDF features (noise-filtered text); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_tfidf, y_train_filt)
y_hat = log_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = log_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on TF-IDF features (noise-filtered text); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_tfidf, y_train_filt)
y_hat = nb_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = nb_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on TF-IDF features (noise-filtered text); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_tfidf, y_train_filt)
y_hat = rf_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = rf_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on TF-IDF features (noise-filtered text);
# decision_function margins serve as the ROC score for the Gini coefficient.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_tfidf, y_train_filt)
y_hat = pac.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

y_proba = pac.decision_function(X_test_tfidf)
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2 * auc(fpr, tpr) - 1)

#### BoW - Best (Passive Aggressive Classifier - Acc: 90.95 %, Gini: 93.68 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = CountVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_bow = vectorizer.fit_transform(X_train_filt)
X_test_bow = vectorizer.transform(X_test_filt)

##### Decision Tree metrics

In [None]:
# Decision tree on bag-of-words counts (noise-filtered text); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_bow, y_train_filt)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on bag-of-words counts (noise-filtered text); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_bow, y_train_filt)
y_hat = log_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = log_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on bag-of-words counts (noise-filtered text); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_bow, y_train_filt)
y_hat = nb_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = nb_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on bag-of-words counts (noise-filtered text); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_bow, y_train_filt)
y_hat = rf_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = rf_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on bag-of-words counts (noise-filtered text);
# decision_function margins serve as the ROC score for the Gini coefficient.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_bow, y_train_filt)
y_hat = pac.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

y_proba = pac.decision_function(X_test_bow)
fpr, tpr, thresholds = roc_curve(y_test_filt, y_proba)
print('Gini coef.:', 2 * auc(fpr, tpr) - 1)

### Preprocessed text + snow stemmer - Best (TF-IDF Passive Aggressive Classifier - Acc: 90.54 %, Gini: 93.15 %)

#### TF-IDF - Best (Passive Aggressive Classifier - Acc: 90.54 %, Gini: 93.15 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_tfidf = vectorizer.fit_transform(X_train_snow)
X_test_tfidf = vectorizer.transform(X_test_snow)

##### Decision Tree metrics

In [None]:
# Decision tree on TF-IDF features (Snowball stems); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_tfidf, y_train_snow)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on TF-IDF features (Snowball stems); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_tfidf, y_train_snow)
y_hat = log_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = log_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on TF-IDF features (Snowball stems); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_tfidf, y_train_snow)
y_hat = nb_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = nb_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on TF-IDF features (Snowball stems); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_tfidf, y_train_snow)
y_hat = rf_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = rf_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on TF-IDF features (Snowball stems);
# decision_function margins serve as the ROC score for the Gini coefficient.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_tfidf, y_train_snow)
y_hat = pac.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

y_proba = pac.decision_function(X_test_tfidf)
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2 * auc(fpr, tpr) - 1)

#### BoW - Best (Passive Aggressive Classifier - Acc: 90.26 %, Gini: 93.18 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = CountVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_bow = vectorizer.fit_transform(X_train_snow)
X_test_bow = vectorizer.transform(X_test_snow)

##### Decision Tree metrics

In [None]:
# Decision tree on bag-of-words counts (Snowball stems); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_bow, y_train_snow)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on bag-of-words counts (Snowball stems); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_bow, y_train_snow)
y_hat = log_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = log_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on bag-of-words counts (Snowball stems); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_bow, y_train_snow)
y_hat = nb_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = nb_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on bag-of-words counts (Snowball stems); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_bow, y_train_snow)
y_hat = rf_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

# Score the ROC with the random forest's own P(label == 1). This cell
# previously scored with nb_classifier (copy-paste slip) and with argmax'd
# hard labels, so the printed Gini did not describe this model.
# Assumes labels are {0, 1}, so column 1 is the positive class.
y_proba = rf_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on bag-of-words counts (Snowball stems);
# decision_function margins serve as the ROC score for the Gini coefficient.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_bow, y_train_snow)
y_hat = pac.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

y_proba = pac.decision_function(X_test_bow)
fpr, tpr, thresholds = roc_curve(y_test_snow, y_proba)
print('Gini coef.:', 2 * auc(fpr, tpr) - 1)

### Preprocessed text + porter stemmer - Best (TF-IDF Passive Aggressive Classifier - Acc: 91.24 %, Gini: 93.65 %)

#### TF-IDF - Best (Passive Aggressive Classifier - Acc: 91.24 %, Gini: 93.65 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_tfidf = vectorizer.fit_transform(X_train_porter)
X_test_tfidf = vectorizer.transform(X_test_porter)

##### Decision Tree metrics

In [None]:
# Decision tree on TF-IDF features (Porter stems); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_tfidf, y_train_porter)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on TF-IDF features (Porter stems); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_tfidf, y_train_porter)
y_hat = log_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = log_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on TF-IDF features (Porter stems); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_tfidf, y_train_porter)
y_hat = nb_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = nb_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on TF-IDF features (Porter stems); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_tfidf, y_train_porter)
y_hat = rf_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = rf_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on TF-IDF features (Porter stems);
# decision_function margins serve as the ROC score for the Gini coefficient.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_tfidf, y_train_porter)
y_hat = pac.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

y_proba = pac.decision_function(X_test_tfidf)
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2 * auc(fpr, tpr) - 1)

#### BoW - Best (Passive Aggressive Classifier - Acc: 90.48 %, Gini: 93.17 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = CountVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_bow = vectorizer.fit_transform(X_train_porter)
X_test_bow = vectorizer.transform(X_test_porter)

##### Decision Tree metrics

In [None]:
# Decision tree on bag-of-words counts (Porter stems); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_bow, y_train_porter)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label
# (assumes labels are {0, 1}), and compare against this section's own
# y_test_porter rather than the Snowball section's y_test_snow.
y_proba = dt_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic regression on bag-of-words counts (Porter stems); prints hold-out metrics.
log_classifier = LogisticRegression(random_state=42).fit(X_bow, y_train_porter)
y_hat = log_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label
# (assumes labels are {0, 1}), and compare against this section's own
# y_test_porter rather than the Snowball section's y_test_snow.
y_proba = log_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial naive Bayes on bag-of-words counts (Porter stems); prints hold-out metrics.
nb_classifier = MultinomialNB().fit(X_bow, y_train_porter)
y_hat = nb_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label
# (assumes labels are {0, 1}), and compare against this section's own
# y_test_porter rather than the Snowball section's y_test_snow.
y_proba = nb_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random forest on bag-of-words counts (Porter stems); prints hold-out metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_bow, y_train_porter)
y_hat = rf_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label
# (assumes labels are {0, 1}), and compare against this section's own
# y_test_porter rather than the Snowball section's y_test_snow.
y_proba = rf_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive classifier on bag-of-words counts (Porter stems);
# decision_function margins serve as the ROC score for the Gini coefficient.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_bow, y_train_porter)
y_hat = pac.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

y_proba = pac.decision_function(X_test_bow)
fpr, tpr, thresholds = roc_curve(y_test_porter, y_proba)
print('Gini coef.:', 2 * auc(fpr, tpr) - 1)

### Preprocessed text + lemmatizer - Best (Passive Aggressive Classifier - Acc: 91.56 %, Gini: 94.03 %)

#### TF-IDF - Best (Passive Aggressive Classifier - Acc: 91.56 %, Gini: 94.03 %)

In [None]:
# Documents are already token lists, so tokenizer and preprocessor are
# identity functions and lowercasing / regex tokenisation are switched off.
vectorizer = TfidfVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=lambda doc: doc,
    tokenizer=lambda doc: doc,
)

# Fit on the training split only; the test split reuses the fitted vocabulary.
X_tfidf = vectorizer.fit_transform(X_train_lemm)
X_test_tfidf = vectorizer.transform(X_test_lemm)

##### Decision Tree metrics

In [None]:
# Decision tree on TF-IDF features (lemmatized text); prints hold-out metrics.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_tfidf, y_train_lemm)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# Score the ROC with P(label == 1) instead of the argmax'd hard label, which
# collapsed the curve to one operating point (assumes labels are {0, 1}).
y_proba = dt_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic Regression on the lemmatized TF-IDF features.
log_classifier = LogisticRegression(random_state=42).fit(X_tfidf, y_train_lemm)
y_hat = log_classifier.predict(X_test_tfidf)

# Hard-label metrics on the held-out split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# ROC/Gini needs a continuous score, not the predicted label: argmax over
# predict_proba collapses the curve to a single operating point and
# understates the Gini. Column 1 is the probability of classes_[1], i.e.
# label 1 ("real") for 0/1 labels.
y_proba = log_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial Naive Bayes on the lemmatized TF-IDF features.
nb_classifier = MultinomialNB().fit(X_tfidf, y_train_lemm)
y_hat = nb_classifier.predict(X_test_tfidf)

# Hard-label metrics on the held-out split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# ROC/Gini needs a continuous score, not the predicted label: argmax over
# predict_proba collapses the curve to a single operating point and
# understates the Gini. Column 1 is the probability of classes_[1], i.e.
# label 1 ("real") for 0/1 labels.
y_proba = nb_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random Forest on the lemmatized TF-IDF features.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_tfidf, y_train_lemm)
y_hat = rf_classifier.predict(X_test_tfidf)

# Hard-label metrics on the held-out split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# ROC/Gini needs a continuous score, not the predicted label: argmax over
# predict_proba collapses the curve to a single operating point and
# understates the Gini. Column 1 is the probability of classes_[1], i.e.
# label 1 ("real") for 0/1 labels.
y_proba = rf_classifier.predict_proba(X_test_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive linear classifier on the lemmatized TF-IDF features.
# Construction and fitting are split so the hyper-parameters stand on their own.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_tfidf, y_train_lemm)
y_hat = pac.predict(X_test_tfidf)

# Hard-label metrics on the held-out split.
pac_accuracy = accuracy_score(y_test_lemm, y_hat)
pac_report = classification_report(y_test_lemm, y_hat)
print("Accuracy:", pac_accuracy)
print("Classification Report:\n", pac_report)
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# The model exposes no predict_proba; decision_function yields the continuous
# scores that roc_curve ranks to build the curve.
y_proba = pac.decision_function(X_test_tfidf)
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
pac_auc = auc(fpr, tpr)
print('Gini coef.:', 2 * pac_auc - 1)

#### BoW - Best (Passive Aggressive Classifier - Acc: 91.36 %, Gini: 93.7 %)

In [None]:
def _identity(doc):
    """Return *doc* unchanged so the vectorizer consumes it as-is."""
    return doc


# Bag-of-words counts for the lemmatized split. Both the preprocessor and the
# tokenizer are pass-throughs and lowercasing is off, so each document reaches
# the vectorizer exactly as stored (presumably already a list of tokens --
# confirm against the preprocessing cells). token_pattern=None because a
# custom tokenizer is supplied.
vectorizer = CountVectorizer(
    lowercase=False,
    token_pattern=None,
    preprocessor=_identity,
    tokenizer=_identity,
)

# Learn the vocabulary on the training split only, then reuse it to
# transform the test split.
X_bow = vectorizer.fit_transform(X_train_lemm)
X_test_bow = vectorizer.transform(X_test_lemm)

##### Decision Tree metrics

In [None]:
# Decision Tree on the lemmatized bag-of-words features.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_bow, y_train_lemm)
y_hat = dt_classifier.predict(X_test_bow)

# Hard-label metrics on the held-out split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# ROC/Gini needs a continuous score, not the predicted label: argmax over
# predict_proba collapses the curve to a single operating point. Column 1 is
# the probability of classes_[1], i.e. label 1 ("real") for 0/1 labels.
y_proba = dt_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Logistic Regression metrics

In [None]:
# Logistic Regression on the lemmatized bag-of-words features.
log_classifier = LogisticRegression(random_state=42).fit(X_bow, y_train_lemm)
y_hat = log_classifier.predict(X_test_bow)

# Hard-label metrics on the held-out split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# ROC/Gini needs a continuous score, not the predicted label: argmax over
# predict_proba collapses the curve to a single operating point and
# understates the Gini. Column 1 is the probability of classes_[1], i.e.
# label 1 ("real") for 0/1 labels.
y_proba = log_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Naive Bayes metrics

In [None]:
# Multinomial Naive Bayes on the lemmatized bag-of-words features.
nb_classifier = MultinomialNB().fit(X_bow, y_train_lemm)
y_hat = nb_classifier.predict(X_test_bow)

# Hard-label metrics on the held-out split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# ROC/Gini needs a continuous score, not the predicted label: argmax over
# predict_proba collapses the curve to a single operating point and
# understates the Gini. Column 1 is the probability of classes_[1], i.e.
# label 1 ("real") for 0/1 labels.
y_proba = nb_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### Random Forest metrics

In [None]:
# Random Forest on the lemmatized bag-of-words features.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_bow, y_train_lemm)
y_hat = rf_classifier.predict(X_test_bow)

# Hard-label metrics on the held-out split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# ROC/Gini needs a continuous score, not the predicted label: argmax over
# predict_proba collapses the curve to a single operating point and
# understates the Gini. Column 1 is the probability of classes_[1], i.e.
# label 1 ("real") for 0/1 labels.
y_proba = rf_classifier.predict_proba(X_test_bow)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
print('Gini coef.:', 2*(auc(fpr, tpr))-1)

##### PassiveAggressiveClassifier

In [None]:
# Passive-Aggressive linear classifier on the lemmatized bag-of-words features.
# Construction and fitting are split so the hyper-parameters stand on their own.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42, n_jobs=-1)
pac.fit(X_bow, y_train_lemm)
y_hat = pac.predict(X_test_bow)

# Hard-label metrics on the held-out split.
pac_accuracy = accuracy_score(y_test_lemm, y_hat)
pac_report = classification_report(y_test_lemm, y_hat)
print("Accuracy:", pac_accuracy)
print("Classification Report:\n", pac_report)
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

# The model exposes no predict_proba; decision_function yields the continuous
# scores that roc_curve ranks to build the curve.
y_proba = pac.decision_function(X_test_bow)
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_proba)
pac_auc = auc(fpr, tpr)
print('Gini coef.:', 2 * pac_auc - 1)

## Embeddings

In [None]:
# NOTE(review): %paste is an IPython magic that runs whatever is currently on
# the clipboard, so this cell is not reproducible. It looks like a leftover
# from interactive work -- confirm and remove.
%paste

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras import regularizers
from sklearn.metrics import roc_curve, auc
import numpy as np

# Bidirectional-LSTM classifier over learned word embeddings, trained on the
# lemmatized split and scored with accuracy and the Gini coefficient.

# - DATA PREP

MAX_VOCAB = 20000  # tokenizer keeps only token indices below this value
MAX_LEN = 300      # every document is padded/truncated to this many tokens

# Create and train the Tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(X_train_lemm)  # learn the vocabulary from training text only

# Convert text to integer sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_lemm)
X_test_seq = tokenizer.texts_to_sequences(X_test_lemm)

# Padding - all sequences get the same length; long documents keep their first
# MAX_LEN tokens and short ones are zero-padded at the end.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Effective vocab size for the Embedding layer. word_index lists every token
# seen during fitting, but texts_to_sequences only emits indices below
# num_words, so rows beyond MAX_VOCAB would never be looked up.
# The +1 accounts for index 0, which is reserved for padding.
vocab_size = min(MAX_VOCAB, len(tokenizer.word_index) + 1)

print(f"Data ready. Train shape: {X_train_pad.shape}")
print(f"Vocab size: {vocab_size}")

# - MODEL DEFINITION

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=MAX_LEN),
    # The keyword is `return_sequences` (plural); the previous spelling
    # `return_sequence` is not an LSTM argument and fails at construction.
    # False -> only the final state of each direction is passed on.
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.2),
    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    Dense(1, activation='sigmoid')  # single probability for label 1
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',  # binary target (0 = fake, 1 = real)
              metrics=['accuracy']
)

model.summary()

# - TRAIN

# NOTE: the test split doubles as validation data here, so the "validation"
# figures reported below are test-set figures. No early stopping or model
# selection in this cell uses them.
history = model.fit(
    X_train_pad, y_train_lemm,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_pad, y_test_lemm))

# Predict: the sigmoid output is flattened to a 1-D vector of scores
y_pred_keras = model.predict(X_test_pad).ravel()

# Gini
fpr, tpr, thresholds = roc_curve(y_test_lemm, y_pred_keras)
roc_auc = auc(fpr, tpr)
gini = 2 * roc_auc - 1

print(f"ROC AUC: {roc_auc:.4f}")
print(f"Gini Coefficient: {gini:.4f}")

# Metrics recorded after the last training epoch
acc = history.history['val_accuracy'][-1]
loss = history.history['val_loss'][-1]

print("Validation Accuracy:", acc)
print("Validation Loss:", loss)