## Import the data

In [342]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [343]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("training_data_lowercase.csv", header=None, names=['label', 'title'] ,sep="	")

data.head()

Unnamed: 0,label,title
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [344]:
from sklearn.model_selection import train_test_split

X = data["title"]  # Features (input)
y = data["label"]  # Labels (output)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X.head(),"\n \n ")
print(y.head())

0    donald trump sends out embarrassing new year‚s...
1    drunk bragging trump staffer started russian c...
2    sheriff david clarke becomes an internet joke ...
3    trump is so obsessed he even has obama‚s name ...
4    pope francis just called out donald trump duri...
Name: title, dtype: object 
 
 
0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64


In [345]:
print(X)

0        donald trump sends out embarrassing new year‚s...
1        drunk bragging trump staffer started russian c...
2        sheriff david clarke becomes an internet joke ...
3        trump is so obsessed he even has obama‚s name ...
4        pope francis just called out donald trump duri...
                               ...                        
34147    tears in rain as thais gather for late king's ...
34148    pyongyang university needs non-u.s. teachers a...
34149    philippine president duterte to visit japan ah...
34150    japan's abe may have won election\tbut many do...
34151    demoralized and divided: inside catalonia's po...
Name: title, Length: 34152, dtype: object


### Clean titles

In [346]:
# Clean artefacts

import re

def clean_artefacts(X):

    cleaned_text =[]

    for sentence in X:

        # Turn into strings
        sentence = str(sentence)

        # Remove JS/CSS
        sentence = re.sub(r'<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>|<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>', '', sentence)

        # Remove HTML comments
        sentence = re.sub(r'<!--.*?-->', '', sentence)

        # Remove HTML tags
        sentence = re.sub(r'<[^>]+>', '', sentence)

        # Turn into a string
        sentence = str(sentence)

        # Remove all special characters
        sentence = re.sub(r'[^a-zA-Z\s]', ' ', sentence)  # Keep only letters and spaces
        
        # Remove numbers
        sentence = re.sub(r'\d+', ' ', sentence)
        
        # Remove all single characters
        sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)  # Remove isolated single letters
        
        # Remove single characters from the start
        sentence = re.sub(r'^[a-zA-Z]\s+', ' ', sentence)

        # Remove prefixed 'b' (commonly from bytes conversion)
        sentence = re.sub(r'\bb\s+', ' ', sentence)
        
        # Substitute multiple spaces with a single space
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        
        # Convert to lowercase
        sentence = sentence.lower()
        
        # Append the cleaned sentence to the list
        cleaned_text.append(sentence)
    

    return cleaned_text

X_clean=clean_artefacts(X)

X_clean

['donald trump sends out embarrassing new year eve message this is disturbing',
 'drunk bragging trump staffer started russian collusion investigation',
 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye',
 'trump is so obsessed he even has obama name coded into his website images',
 'pope francis just called out donald trump during his christmas speech',
 'racist alabama cops brutalize black boy while he is in handcuffs graphic images',
 'fresh off the golf course',
 'trump said some insanely racist stuff inside the oval office',
 'former cia director slams trump over un bullying',
 'brand new pro trump ad features so much kissing it will make you sick',
 'papa john founder retires',
 'paul ryan just told us he doesn care about struggling families living in blue states',
 'bad news for trump mitch mcconnell says no to repealing obamacare in',
 'lindsey graham trashes media for portraying trump as kooky forgets his own words',
 'heiress to disney 

### Remove stopwords

In [347]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')


print(type(X_clean))  # Should be a list of strings
print(type(stopwords))  # Should be a list or set

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']
<class 'list'>
<class 'nltk.corpus.reader.wordlist.WordListCorpusReader'>


In [348]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words("english"))


from nltk.tokenize import word_tokenize

def remove_stopwords(text, stopwords):
    cleaned_text = []  # To store cleaned sentences
    
    for sentence in text:
        # Tokenize the sentence into words
        sentence = word_tokenize(sentence)
        
        # Remove stopwords using set operations (retaining original structure)
        filtered_sentence = [word for word in sentence if word.lower() not in stopwords]
        
        # Join the filtered words back into a sentence
        cleaned_sentence = ' '.join(filtered_sentence)
        
        # Append the cleaned sentence to the result list
        cleaned_text.append(cleaned_sentence)
    
    return cleaned_text

X_nostop = remove_stopwords(X_clean, stopwords)

#keep stop words
X_nostop = X_clean

X_nostop

['donald trump sends out embarrassing new year eve message this is disturbing',
 'drunk bragging trump staffer started russian collusion investigation',
 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye',
 'trump is so obsessed he even has obama name coded into his website images',
 'pope francis just called out donald trump during his christmas speech',
 'racist alabama cops brutalize black boy while he is in handcuffs graphic images',
 'fresh off the golf course',
 'trump said some insanely racist stuff inside the oval office',
 'former cia director slams trump over un bullying',
 'brand new pro trump ad features so much kissing it will make you sick',
 'papa john founder retires',
 'paul ryan just told us he doesn care about struggling families living in blue states',
 'bad news for trump mitch mcconnell says no to repealing obamacare in',
 'lindsey graham trashes media for portraying trump as kooky forgets his own words',
 'heiress to disney 

### Word split

In [367]:
def split_sentences_to_words(sentences_list):
    """
    Splits a list of sentences into a list of lists of words.

    Parameters:
        sentences_list (list of str): A list of sentences.

    Returns:
        list of list of str: A list where each sentence is represented as a list of words.
    """
    x_split = []
    for sentence in sentences_list:
        sentence_split = sentence.split()
        x_split.append(sentence_split)
    return x_split

# Example usage:
x_split = split_sentences_to_words(X_nostop)
print(x_split[:10])

[['donald', 'trump', 'sends', 'out', 'embarrassing', 'new', 'year', 'eve', 'message', 'this', 'is', 'disturbing'], ['drunk', 'bragging', 'trump', 'staffer', 'started', 'russian', 'collusion', 'investigation'], ['sheriff', 'david', 'clarke', 'becomes', 'an', 'internet', 'joke', 'for', 'threatening', 'to', 'poke', 'people', 'in', 'the', 'eye'], ['trump', 'is', 'so', 'obsessed', 'he', 'even', 'has', 'obama', 'name', 'coded', 'into', 'his', 'website', 'images'], ['pope', 'francis', 'just', 'called', 'out', 'donald', 'trump', 'during', 'his', 'christmas', 'speech'], ['racist', 'alabama', 'cops', 'brutalize', 'black', 'boy', 'while', 'he', 'is', 'in', 'handcuffs', 'graphic', 'images'], ['fresh', 'off', 'the', 'golf', 'course'], ['trump', 'said', 'some', 'insanely', 'racist', 'stuff', 'inside', 'the', 'oval', 'office'], ['former', 'cia', 'director', 'slams', 'trump', 'over', 'un', 'bullying'], ['brand', 'new', 'pro', 'trump', 'ad', 'features', 'so', 'much', 'kissing', 'it', 'will', 'make', 'y

## Model 1 : Lemmanization + BoW + NB = 93,44% acc

### Lemmanization - model 1

In [267]:
#Lemmanization

from nltk.stem.wordnet import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma1 = []

for word_list in x_split:  # Assuming x_split is a list of lists of words
    lemmatized_list = []  # To store lemmatized words for the current list
    for word in word_list:
        # Lemmatize the word
        lemmatized_word = lemmatizer.lemmatize(word)
        lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
    x_lemma1.append(lemmatized_list)  # Add the lemmatized list to x_lemma

print(x_lemma1[:10])

[['donald', 'trump', 'sends', 'out', 'embarrassing', 'new', 'year', 'eve', 'message', 'this', 'is', 'disturbing'], ['drunk', 'bragging', 'trump', 'staffer', 'started', 'russian', 'collusion', 'investigation'], ['sheriff', 'david', 'clarke', 'becomes', 'an', 'internet', 'joke', 'for', 'threatening', 'to', 'poke', 'people', 'in', 'the', 'eye'], ['trump', 'is', 'so', 'obsessed', 'he', 'even', 'ha', 'obama', 'name', 'coded', 'into', 'his', 'website', 'image'], ['pope', 'francis', 'just', 'called', 'out', 'donald', 'trump', 'during', 'his', 'christmas', 'speech'], ['racist', 'alabama', 'cop', 'brutalize', 'black', 'boy', 'while', 'he', 'is', 'in', 'handcuff', 'graphic', 'image'], ['fresh', 'off', 'the', 'golf', 'course'], ['trump', 'said', 'some', 'insanely', 'racist', 'stuff', 'inside', 'the', 'oval', 'office'], ['former', 'cia', 'director', 'slam', 'trump', 'over', 'un', 'bullying'], ['brand', 'new', 'pro', 'trump', 'ad', 'feature', 'so', 'much', 'kissing', 'it', 'will', 'make', 'you', 's

In [268]:
print(len(x_lemma1), len(y))

34152 34152


### Bag of words - 1

In [269]:
#Function

from nltk import probability
import math

def bag_of_words(sentences):
    return [dict((word, True) for word in sentence) for sentence in sentences]

In [270]:
#Bag of word lemma1

from sklearn.feature_extraction import DictVectorizer

# Generate Bag-of-Words dictionaries
Xbowl1 = bag_of_words(x_lemma1)

# Vectorize the Bag-of-Words
vectorizer = DictVectorizer(sparse=True)
X_features = vectorizer.fit_transform(Xbowl1)

print(Xbowl1[:10])

# Check the shape of the resulting feature matrix
print("Feature Matrix Shape:", X_features.shape)

# View the feature names
print("Feature Names:", vectorizer.get_feature_names_out())

[{'donald': True, 'trump': True, 'sends': True, 'out': True, 'embarrassing': True, 'new': True, 'year': True, 'eve': True, 'message': True, 'this': True, 'is': True, 'disturbing': True}, {'drunk': True, 'bragging': True, 'trump': True, 'staffer': True, 'started': True, 'russian': True, 'collusion': True, 'investigation': True}, {'sheriff': True, 'david': True, 'clarke': True, 'becomes': True, 'an': True, 'internet': True, 'joke': True, 'for': True, 'threatening': True, 'to': True, 'poke': True, 'people': True, 'in': True, 'the': True, 'eye': True}, {'trump': True, 'is': True, 'so': True, 'obsessed': True, 'he': True, 'even': True, 'ha': True, 'obama': True, 'name': True, 'coded': True, 'into': True, 'his': True, 'website': True, 'image': True}, {'pope': True, 'francis': True, 'just': True, 'called': True, 'out': True, 'donald': True, 'trump': True, 'during': True, 'his': True, 'christmas': True, 'speech': True}, {'racist': True, 'alabama': True, 'cop': True, 'brutalize': True, 'black':

In [271]:
print(len(Xbowl1), len(y))

34152 34152


In [272]:
Xbowl1 = bag_of_words(x_lemma1)
vectorizer = DictVectorizer(sparse=True)
X_features = vectorizer.fit_transform(Xbowl1)

### Train, test, split - bag of words 1

In [273]:
print(X_features.shape)
print(len(y))

(34152, 15871)
34152


In [274]:
from sklearn.model_selection import train_test_split

# Ensure `y` is your labels corresponding to `Xbow11`
X_train1, X_test, y_train1, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

### Naive Baye test

In [275]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
clf = MultinomialNB()

# Train the classifier
clf.fit(X_train1, y_train1)

In [276]:
# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9434929000146391
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95      3529
           1       0.95      0.94      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



In [277]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define parameter grid for alpha (smoothing parameter)
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]  # Test a range of alpha values
}

# Initialize the MultinomialNB model
nb = MultinomialNB()

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform grid search on training data
grid_search.fit(X_train1, y_train1)

# Get the best parameters and the corresponding score
best_alpha = grid_search.best_params_['alpha']
best_score = grid_search.best_score_

print(f"Best alpha: {best_alpha}")
print(f"Best cross-validated accuracy: {best_score}")

# Evaluate the best model on test data
best_nb = grid_search.best_estimator_
y_pred = best_nb.predict(X_test)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best alpha: 0.5
Best cross-validated accuracy: 0.9386186018038389

Test Accuracy: 0.9437856829161178
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95      3529
           1       0.95      0.94      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## Model 2 - Lemmanization + bigram + NB = 89,06% acc

### Lemmanization - model 2

In [278]:
#Lemmanization

from nltk.stem.wordnet import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma1 = []

for word_list in x_split:  # Assuming x_split is a list of lists of words
    lemmatized_list = []  # To store lemmatized words for the current list
    for word in word_list:
        # Lemmatize the word
        lemmatized_word = lemmatizer.lemmatize(word)
        lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
    x_lemma1.append(lemmatized_list)  # Add the lemmatized list to x_lemma

print(x_lemma1[:10])

[['donald', 'trump', 'sends', 'out', 'embarrassing', 'new', 'year', 'eve', 'message', 'this', 'is', 'disturbing'], ['drunk', 'bragging', 'trump', 'staffer', 'started', 'russian', 'collusion', 'investigation'], ['sheriff', 'david', 'clarke', 'becomes', 'an', 'internet', 'joke', 'for', 'threatening', 'to', 'poke', 'people', 'in', 'the', 'eye'], ['trump', 'is', 'so', 'obsessed', 'he', 'even', 'ha', 'obama', 'name', 'coded', 'into', 'his', 'website', 'image'], ['pope', 'francis', 'just', 'called', 'out', 'donald', 'trump', 'during', 'his', 'christmas', 'speech'], ['racist', 'alabama', 'cop', 'brutalize', 'black', 'boy', 'while', 'he', 'is', 'in', 'handcuff', 'graphic', 'image'], ['fresh', 'off', 'the', 'golf', 'course'], ['trump', 'said', 'some', 'insanely', 'racist', 'stuff', 'inside', 'the', 'oval', 'office'], ['former', 'cia', 'director', 'slam', 'trump', 'over', 'un', 'bullying'], ['brand', 'new', 'pro', 'trump', 'ad', 'feature', 'so', 'much', 'kissing', 'it', 'will', 'make', 'you', 's

In [279]:
print(len(x_lemma1), len(y))

34152 34152


### N-gram

In [280]:
import nltk
from nltk.util import bigrams

def bigramer (sentences):
    #return dict([(word, True) for sentence in sentences for word in sentence])
    bigram_list = []
    for sentence in sentences:
        bigram_dict = dict([(bigram, True) for bigram in bigrams(sentence)])
        bigram_list.append(bigram_dict)
    return bigram_list

X_bigram = bigramer(x_lemma1)

print(X_bigram[:10])

[{('donald', 'trump'): True, ('trump', 'sends'): True, ('sends', 'out'): True, ('out', 'embarrassing'): True, ('embarrassing', 'new'): True, ('new', 'year'): True, ('year', 'eve'): True, ('eve', 'message'): True, ('message', 'this'): True, ('this', 'is'): True, ('is', 'disturbing'): True}, {('drunk', 'bragging'): True, ('bragging', 'trump'): True, ('trump', 'staffer'): True, ('staffer', 'started'): True, ('started', 'russian'): True, ('russian', 'collusion'): True, ('collusion', 'investigation'): True}, {('sheriff', 'david'): True, ('david', 'clarke'): True, ('clarke', 'becomes'): True, ('becomes', 'an'): True, ('an', 'internet'): True, ('internet', 'joke'): True, ('joke', 'for'): True, ('for', 'threatening'): True, ('threatening', 'to'): True, ('to', 'poke'): True, ('poke', 'people'): True, ('people', 'in'): True, ('in', 'the'): True, ('the', 'eye'): True}, {('trump', 'is'): True, ('is', 'so'): True, ('so', 'obsessed'): True, ('obsessed', 'he'): True, ('he', 'even'): True, ('even', 'h

In [281]:
# Vectorize bigram list

from sklearn.feature_extraction import DictVectorizer

# Assuming X_bigram is a list of dictionaries (Bag-of-Bigrams)
vectorizer = DictVectorizer(sparse=True)

# Fit and transform the Bag-of-Bigrams into a numerical feature matrix
X_bigram_feat = vectorizer.fit_transform(X_bigram)

# Check the shape of the matrix
print("Feature matrix shape:", X_bigram_feat.shape)


Feature matrix shape: (34152, 172018)


### Train, test, split - bigrams

In [282]:
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(X_bigram_feat, y, test_size=0.2, random_state=42)

### Naive Baye test

In [283]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
clf_bigram = MultinomialNB()

# Train the classifier
clf_bigram.fit(X_train_bigram, y_train_bigram)

In [284]:
# Make predictions
y_pred_bigram = clf_bigram.predict(X_test_bigram)

# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred_bigram))
print("Classification Report:\n", classification_report(y_test, y_pred_bigram))

Accuracy: 0.9213877909530084
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.90      0.92      3529
           1       0.90      0.94      0.92      3302

    accuracy                           0.92      6831
   macro avg       0.92      0.92      0.92      6831
weighted avg       0.92      0.92      0.92      6831



## Model 3 : Lemmanization + TF-IDF + NB = 92,73% acc

### Lemmanization for TFIDF - model 2

In [285]:
#Lemmanization for TF-IDF

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma2 = []

def lem_tfidf(data):
    x_lemma = []  # Initialize list to store lemmatized sentences
    for word_list in data:
        lemmatized_list = []  # To store lemmatized words for the current list
        for word in word_list:
            # Lemmatize the word
            lemmatized_word = lemmatizer.lemmatize(word)
            lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
        x_lemma.append(" ".join(lemmatized_list))  # Combine into a sentence
    return x_lemma  # Return the lemmatized sentences

x_lemma2 = lem_tfidf(x_split)

print(x_lemma2[:10])

['donald trump sends out embarrassing new year eve message this is disturbing', 'drunk bragging trump staffer started russian collusion investigation', 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye', 'trump is so obsessed he even ha obama name coded into his website image', 'pope francis just called out donald trump during his christmas speech', 'racist alabama cop brutalize black boy while he is in handcuff graphic image', 'fresh off the golf course', 'trump said some insanely racist stuff inside the oval office', 'former cia director slam trump over un bullying', 'brand new pro trump ad feature so much kissing it will make you sick']


In [286]:
print(len(x_lemma2), len(y))

34152 34152


### TF-IDF

In [287]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the lemmatized sentences into a TF-IDF feature matrix
X_tfidf = tfidf_vectorizer.fit_transform(x_lemma2)

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)  # (number_of_documents, number_of_features)

# View feature names
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix Shape: (34152, 15860)
Feature Names: ['aar' 'aardvark' 'aaron' ... 'zuma' 'zummar' 'zurich']


### Train, test, split - bag of words 1

In [288]:
X_train_IDF, X_test_IDF, y_train_IDF, y_test_IDF= train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

### Naive Baye test

In [289]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
clf_IDF = MultinomialNB()

# Train the classifier
clf_IDF.fit(X_train_IDF, y_train_IDF)

In [290]:
# Make predictions
y_pred_IDF = clf_IDF.predict(X_test_IDF)

# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test_IDF, y_pred_IDF))
print("Classification Report:\n", classification_report(y_test_IDF, y_pred_IDF))

Accuracy: 0.9382228077880251
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      3529
           1       0.95      0.93      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



In [291]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define parameter grid for alpha (smoothing parameter)
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]  # Test a range of alpha values
}

# Initialize the MultinomialNB model
nb = MultinomialNB()

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform grid search on training data
grid_search.fit(X_train_IDF, y_train_IDF)

# Get the best parameters and the corresponding score
best_alpha = grid_search.best_params_['alpha']
best_score = grid_search.best_score_

print(f"Best alpha: {best_alpha}")
print(f"Best cross-validated accuracy: {best_score}")

# Evaluate the best model on test data
best_nb = grid_search.best_estimator_
y_pred_IDF = best_nb.predict(X_test_IDF)

print("\nTest Accuracy:", accuracy_score(y_test_IDF, y_pred_IDF))
print("Classification Report:\n", classification_report(y_test_IDF, y_pred_IDF))


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best alpha: 0.5
Best cross-validated accuracy: 0.9331282994806562

Test Accuracy: 0.9391011564924608
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      3529
           1       0.95      0.93      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## Model 4 : Lemmanization + TF-IDF + Random Forest = 91,59% acc

### Lemmanization for TFIDF - model 2

In [292]:
#Lemmanization for TF-IDF

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma2 = []

def lem_tfidf(data):
    x_lemma = []  # Initialize list to store lemmatized sentences
    for word_list in data:
        lemmatized_list = []  # To store lemmatized words for the current list
        for word in word_list:
            # Lemmatize the word
            lemmatized_word = lemmatizer.lemmatize(word)
            lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
        x_lemma.append(" ".join(lemmatized_list))  # Combine into a sentence
    return x_lemma  # Return the lemmatized sentences

x_lemma2 = lem_tfidf(x_split)

print(x_lemma2[:10])

['donald trump sends out embarrassing new year eve message this is disturbing', 'drunk bragging trump staffer started russian collusion investigation', 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye', 'trump is so obsessed he even ha obama name coded into his website image', 'pope francis just called out donald trump during his christmas speech', 'racist alabama cop brutalize black boy while he is in handcuff graphic image', 'fresh off the golf course', 'trump said some insanely racist stuff inside the oval office', 'former cia director slam trump over un bullying', 'brand new pro trump ad feature so much kissing it will make you sick']


In [293]:
print(len(x_lemma2), len(y))

34152 34152


### TF-IDF

In [294]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the lemmatized sentences into a TF-IDF feature matrix
X_tfidf = tfidf_vectorizer.fit_transform(x_lemma2)

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)  # (number_of_documents, number_of_features)

# View feature names
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix Shape: (34152, 15860)
Feature Names: ['aar' 'aardvark' 'aaron' ... 'zuma' 'zummar' 'zurich']


### Train, test, split - bag of words 1

In [295]:
X_train_IDF, X_test_IDF, y_train_IDF, y_test_IDF= train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

### Random forest test

In [296]:
# Import the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Random Forest classifier
clf_rf = RandomForestClassifier(n_estimators=200, random_state=42)

# Train the classifier
clf_rf.fit(X_train_IDF, y_train_IDF)

In [297]:
# Make predictions
y_pred_rf = clf_rf.predict(X_test_IDF)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_IDF, y_pred_rf))
print("Classification Report:\n", classification_report(y_test_IDF, y_pred_rf))

Accuracy: 0.9351485873225004
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94      3529
           1       0.93      0.94      0.93      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## Model 5 : Lemmanization + TF-IDF + Linear Regression = 93,93% acc

### Lemmanization for TFIDF - model 2

In [298]:
#Lemmanization for TF-IDF

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma2 = []

def lem_tfidf(data):
    x_lemma = []  # Initialize list to store lemmatized sentences
    for word_list in data:
        lemmatized_list = []  # To store lemmatized words for the current list
        for word in word_list:
            # Lemmatize the word
            lemmatized_word = lemmatizer.lemmatize(word)
            lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
        x_lemma.append(" ".join(lemmatized_list))  # Combine into a sentence
    return x_lemma  # Return the lemmatized sentences

x_lemma2 = lem_tfidf(x_split)

print(x_lemma2[:10])

['donald trump sends out embarrassing new year eve message this is disturbing', 'drunk bragging trump staffer started russian collusion investigation', 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye', 'trump is so obsessed he even ha obama name coded into his website image', 'pope francis just called out donald trump during his christmas speech', 'racist alabama cop brutalize black boy while he is in handcuff graphic image', 'fresh off the golf course', 'trump said some insanely racist stuff inside the oval office', 'former cia director slam trump over un bullying', 'brand new pro trump ad feature so much kissing it will make you sick']


In [299]:
print(len(x_lemma2), len(y))

34152 34152


### TF-IDF

In [300]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the lemmatized sentences into a TF-IDF feature matrix
X_tfidf = tfidf_vectorizer.fit_transform(x_lemma2)

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)  # (number_of_documents, number_of_features)

# View feature names
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix Shape: (34152, 15860)
Feature Names: ['aar' 'aardvark' 'aaron' ... 'zuma' 'zummar' 'zurich']


### Train, test, split - bag of words 1

In [301]:
X_train_IDF, X_test_IDF, y_train_IDF, y_test_IDF= train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

### Logistic regression

In [302]:
# Import the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression classifier
clf_lr = LogisticRegression(random_state=42, max_iter=1000)  # Increase max_iter if convergence warnings occur

# Train the Logistic Regression model
clf_lr.fit(X_train_IDF, y_train_IDF)

In [303]:
# Make predictions
y_pred_lr = clf_lr.predict(X_test_IDF)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_IDF, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test_IDF, y_pred_lr))

Accuracy: 0.9432001171131605

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.94      3529
           1       0.93      0.95      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



In [304]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define parameter grid for logistic regression
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0, 100.0],  # Inverse of regularization strength
    'solver': ['liblinear', 'lbfgs'],    # Different solvers
    'penalty': ['l2'],                   # L1 penalty for liblinear solver, L2 for both
    'max_iter': [100, 500, 1000]         # Different iteration limits
}

# Initialize the logistic regression model
lr = LogisticRegression(random_state=42)

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform grid search on training data
grid_search.fit(X_train_IDF, y_train_IDF)

# Get the best parameters and the corresponding score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validated accuracy: {best_score}")

# Evaluate the best model on test data
best_lr = grid_search.best_estimator_
y_pred_lr = best_lr.predict(X_test_IDF)

print("\nTest Accuracy:", accuracy_score(y_test_IDF, y_pred_lr))
print("Classification Report:\n", classification_report(y_test_IDF, y_pred_lr))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters: {'C': 10.0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validated accuracy: 0.9434865823910712

Test Accuracy: 0.9484702093397746
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      3529
           1       0.94      0.95      0.95      3302

    accuracy                           0.95      6831
   macro avg       0.95      0.95      0.95      6831
weighted avg       0.95      0.95      0.95      6831



## Model 6 : Stemming + BoW + NB = 93.98% acc

### Stemmer (snowball)

In [305]:
from nltk.stem.snowball import SnowballStemmer


# Initialize the stemmer
snowball = SnowballStemmer('english')

def stemming(x_split):
    stem_list = []  # To store stemmed words for the current list of lists
    for word_list in x_split:  # Assuming x_split is a list of lists of words
        stemmed_list = []  # To store stemmed words for the current list
        for word in word_list:
            # Stem the word
            stemmed_word = snowball.stem(word)
            stemmed_list.append(stemmed_word)  # Add to the stemmed list
        stem_list.append(stemmed_list)  # Add the stemmed list to the final list
    return stem_list

x_stem = stemming(x_split)

print(x_stem[:10])  # Print the first 10 results

[['donald', 'trump', 'send', 'out', 'embarrass', 'new', 'year', 'eve', 'messag', 'this', 'is', 'disturb'], ['drunk', 'brag', 'trump', 'staffer', 'start', 'russian', 'collus', 'investig'], ['sheriff', 'david', 'clark', 'becom', 'an', 'internet', 'joke', 'for', 'threaten', 'to', 'poke', 'peopl', 'in', 'the', 'eye'], ['trump', 'is', 'so', 'obsess', 'he', 'even', 'has', 'obama', 'name', 'code', 'into', 'his', 'websit', 'imag'], ['pope', 'franci', 'just', 'call', 'out', 'donald', 'trump', 'dure', 'his', 'christma', 'speech'], ['racist', 'alabama', 'cop', 'brutal', 'black', 'boy', 'while', 'he', 'is', 'in', 'handcuff', 'graphic', 'imag'], ['fresh', 'off', 'the', 'golf', 'cours'], ['trump', 'said', 'some', 'insan', 'racist', 'stuff', 'insid', 'the', 'oval', 'offic'], ['former', 'cia', 'director', 'slam', 'trump', 'over', 'un', 'bulli'], ['brand', 'new', 'pro', 'trump', 'ad', 'featur', 'so', 'much', 'kiss', 'it', 'will', 'make', 'you', 'sick']]


In [306]:
print(len(x_stem), len(y))

34152 34152


### Bag of words (stem)

In [307]:
#Function

from nltk import probability
import math

def bag_of_words(sentences):
    return [dict((word, True) for word in sentence) for sentence in sentences]

In [308]:
#Bag of word lemma1

from sklearn.feature_extraction import DictVectorizer

# Generate Bag-of-Words dictionaries
Xbows = bag_of_words(x_stem)

# Vectorize the Bag-of-Words
vectorizer = DictVectorizer(sparse=True)
X_features = vectorizer.fit_transform(Xbows)

print(Xbows[:10])

# Check the shape of the resulting feature matrix
print("Feature Matrix Shape:", X_features.shape)

# View the feature names
print("Feature Names:", vectorizer.get_feature_names_out())

[{'donald': True, 'trump': True, 'send': True, 'out': True, 'embarrass': True, 'new': True, 'year': True, 'eve': True, 'messag': True, 'this': True, 'is': True, 'disturb': True}, {'drunk': True, 'brag': True, 'trump': True, 'staffer': True, 'start': True, 'russian': True, 'collus': True, 'investig': True}, {'sheriff': True, 'david': True, 'clark': True, 'becom': True, 'an': True, 'internet': True, 'joke': True, 'for': True, 'threaten': True, 'to': True, 'poke': True, 'peopl': True, 'in': True, 'the': True, 'eye': True}, {'trump': True, 'is': True, 'so': True, 'obsess': True, 'he': True, 'even': True, 'has': True, 'obama': True, 'name': True, 'code': True, 'into': True, 'his': True, 'websit': True, 'imag': True}, {'pope': True, 'franci': True, 'just': True, 'call': True, 'out': True, 'donald': True, 'trump': True, 'dure': True, 'his': True, 'christma': True, 'speech': True}, {'racist': True, 'alabama': True, 'cop': True, 'brutal': True, 'black': True, 'boy': True, 'while': True, 'he': T

In [309]:
print(len(Xbows), len(y))

34152 34152


In [310]:
Xbows = bag_of_words(x_stem)
vectorizer = DictVectorizer(sparse=True)
X_features = vectorizer.fit_transform(Xbows)

### Train, test, split - bag of words 1

In [311]:
print(X_features.shape)
print(len(y))

(34152, 11785)
34152


In [312]:
from sklearn.model_selection import train_test_split

# Ensure `y` is your labels corresponding to `Xbows`
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_features, y, test_size=0.2, random_state=42)

### Naive Baye test

In [313]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
clf = MultinomialNB()

# Train the classifier
clf.fit(X_train2, y_train2)

In [314]:
# Make predictions
y_pred_bows = clf.predict(X_test2)

# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test2, y_pred_bows))
print("Classification Report:\n", classification_report(y_test2, y_pred_bows))

Accuracy: 0.9398331137461572
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      3529
           1       0.94      0.93      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



In [315]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define parameter grid for alpha (smoothing parameter)
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]  # Test a range of alpha values
}

# Initialize the MultinomialNB model
nb = MultinomialNB()

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform grid search on training data
grid_search.fit(X_train2, y_train2)

# Get the best parameters and the corresponding score
best_alpha = grid_search.best_params_['alpha']
best_score = grid_search.best_score_

print(f"Best alpha: {best_alpha}")
print(f"Best cross-validated accuracy: {best_score}")

# Evaluate the best model on test data
best_nb = grid_search.best_estimator_
y_pred_bows = best_nb.predict(X_test2)

print("\nTest Accuracy:", accuracy_score(y_test2, y_pred_bows))
print("Classification Report:\n", classification_report(y_test2, y_pred_bows))


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best alpha: 1.0
Best cross-validated accuracy: 0.9347021777074662

Test Accuracy: 0.9398331137461572
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      3529
           1       0.94      0.93      0.94      3302

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



## Model 7 : Lemmanization + TF-IDF + SVM = 93.73%

### Lemmanization for TFIDF - model 2

In [316]:
#Lemmanization for TF-IDF

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma2 = []

def lem_tfidf(data):
    x_lemma = []  # Initialize list to store lemmatized sentences
    for word_list in data:
        lemmatized_list = []  # To store lemmatized words for the current list
        for word in word_list:
            # Lemmatize the word
            lemmatized_word = lemmatizer.lemmatize(word)
            lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
        x_lemma.append(" ".join(lemmatized_list))  # Combine into a sentence
    return x_lemma  # Return the lemmatized sentences

x_lemma2 = lem_tfidf(x_split)

print(x_lemma2[:10])

['donald trump sends out embarrassing new year eve message this is disturbing', 'drunk bragging trump staffer started russian collusion investigation', 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye', 'trump is so obsessed he even ha obama name coded into his website image', 'pope francis just called out donald trump during his christmas speech', 'racist alabama cop brutalize black boy while he is in handcuff graphic image', 'fresh off the golf course', 'trump said some insanely racist stuff inside the oval office', 'former cia director slam trump over un bullying', 'brand new pro trump ad feature so much kissing it will make you sick']


In [317]:
print(len(x_lemma2), len(y))

34152 34152


### TF-IDF

In [318]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the lemmatized sentences into a TF-IDF feature matrix
X_tfidf = tfidf_vectorizer.fit_transform(x_lemma2)

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)  # (number_of_documents, number_of_features)

# View feature names
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix Shape: (34152, 15860)
Feature Names: ['aar' 'aardvark' 'aaron' ... 'zuma' 'zummar' 'zurich']


### Train, test, split - bag of words 1

In [319]:
X_train_IDF, X_test_IDF, y_train_IDF, y_test_IDF= train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

### SVM

In [320]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
clf_svm = SVC(kernel='rbf', gamma='scale', C=100.0, random_state=42)  # You can adjust the kernel and C parameter

# Train the SVM model
clf_svm.fit(X_train_IDF, y_train_IDF)

In [321]:
# Make predictions
y_pred_svm = clf_svm.predict(X_test_IDF)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_IDF, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test_IDF, y_pred_svm))

Accuracy: 0.9527155614112136

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.95      3529
           1       0.94      0.96      0.95      3302

    accuracy                           0.95      6831
   macro avg       0.95      0.95      0.95      6831
weighted avg       0.95      0.95      0.95      6831



In [322]:
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

# # Define parameter grid
# param_grid = {
#     'C': [0.1, 1.0, 10.0],
#     'kernel': ['linear', 'rbf'],
#     'gamma': ['scale', 'auto']  # Used for non-linear kernels like 'rbf'
# }

# # Initialize the SVC model
# svm = SVC(random_state=42)

# # Perform GridSearchCV
# grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
# grid_search.fit(X_train_IDF, y_train_IDF)

# # Evaluate the best model
# best_svm = grid_search.best_estimator_
# y_pred_svm = best_svm.predict(X_test_IDF)

# print("Best Parameters:", grid_search.best_params_)
# print("Accuracy:", accuracy_score(y_test_IDF, y_pred_svm))
# print("\nClassification Report:\n", classification_report(y_test_IDF, y_pred_svm))


## Model 8 : Lemmanization + BoW + SVM = 93,44% acc

### Lemmanization - model 1

In [323]:
#Lemmanization

from nltk.stem.wordnet import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma1 = []

for word_list in x_split:  # Assuming x_split is a list of lists of words
    lemmatized_list = []  # To store lemmatized words for the current list
    for word in word_list:
        # Lemmatize the word
        lemmatized_word = lemmatizer.lemmatize(word)
        lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
    x_lemma1.append(lemmatized_list)  # Add the lemmatized list to x_lemma

print(x_lemma1[:10])

[['donald', 'trump', 'sends', 'out', 'embarrassing', 'new', 'year', 'eve', 'message', 'this', 'is', 'disturbing'], ['drunk', 'bragging', 'trump', 'staffer', 'started', 'russian', 'collusion', 'investigation'], ['sheriff', 'david', 'clarke', 'becomes', 'an', 'internet', 'joke', 'for', 'threatening', 'to', 'poke', 'people', 'in', 'the', 'eye'], ['trump', 'is', 'so', 'obsessed', 'he', 'even', 'ha', 'obama', 'name', 'coded', 'into', 'his', 'website', 'image'], ['pope', 'francis', 'just', 'called', 'out', 'donald', 'trump', 'during', 'his', 'christmas', 'speech'], ['racist', 'alabama', 'cop', 'brutalize', 'black', 'boy', 'while', 'he', 'is', 'in', 'handcuff', 'graphic', 'image'], ['fresh', 'off', 'the', 'golf', 'course'], ['trump', 'said', 'some', 'insanely', 'racist', 'stuff', 'inside', 'the', 'oval', 'office'], ['former', 'cia', 'director', 'slam', 'trump', 'over', 'un', 'bullying'], ['brand', 'new', 'pro', 'trump', 'ad', 'feature', 'so', 'much', 'kissing', 'it', 'will', 'make', 'you', 's

In [324]:
print(len(x_lemma1), len(y))

34152 34152


### Bag of words - 1

In [325]:
#Function

from nltk import probability
import math

def bag_of_words(sentences):
    return [dict((word, True) for word in sentence) for sentence in sentences]

In [326]:
#Bag of word lemma1

from sklearn.feature_extraction import DictVectorizer

# Generate Bag-of-Words dictionaries
Xbowl1 = bag_of_words(x_lemma1)

# Vectorize the Bag-of-Words
vectorizer = DictVectorizer(sparse=True)
X_features = vectorizer.fit_transform(Xbowl1)

print(Xbowl1[:10])

# Check the shape of the resulting feature matrix
print("Feature Matrix Shape:", X_features.shape)

# View the feature names
print("Feature Names:", vectorizer.get_feature_names_out())

[{'donald': True, 'trump': True, 'sends': True, 'out': True, 'embarrassing': True, 'new': True, 'year': True, 'eve': True, 'message': True, 'this': True, 'is': True, 'disturbing': True}, {'drunk': True, 'bragging': True, 'trump': True, 'staffer': True, 'started': True, 'russian': True, 'collusion': True, 'investigation': True}, {'sheriff': True, 'david': True, 'clarke': True, 'becomes': True, 'an': True, 'internet': True, 'joke': True, 'for': True, 'threatening': True, 'to': True, 'poke': True, 'people': True, 'in': True, 'the': True, 'eye': True}, {'trump': True, 'is': True, 'so': True, 'obsessed': True, 'he': True, 'even': True, 'ha': True, 'obama': True, 'name': True, 'coded': True, 'into': True, 'his': True, 'website': True, 'image': True}, {'pope': True, 'francis': True, 'just': True, 'called': True, 'out': True, 'donald': True, 'trump': True, 'during': True, 'his': True, 'christmas': True, 'speech': True}, {'racist': True, 'alabama': True, 'cop': True, 'brutalize': True, 'black':

In [327]:
print(len(Xbowl1), len(y))

34152 34152


In [328]:
Xbowl1 = bag_of_words(x_lemma1)
vectorizer = DictVectorizer(sparse=True)
X_features = vectorizer.fit_transform(Xbowl1)

### Train, test, split - bag of words 1

In [329]:
print(X_features.shape)
print(len(y))

(34152, 15871)
34152


In [330]:
from sklearn.model_selection import train_test_split

# Ensure `y` is your labels corresponding to `Xbow11`
X_train1, X_test, y_train1, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

### Set SVM

In [331]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Check if X_train1 is raw text or already vectorized
if isinstance(X_train1, list) or isinstance(X_train1, pd.Series):
    vectorizer = TfidfVectorizer()
    X_train_TFIDF = vectorizer.fit_transform(X_train1)
    X_test_TFIDF = vectorizer.transform(X_test)
else:
    X_train_TFIDF = X_train1
    X_test_TFIDF = X_test

# Ensure labels are numeric
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train1)
y_test = label_encoder.transform(y_test)

# Train SVM
clf_bow_svm = SVC(kernel='rbf', gamma='scale', C=100.0)
clf_bow_svm.fit(X_train_TFIDF, y_train)

# Predict and evaluate
y_pred = clf_bow_svm.predict(X_test_TFIDF)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9505196896501245

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95      3529
           1       0.95      0.95      0.95      3302

    accuracy                           0.95      6831
   macro avg       0.95      0.95      0.95      6831
weighted avg       0.95      0.95      0.95      6831



In [413]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf_bow_svm, X_test_IDF, y_test_IDF, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {scores.mean():.2f} ± {scores.std():.2f}")


Cross-validation scores: [0.92392099 0.93191801 0.92825769 0.9363104  0.9341142 ]
Mean accuracy: 0.93 ± 0.00


### Lemmanization for TFIDF - model 2

In [332]:
#Lemmanization for TF-IDF

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma2 = []

def lem_tfidf(data):
    x_lemma = []  # Initialize list to store lemmatized sentences
    for word_list in data:
        lemmatized_list = []  # To store lemmatized words for the current list
        for word in word_list:
            # Lemmatize the word
            lemmatized_word = lemmatizer.lemmatize(word)
            lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
        x_lemma.append(" ".join(lemmatized_list))  # Combine into a sentence
    return x_lemma  # Return the lemmatized sentences

x_lemma2 = lem_tfidf(x_split)

print(x_lemma2[:10])

['donald trump sends out embarrassing new year eve message this is disturbing', 'drunk bragging trump staffer started russian collusion investigation', 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye', 'trump is so obsessed he even ha obama name coded into his website image', 'pope francis just called out donald trump during his christmas speech', 'racist alabama cop brutalize black boy while he is in handcuff graphic image', 'fresh off the golf course', 'trump said some insanely racist stuff inside the oval office', 'former cia director slam trump over un bullying', 'brand new pro trump ad feature so much kissing it will make you sick']


In [333]:
print(len(x_lemma2), len(y))

34152 34152


### TF-IDF

In [334]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),min_df=2, max_df=0.5)

# Transform the lemmatized sentences into a TF-IDF feature matrix
X_tfidf = tfidf_vectorizer.fit_transform(x_lemma2)

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)  # (number_of_documents, number_of_features)

# View feature names
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix Shape: (34152, 97768)
Feature Names: ['aardvark' 'aaron' 'aaron rogers' ... 'zuma say' 'zuma say south'
 'zuma successor']


### Train, test, split - bag of words 1

In [335]:
X_train_IDF, X_test_IDF, y_train_IDF, y_test_IDF= train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

### SVM

In [336]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
clf_svm = SVC(kernel='rbf', gamma='scale', C=100.0, random_state=42)  # You can adjust the kernel and C parameter

# Train the SVM model
clf_svm.fit(X_train_IDF, y_train_IDF)

In [337]:
# Make predictions
y_pred_svm = clf_svm.predict(X_test_IDF)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_IDF, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test_IDF, y_pred_svm))

Accuracy: 0.9565217391304348

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      3529
           1       0.95      0.96      0.96      3302

    accuracy                           0.96      6831
   macro avg       0.96      0.96      0.96      6831
weighted avg       0.96      0.96      0.96      6831



In [338]:
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

# # Define parameter grid
# param_grid = {
#     'C': [0.1, 1.0, 10.0],
#     'kernel': ['linear', 'rbf'],
#     'gamma': ['scale', 'auto']  # Used for non-linear kernels like 'rbf'
# }

# # Initialize the SVC model
# svm = SVC(random_state=42)

# # Perform GridSearchCV
# grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)
# grid_search.fit(X_train_IDF, y_train_IDF)

# # Evaluate the best model
# best_svm = grid_search.best_estimator_
# y_pred_svm = best_svm.predict(X_test_IDF)

# print("Best Parameters:", grid_search.best_params_)
# print("Accuracy:", accuracy_score(y_test_IDF, y_pred_svm))
# print("\nClassification Report:\n", classification_report(y_test_IDF, y_pred_svm))


## Model 9 : Lemmanization + TF-IDF Trigram + SVM = 95.65%

### Lemmanization for TFIDF - model 2

In [350]:
#Lemmanization for TF-IDF

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

x_lemma2 = []

def lem_tfidf(data):
    x_lemma = []  # Initialize list to store lemmatized sentences
    for word_list in data:
        lemmatized_list = []  # To store lemmatized words for the current list
        for word in word_list:
            # Lemmatize the word
            lemmatized_word = lemmatizer.lemmatize(word)
            lemmatized_list.append(lemmatized_word)  # Add to the lemmatized list
        x_lemma.append(" ".join(lemmatized_list))  # Combine into a sentence
    return x_lemma  # Return the lemmatized sentences

x_lemma2 = lem_tfidf(x_split)

print(x_lemma2[:10])

['donald trump sends out embarrassing new year eve message this is disturbing', 'drunk bragging trump staffer started russian collusion investigation', 'sheriff david clarke becomes an internet joke for threatening to poke people in the eye', 'trump is so obsessed he even ha obama name coded into his website image', 'pope francis just called out donald trump during his christmas speech', 'racist alabama cop brutalize black boy while he is in handcuff graphic image', 'fresh off the golf course', 'trump said some insanely racist stuff inside the oval office', 'former cia director slam trump over un bullying', 'brand new pro trump ad feature so much kissing it will make you sick']


In [351]:
print(len(x_lemma2), len(y))

34152 34152


### TF-IDF

In [376]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.5)

# Transform the lemmatized sentences into a TF-IDF feature matrix
X_tfidf = tfidf_vectorizer.fit_transform(x_lemma2)  

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)  # (number_of_documents, number_of_features)

# View feature names
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix Shape: (34152, 97768)
Feature Names: ['aardvark' 'aaron' 'aaron rogers' ... 'zuma say' 'zuma say south'
 'zuma successor']


### Train, test, split - bag of words 1

In [None]:
X_train_IDF, X_test_IDF, y_train_IDF, y_test_IDF= train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

### SVM

In [355]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Initialize the SVM classifier
clf_svm = SVC(kernel='rbf', gamma='scale', C=100.0, random_state=42)  # You can adjust the kernel and C parameter


# Train the SVM model
clf_svm.fit(X_train_IDF, y_train_IDF)

In [356]:
# Make predictions
y_pred_svm = clf_svm.predict(X_test_IDF)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_IDF, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test_IDF, y_pred_svm))

Accuracy: 0.9565217391304348

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      3529
           1       0.95      0.96      0.96      3302

    accuracy                           0.96      6831
   macro avg       0.96      0.96      0.96      6831
weighted avg       0.96      0.96      0.96      6831



# Generating answers

In [390]:
# Import testing CSV

results = pd.read_csv("testing_data_lowercase_nolabels.csv", header=None, names=['label', 'title'] ,sep="	")

results.head()


Unnamed: 0,label,title
0,2,copycat muslim terrorist arrested with assault...
1,2,wow! chicago protester caught on camera admits...
2,2,germany's fdp look to fill schaeuble's big shoes
3,2,mi school sends welcome back packet warning ki...
4,2,u.n. seeks 'massive' aid boost amid rohingya '...


In [364]:
from sklearn.model_selection import train_test_split

X_results = results["title"]  # Features (input)
y_results = results["label"]  # Labels (output)

## Clean data

In [399]:
X_clean_results=clean_artefacts(X_results)

print(X_clean_results[:2])

['copycat muslim terrorist arrested with assault weapons', 'wow chicago protester caught on camera admits violent activity was pre planned it not gonna be peaceful']


In [400]:
x_split_results = split_sentences_to_words(X_clean_results)
print(x_split_results[:10])



In [401]:
x_lemma_results = lem_tfidf(x_split_results)

print(x_lemma_results[:10])



In [402]:
# Transform the lemmatized sentences into a TF-IDF feature matrix
#X_tfidf_results = tfidf_vectorizer.fit_transform(x_lemma_results)
X_tfidf_results = tfidf_vectorizer.transform(x_lemma_results)  

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf_results.shape)  # (number_of_documents, number_of_features)

# View feature names
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix Shape: (9984, 97768)
Feature Names: ['aardvark' 'aaron' 'aaron rogers' ... 'zuma say' 'zuma say south'
 'zuma successor']


In [403]:
#Make predictions with best model

# Make predictions
y_pred_results = clf_svm.predict(X_tfidf_results)

print(y_pred_results[:10])


[0 0 1 0 1 0 1 0 1 1]


In [407]:
y_results = y_pred_results

print(y_results[:10])

[0 0 1 0 1 0 1 0 1 1]


In [404]:
# Ensure the y_results list matches the length of the DataFrame
assert len(y_results) == len(results), "y_results length must match the DataFrame rows"

# Update the 'label' column with the y_results list
results['label'] = y_results

results.head(50)

Unnamed: 0,label,title
0,0,copycat muslim terrorist arrested with assault...
1,0,wow! chicago protester caught on camera admits...
2,1,germany's fdp look to fill schaeuble's big shoes
3,0,mi school sends welcome back packet warning ki...
4,1,u.n. seeks 'massive' aid boost amid rohingya '...
5,0,did oprah just leave ‚nasty‚ hillary wishing s...
6,1,france's macron says his job not 'cool' cites ...
7,0,flashback: chilling ‚60 minutes‚ interview wit...
8,1,spanish foreign ministry says to expel north k...
9,1,trump says cuba 'did some bad things' aimed at...


In [410]:
# Remove the first n rows (e.g., first 5 rows)
results_no_head = results # Replace 5 with the number of rows you want to remove

# Export the resulting DataFrame to a CSV file
results_no_head.to_csv('results_no_head.csv', index=False)

print("Dataset exported to 'results_no_head.csv' without the head.")

Dataset exported to 'results_no_head.csv' without the head.


In [412]:
# Count the occurrences of each unique value in the 'label' column
label_counts = results['label'].value_counts()/9984*100

# Print the counts
print(label_counts)


label
1    51.802885
0    48.197115
Name: count, dtype: float64
