In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

<h3> Download link for train.csv: <br> <p> https://www.kaggle.com/competitions/fake-news/data?select=train.csv

In [None]:
train_df = pd.read_csv('train.csv', encoding='utf-8', error_bad_lines = False, engine="python")
train_df.fillna('', inplace=True) #filling missing NaN values
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True) #shuffle to avoid overfitting
train_df.head()



  train_df = pd.read_csv('train.csv', encoding='utf-8', error_bad_lines = False, engine="python")
Skipping line 13535: field larger than field limit (131072)
Skipping line 19783: field larger than field limit (131072)
Skipping line 13541: Expected 5 fields in line 13541, saw 6
Skipping line 19782: Expected 5 fields in line 19782, saw 7
Skipping line 19783: Expected 5 fields in line 19783, saw 10
Skipping line 19785: Expected 5 fields in line 19785, saw 14
Skipping line 19787: Expected 5 fields in line 19787, saw 8
Skipping line 19789: Expected 5 fields in line 19789, saw 7
Skipping line 19790: Expected 5 fields in line 19790, saw 15
Skipping line 19791: Expected 5 fields in line 19791, saw 9
Skipping line 19792: Expected 5 fields in line 19792, saw 9
Skipping line 19793: Expected 5 fields in line 19793, saw 12
Skipping line 19794: Expected 5 fields in line 19794, saw 15
Skipping line 19795: Expected 5 fields in line 19795, saw 8
Skipping line 19796: Expected 5 fields in line 19796, s

Unnamed: 0,id,title,author,text,label
0,2501,Three Rules For Rulers,IWB,CHROME IS RECOMMENDED BROWSER FOR IWB TODAY'S ...,1
1,15557,Hillary Campaign Manager Gets $7K a Month from...,Daniel Greenfield,Hillary Campaign Manager Gets $7K a Month from...,1
2,15541,WIKILEAKS: Hillary Got $12 Million for Clinton...,,Email \nFormer Secretary of State Hillary Clin...,1
3,9418,The Russian media just loves the campaign to d...,The Saker,"250 Views November 03, 2016 1 Comment Analyses...",1
4,9332,"In China, Homeowners Find Themselves in a Land...",Stuart Leavenworth and Kiki Zhao,"WENZHOU, China — Chen Furong and his wife b...",0


<h3> Download link for test.csv: <br> <p> https://www.kaggle.com/competitions/fake-news/data?select=test.csv

In [None]:
test_df = pd.read_csv('test.csv', encoding='utf-8', error_bad_lines = False, engine="python")
test_df.fillna('', inplace=True) #filling missing NaN values
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True) #shuffle to avoid overfitting
test_df.head()



  test_df = pd.read_csv('test.csv', encoding='utf-8', error_bad_lines = False, engine="python")


Unnamed: 0,id,title,author,text
0,21926,Elizabeth Warren: Sessions ’Needs to Be Fired’...,Pam Key,"Monday on CNN’s “The Axe Files,” a podcast fro..."
1,20879,High School Basketball Player Scores an Amazin...,Warner Todd Huston,A California high school basketball player has...
2,24060,Facebook Lets Advertisers Exclude Users By Race,Julia Angwin,Facebook Lets Advertisers Exclude Users By Rac...
3,22536,New Clues in the Mystery of Women’s Lagging Li...,Sabrina Tavernise,WASHINGTON — It is now a grim fact that the...
4,25326,Kerry Lists Obama Era’s Diplomatic Successes. ...,Russell Goldman,In an exit memorandum reflecting on eight year...


<h1> Data Cleaning <br>
<h3> Involves the following: <br>
<p> 1. Removing URLS <br>
 2. Lowercasing <br>
 3. Removing Punctuations + Repeating Characters <br>
 4. Removing Stopwords <br>
 5. Lemmatization <br>

 <i> Can add more cleaning at a later stage


In [None]:
columns = ['title', 'author', 'text']
dfs = [train_df, test_df]

#function made to apply other functions to the feature columns
def apply_function(col_list, df_list, fx):
  for i in col_list:
    for j in df_list:
      j[i] = j[i].apply(fx)

In [None]:
def cleaning_URLs(data):
    return re.sub('(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)', ' ', data)

apply_function(columns, dfs, cleaning_URLs)

In [None]:
punctuation_to_remove = [',','!', '"', "'", '#', '$', '%', '&', '(', ')', '.', '/', ':', ';', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '+', '—', '*', '<', '>']
custom_translation = str.maketrans('', '', ''.join(punctuation_to_remove))

def clean1(text):
  if isinstance(text, str):
    text = text.lower()
    text = text.replace('"', '')
    text = text.replace("'", '')
    return text.translate(custom_translation)
  return text

def cleaning_repeating_char(text):
    return re.sub(r'(.)\1{2,}', r'\1', text)

apply_function(columns, dfs, clean1)
apply_function(columns, dfs, cleaning_repeating_char)


In [None]:
stopword_list=nltk.corpus.stopwords.words('english')
stop = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([i for i in str(text).split(" ") if i not in stop])

apply_function(columns, dfs, remove_stopwords)

In [None]:
lm = nltk.WordNetLemmatizer()

def lemmatization(data):
    if isinstance(data, str):
      text = [lm.lemmatize(word) for word in data.split()]
      return ' ' .join(text)
    return data

apply_function(columns, dfs, lemmatization)

In [None]:
train_df.head()

Unnamed: 0,id,title,author,text,label
0,2501,three rule ruler,iwb,chrome recommended browser iwb today hot post,1
1,15557,hillary campaign manager get 7k month hillary ...,daniel greenfield,hillary campaign manager get 7k month hillary ...,1
2,15541,wikileaks hillary got 12 million clinton chari...,,email former secretary state hillary clinton a...,1
3,9418,russian medium love campaign demonize putin,saker,250 view november 03 2016 1 comment analysis s...,1
4,9332,china homeowner find land doubt - new york time,stuart leavenworth kiki zhao,wenzhou china chen furong wife bought home 23 ...,0


In [None]:
test_df.head()

Unnamed: 0,id,title,author,text
0,21926,elizabeth warren session ’needs fired’ - breit...,pam key,monday cnn’s “the axe files” podcast universit...
1,20879,high school basketball player score amazing 92...,warner todd huston,california high school basketball player stunn...
2,24060,facebook let advertiser exclude user race,julia angwin,facebook let advertiser exclude user race face...
3,22536,new clue mystery women’s lagging life expectan...,sabrina tavernise,washington grim fact life expectancy american ...
4,25326,kerry list obama era’s diplomatic success trum...,russell goldman,exit memorandum reflecting eight year united s...


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
def calculate_avg_embedding(sentence):
  # Generate embeddings for the sentences
  embeddings = embed([sentence])
  averaged_embeddings = np.mean(embeddings, axis=1)
  return float(averaged_embeddings)

# Model Evaluation: <br>
<p> 1. Naive Bayes Classifier <br>
<p> 2. SVM + Logisitic Regression <br>
<p> 3. Random Forests <br>

# Naive Bayes Classifier

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
from sklearn.metrics import roc_curve, auc

In [None]:
vectorizers = [CountVectorizer() for _ in columns]
X_list = [vectorizer.fit_transform(train_df[column]) for vectorizer, column in zip(vectorizers, columns)]
X = hstack(X_list)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, train_df['label'], test_size=0.2, random_state=42)
clf = MultinomialNB()
clf.fit(X_train, y_train)

X_list = [vectorizer.fit_transform(train_df[column]) for vectorizer, column in zip(vectorizers, columns)]
X = hstack(X_list)
y_pred = clf.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.9178871548619448
Classification Report:
                                                  precision    recall  f1-score   support

                                                       0.03      0.33      0.05         3
 чтобы это была дорога с двусторонним движением.       0.00      0.00      0.00         0
                                               0       0.89      0.98      0.93      2059
                                               1       0.98      0.86      0.91      2103

                                        accuracy                           0.92      4165
                                       macro avg       0.47      0.54      0.47      4165
                                    weighted avg       0.93      0.92      0.92      4165



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<h1> Deep Learning - Long Short Term Memory (LTSM)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dense, Activation, Dropout, Input
from keras.optimizers import Adam, RMSprop
from keras.utils import pad_sequences, plot_model

In [None]:
#Model constants
MAX_LEN = 3000 #change later...
MAX_FEATURES = 500

In [None]:
train_df2 = train_df.copy()
train_df2['combined_text'] = train_df['title'] + ' ' + train_df['author'] + ' ' + train_df['text']
X_LSTM = train_df2['combined_text']
y_LSTM = train_df2['label']
X_train, X_test, y_train, y_test = train_test_split(X_LSTM, y_LSTM, test_size=0.2, random_state=42)

In [None]:
tokenizer = Tokenizer(num_words = MAX_FEATURES)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen = MAX_LEN)
X_test_padded = pad_sequences(X_test_seq, maxlen = MAX_LEN)

In [None]:
## Model Constants
BATCH_SIZE = 64
EPOCHS = 6

In [None]:
# Input Layer
inputs = Input(shape = [MAX_LEN])

# Embedding Layer - Converts the input sequence into a sequence of dense vectors
layer = Embedding(MAX_FEATURES, 50, input_length = MAX_LEN)(inputs)

# LSTM Layer
layer = LSTM(64)(layer)

# Dense Layer - Fully connected layer
layer = Dense(256)(layer)
layer = Activation('relu')(layer)

# Dropout Layer - Prevents overfitting
layer = Dropout(0.5)(layer)

# Output Layer
layer = Dense(1)(layer)
layer = Activation('sigmoid')(layer)
model = Model(inputs = inputs, outputs = layer)

# Compile the model
model.compile(loss = 'binary_crossentropy', optimizer = RMSprop(), metrics = ['accuracy'])
model.summary()

# Train the model
history = model.fit(X_train_padded, y_train, validation_data = (X_test_padded, y_test), batch_size = BATCH_SIZE, epochs = EPOCHS)

scores = model.evaluate(X_test, y_test, verbose = 0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

#Unsure why this error is occuring....

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3000)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 3000, 50)          25000     
                                                                 
 lstm_1 (LSTM)               (None, 64)                29440     
                                                                 
 dense_2 (Dense)             (None, 256)               16640     
                                                                 
 activation_2 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 257 

UnimplementedError: ignored

# Support Vector Machines

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
X = train_df.text
Y = train_df.label

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, train_df['label'], test_size=0.2, random_state=42)

In [None]:
vectorizer = TfidfVectorizer(max_features = 1000)

In [None]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
model_svm = SVC(kernel = 'linear')
model_svm.fit(X_train_vec, y_train)

In [None]:
Y_train_hat_svm = model_svm.predict(X_train_vec)
Y_test_hat_svm = model_svm.predict(X_test_vec)

In [None]:
accuracy_score(y_train, Y_train_hat_svm)

0.9595965660082848

In [None]:
accuracy_score(y_test, Y_test_hat_svm)

0.9430972388955582

<h1> Logisitic Regression Model


In [None]:
model_lr = LogisticRegression()
model_lr.fit(X_train_vec, y_train)

In [None]:
Y_train_hat_lr = model_lr.predict(X_train_vec)
Y_test_hat_lr = model_lr.predict(X_test_vec)

In [None]:
accuracy_score(y_train, Y_train_hat_lr)

0.9574953473014348

In [None]:
accuracy_score(y_test, Y_test_hat_lr)

0.9440576230492197

<h1> Random Forests

In [None]:
#Encoding values using sentence embeddings
train_df['title'] = train_df['title'].apply(lambda x: calculate_avg_embedding(x))
train_df['author'] = train_df['author'].apply(lambda x: calculate_avg_embedding(x))
train_df['text'] = train_df['text'].apply(lambda x: calculate_avg_embedding(x))

train_df.head()

Unnamed: 0,id,title,author,text,label
0,2501,0.001121,0.005037,0.002739,1
1,15557,-0.000775,-0.000527,0.001868,1
2,15541,-0.000792,0.002495,0.000565,1
3,9418,0.000685,0.000272,0.000775,1
4,9332,-0.001199,0.000269,-0.000412,0


In [None]:
from sklearn.model_selection import train_test_split

feature_cols = train_df[['title', 'author', 'text']] #skipping ID as their doesn't seem to be any correlation
target_col = train_df[['label']]

X_train, X_test, y_train, y_test = train_test_split(feature_cols, target_col, test_size=0.3, random_state=42)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

X_train.head()

Training set shape: (14575, 3) (14575, 1)
Testing set shape: (6247, 3) (6247, 1)


Unnamed: 0,title,author,text
14023,0.001616,0.000144,-0.000209
6619,0.001707,0.002495,0.001827
9888,0.001943,0.001405,0.003788
15154,-0.000212,0.000375,0.000922
18106,-0.00153,-0.001909,0.001441


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#Let's implement a random forest algorithm that builds decision trees based on 2 feature columns at a time
feature_combinations = [('title', 'author'), ('author', 'text'), ('title', 'text')] #only using 3 decision trees

# Initialize a dictionary to store decision trees
decision_trees = {}

# Train Decision Trees
for features in feature_combinations:
    # Create a decision tree for the feature combination
    tree = DecisionTreeClassifier()
    tree.fit(X_train[list(features)], y_train)

    # Store the decision tree in the dictionary
    decision_trees[features] = tree

predictions = {}  # To store predictions from individual decision trees

for features, tree in decision_trees.items():
    predictions[features] = tree.predict(X_test[list(features)])

# Combine predictions using majority vote
final_predictions = []

for i in range(len(y_test)):
    votes = [predictions[features][i] for features in feature_combinations]
    # Take the majority vote as the final prediction
    final_predictions.append(max(set(votes), key=votes.count))

# Random Forest
rf = RandomForestClassifier()

# Train the Random Forest using the individual decision trees
rf.fit(X_train, y_train)

# Make predictions using the Random Forest
rf_predictions = rf.predict(X_test)

# Evaluate the models
print("Individual Decision Trees:")
for features, tree in decision_trees.items():
    accuracy = accuracy_score(y_test, predictions[features])
    print(f"{features} - Accuracy: {accuracy:.2f}")

print("\nRandom Forest:")
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest - Accuracy: {rf_accuracy:.2f}")



  rf.fit(X_train, y_train)


Individual Decision Trees:
('title', 'author') - Accuracy: 0.84
('author', 'text') - Accuracy: 0.88
('title', 'text') - Accuracy: 0.54

Random Forest:
Random Forest - Accuracy: 0.73


In [None]:
#Need to build more decision trees to improve accuracy, as only 3 decision trees are being built here.