## Data Processing

In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore") 
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import re
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
pd.options.display.max_colwidth = None

In [2]:
df=pd.read_csv('./data/data.csv',usecols=['title', 'news'] ,encoding = "ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,title,news
0,Jack Carr recalls Gen. Eisenhower's D-Day memo about 'great and noble undertaking',fox
1,"Bruce Willis, Demi Moore avoided doing one thing while co-parenting, daughter says",fox
2,"Blinken meets Qatar PM, says Israeli actions are not 'retaliation,' but 'defending the lives of its people'",fox
3,"Emily Blunt says her âtoes curlâ?when people tell her their kids want to act: 'I want to say, donât do it!'",fox
4,"'The View' co-host, CNN commentator Ana Navarro to host night 2 of Democratic National Convention",fox


In [4]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove punctuation
    words = word_tokenize(text.lower())
    # Remove stop words and punctuation
    words = [
        word for word in words
        if word.lower() not in stop_words and word not in punctuation and not re.search(r'\d', word)
    ]
    
    # Rejoin words
    return ' '.join(words)

# Apply the function to the 'title' column
df['title'] = df['title'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\26656\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\26656\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=21)
X_train = train_df['title']
y_train = train_df['news']
X_test = test_df['title']
y_test = test_df['news']
y_train = y_train.apply(lambda x: 1 if x == 'fox' else 0)
y_test = y_test.apply(lambda x: 1 if x == 'fox' else 0)
accuracy_scores={}

## Word Embedding

### TF-IDF/ Bag of worsd/Bag of ngrams


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = TfidfVectorizer(max_features=10000) # Adjust max_features if needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("Feature Names:", vectorizer.get_feature_names_out())
print(len(vectorizer.vocabulary_))

vectorizer = CountVectorizer() # Adjust max_features if needed
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)
print("Feature Names:", vectorizer.get_feature_names_out())
print(len(vectorizer.vocabulary_))

vectorizer = CountVectorizer(ngram_range=(1, 3))  # Adjust max_features&ngram_range if needed
# Fit and transform the training data, then transform the test data
X_train_bong = vectorizer.fit_transform(X_train)
X_test_bong = vectorizer.transform(X_test)
# Display the feature names (words and n-grams)
print("Feature Names:", vectorizer.get_feature_names_out())
print(len(vectorizer.vocabulary_))

Feature Names: ['aapi' 'aaron' 'aarp' ... 'zucker' 'zuckerberg' 'zzz']
7721
Feature Names: ['aapi' 'aaron' 'aarp' ... 'zucker' 'zuckerberg' 'zzz']
7721
Feature Names: ['aapi' 'aapi owned' 'aapi owned food' ... 'zzz' 'zzz amazon'
 'zzz amazon buys']
53222


### Word2Vec

In [7]:
from gensim.models import KeyedVectors
#install scipy<1.13 to be compatible for gensim
#!pip install "scipy<1.13"
word2vec_model = KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin.gz', binary=True,limit=500000)

# Tokenize your text data
X_train_tokenized = [sentence.split() for sentence in X_train]
X_test_tokenized = [sentence.split() for sentence in X_test]

# Define a function to average word vectors for each sentence
def average_word_vectors(sentence, model, vector_size):
    words = [word for word in sentence if word in model]
    return np.mean(model[words], axis=0)
# Apply average word vectors on training and test sets
X_train_word2vec = np.array([average_word_vectors(sentence, word2vec_model, 300) for sentence in X_train_tokenized])
X_test_word2vec = np.array([average_word_vectors(sentence, word2vec_model, 300) for sentence in X_test_tokenized])

### Glove

In [8]:
glove_file = "./data/glove.840B.300d.txt"

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.strip().split(' ')
            word = values[0]
            try:
                vector = np.asarray(values[1:], dtype="float32")
                embeddings[word] = vector
            except ValueError:
                print(f"Skipping line with invalid vector for word: {word}")
    return embeddings
glove_embeddings = load_glove_embeddings(glove_file)

def sentence_to_glove(sentence, embeddings):
    words = sentence.split()
    vectors = [embeddings[word] for word in words if word in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(300)  # Return a zero vector if no words are in embeddings

# Transform an entire dataset (e.g., train and test sets)
X_train_glove = np.array([sentence_to_glove(sentence, glove_embeddings) for sentence in X_train])
X_test_glove = np.array([sentence_to_glove(sentence, glove_embeddings) for sentence in X_test])

glove_embeddings = load_glove_embeddings(glove_file)

## Model

### LogisticRegression

In [9]:
# Logistic Regression with TF-IDF
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Tfidf With Logistic Regression")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy_scores["Tfidf With Logistic Regression"] = accuracy

# Logistic Regression with Bag of Words
model = LogisticRegression()
model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
print("Bag Of Word With Logistic Regression")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy_scores["Bag Of Word With Logistic Regression"] = accuracy

# Logistic Regression with Bag of N-Grams
model = LogisticRegression()
model.fit(X_train_bong, y_train)
y_pred = model.predict(X_test_bong)
accuracy = accuracy_score(y_test, y_pred)
print("Bag Of N Grams With Logistic Regression")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy_scores["Bag Of N Grams With Logistic Regression"] = accuracy

# Logistic Regression with Word2Vec
model = LogisticRegression()
model.fit(X_train_word2vec, y_train)
y_pred = model.predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred)
print("Word2Vec With Logistic Regression")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy_scores["Word2Vec With Logistic Regression"] = accuracy

# Logistic Regression with GloVe
model = LogisticRegression()
model.fit(X_train_glove, y_train)
y_pred = model.predict(X_test_glove)
accuracy = accuracy_score(y_test, y_pred)
print("GloVe With Logistic Regression")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy_scores["GloVe With Logistic Regression"] = accuracy

Tfidf With Logistic Regression
Accuracy: 0.7832
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77       363
           1       0.78      0.81      0.79       389

    accuracy                           0.78       752
   macro avg       0.78      0.78      0.78       752
weighted avg       0.78      0.78      0.78       752

Bag Of Word With Logistic Regression
Accuracy: 0.7819
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.80      0.78       363
           1       0.80      0.77      0.78       389

    accuracy                           0.78       752
   macro avg       0.78      0.78      0.78       752
weighted avg       0.78      0.78      0.78       752

Bag Of N Grams With Logistic Regression
Accuracy: 0.7899
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.82      0.79       363
      

### Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Decision Tree with TF-IDF
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)
y_pred_dt = dt_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred_dt)
print("TFIDF with Decision Tree")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_dt))
accuracy_scores["TFIDF with Decision Tree"] = accuracy

# Decision Tree with Bag of Words
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_bow, y_train)
y_pred_dt_bow = dt_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred_dt_bow)
print("Bag of Words with Decision Tree")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_dt_bow))
accuracy_scores["Bag of Words with Decision Tree"] = accuracy

# Decision Tree with Bag of N-grams
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_bong, y_train)
y_pred_dt_bong = dt_model.predict(X_test_bong)
accuracy = accuracy_score(y_test, y_pred_dt_bong)
print("Bag of N-grams with Decision Tree")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_dt_bong))
accuracy_scores["Bag of N-grams with Decision Tree"] = accuracy

# Decision Tree with Word2Vec
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_word2vec, y_train)
y_pred_dt_word2vec = dt_model.predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred_dt_word2vec)
print("Word2Vec with Decision Tree")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_dt_word2vec))
accuracy_scores["Word2Vec With DecisionTree"] = accuracy

# Decision Tree with GloVe
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_glove, y_train)
y_pred_dt_glove = dt_model.predict(X_test_glove)
accuracy = accuracy_score(y_test, y_pred_dt_glove)
print("GloVe with Decision Tree")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_dt_glove))
accuracy_scores["GloVe with Decision Tree"] = accuracy

TFIDF with Decision Tree
Accuracy: 0.7194
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.76      0.72       363
           1       0.75      0.68      0.71       389

    accuracy                           0.72       752
   macro avg       0.72      0.72      0.72       752
weighted avg       0.72      0.72      0.72       752

Bag of Words with Decision Tree
Accuracy: 0.7340
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.74      0.73       363
           1       0.75      0.72      0.74       389

    accuracy                           0.73       752
   macro avg       0.73      0.73      0.73       752
weighted avg       0.73      0.73      0.73       752

Bag of N-grams with Decision Tree
Accuracy: 0.7553
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74       363
           1       0.75

### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random Forest with TF-IDF
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred_rf)
print("TFIDF with Random Forest")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
accuracy_scores["TFIDF with Random Forest"] = accuracy

# Random Forest with Bag of Words
rf_model = RandomForestClassifier()
rf_model.fit(X_train_bow, y_train)
y_pred_rf_bow = rf_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred_rf_bow)
print("Bag of Words with Random Forest")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_rf_bow))
accuracy_scores["Bag of Words with Random Forest"] = accuracy

# Random Forest with Bag of N-grams
rf_model = RandomForestClassifier()
rf_model.fit(X_train_bong, y_train)
y_pred_rf_bong = rf_model.predict(X_test_bong)
accuracy = accuracy_score(y_test, y_pred_rf_bong)
print("Bag of N-grams with Random Forest")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_rf_bong))
accuracy_scores["Bag of N-grams with Random Forest"] = accuracy

# Random Forest with Word2Vec
rf_model = RandomForestClassifier()
rf_model.fit(X_train_word2vec, y_train)
y_pred_rf_w2c = rf_model.predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred_rf_w2c)
print("Word2Vec with Random Forest")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_rf_w2c))
accuracy_scores["Word2Vec With RandomForest"] = accuracy

# Random Forest with GloVe
rf_model = RandomForestClassifier()
rf_model.fit(X_train_glove, y_train)
y_pred_rf_glove = rf_model.predict(X_test_glove)
accuracy = accuracy_score(y_test, y_pred_rf_glove)
print("GloVe with Random Forest")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_rf_glove))
accuracy_scores["GloVe with Random Forest"] = accuracy

TFIDF with Random Forest
Accuracy: 0.7793
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.76      0.77       363
           1       0.78      0.79      0.79       389

    accuracy                           0.78       752
   macro avg       0.78      0.78      0.78       752
weighted avg       0.78      0.78      0.78       752

Bag of Words with Random Forest
Accuracy: 0.7580
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.82      0.77       363
           1       0.81      0.70      0.75       389

    accuracy                           0.76       752
   macro avg       0.76      0.76      0.76       752
weighted avg       0.76      0.76      0.76       752

Bag of N-grams with Random Forest
Accuracy: 0.7394
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.84      0.76       363
           1       0.81

### SVM

In [12]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# SVM with TFIDF
svm_model = SVC()
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred_svm)
print("TFIDF with SVM")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
accuracy_scores["TFIDF with SVM"] = accuracy

# SVM with Bag of Words
svm_model = SVC()
svm_model.fit(X_train_bow, y_train)
y_pred_svm_bow = svm_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred_svm_bow)
print("Bag of Words with SVM")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm_bow))
accuracy_scores["Bag of Words with SVM"] = accuracy

# SVM with Bag of N-grams
svm_model = SVC()
svm_model.fit(X_train_bong, y_train)
y_pred_svm_bong = svm_model.predict(X_test_bong)
accuracy = accuracy_score(y_test, y_pred_svm_bong)
print("Bag of N-grams with SVM")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm_bong))
accuracy_scores["Bag of N-grams with SVM"] = accuracy

# SVM with Word2Vec
svm_model = SVC()
svm_model.fit(X_train_word2vec, y_train)
y_pred_svm_w2v = svm_model.predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred_svm_w2v)
print("Word2Vec with SVM")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm_w2v))
accuracy_scores["Word2Vec With SVM"] = accuracy

# SVM with GloVe
svm_model = SVC()
svm_model.fit(X_train_glove, y_train)
y_pred_svm_glove = svm_model.predict(X_test_glove)
accuracy = accuracy_score(y_test, y_pred_svm_glove)
print("GloVe with SVM")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm_glove))
accuracy_scores["GloVe with SVM"] = accuracy

TFIDF with SVM
Accuracy: 0.7939
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.76      0.78       363
           1       0.79      0.83      0.81       389

    accuracy                           0.79       752
   macro avg       0.79      0.79      0.79       752
weighted avg       0.79      0.79      0.79       752

Bag of Words with SVM
Accuracy: 0.7699
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.87      0.78       363
           1       0.85      0.68      0.75       389

    accuracy                           0.77       752
   macro avg       0.78      0.77      0.77       752
weighted avg       0.78      0.77      0.77       752

Bag of N-grams with SVM
Accuracy: 0.5399
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.99      0.67       363
           1       0.91      0.12      0.22       389

### Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Naive Bayes with TF-IDF
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb_tfidf = nb_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred_nb_tfidf)
print("TFIDF with Naive Bayes")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_nb_tfidf))
accuracy_scores["TFIDF with Naive Bayes"] = accuracy

# Naive Bayes with Bag of Words
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)
y_pred_nb_bow = nb_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred_nb_bow)
print("Bag of Words with Naive Bayes")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_nb_bow))
accuracy_scores["Bag of Words with Naive Bayes"] = accuracy

# Naive Bayes with Bag of N-grams
nb_model = MultinomialNB()
nb_model.fit(X_train_bong, y_train)
y_pred_nb_bong = nb_model.predict(X_test_bong)
accuracy = accuracy_score(y_test, y_pred_nb_bong)
print("Bag of N-grams with Naive Bayes")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_nb_bong))
accuracy_scores["Bag of N-grams with Naive Bayes"] = accuracy

# Naive Bayes with Word2Vec (using GaussianNB)
nb_model = GaussianNB()
nb_model.fit(X_train_word2vec, y_train)
y_pred_nb_w2v = nb_model.predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred_nb_w2v)
print("Word2Vec with Naive Bayes (GaussianNB)")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_nb_w2v))
accuracy_scores["Word2Vec with Naive Bayes"] = accuracy

# Naive Bayes with GloVe (using GaussianNB)
nb_model = GaussianNB()
nb_model.fit(X_train_glove, y_train)
y_pred_nb_glove = nb_model.predict(X_test_glove)
accuracy = accuracy_score(y_test, y_pred_nb_glove)
print("GloVe with Naive Bayes (GaussianNB)")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_nb_glove))
accuracy_scores["GloVe with Naive Bayes"] = accuracy

TFIDF with Naive Bayes
Accuracy: 0.7793
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.78      0.77       363
           1       0.79      0.78      0.78       389

    accuracy                           0.78       752
   macro avg       0.78      0.78      0.78       752
weighted avg       0.78      0.78      0.78       752

Bag of Words with Naive Bayes
Accuracy: 0.7699
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.77      0.76       363
           1       0.78      0.77      0.78       389

    accuracy                           0.77       752
   macro avg       0.77      0.77      0.77       752
weighted avg       0.77      0.77      0.77       752

Bag of N-grams with Naive Bayes
Accuracy: 0.7846
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78       363
           1       0.79      

In [14]:
ranked_models = sorted(accuracy_scores.items(), key=lambda x: x[1], reverse=True)

In [15]:
ranked_models

[('TFIDF with SVM', 0.7938829787234043),
 ('Bag Of N Grams With Logistic Regression', 0.7898936170212766),
 ('Bag of N-grams with Naive Bayes', 0.7845744680851063),
 ('Tfidf With Logistic Regression', 0.7832446808510638),
 ('Bag Of Word With Logistic Regression', 0.7819148936170213),
 ('TFIDF with Random Forest', 0.7792553191489362),
 ('TFIDF with Naive Bayes', 0.7792553191489362),
 ('Bag of Words with SVM', 0.7699468085106383),
 ('Bag of Words with Naive Bayes', 0.7699468085106383),
 ('Bag of Words with Random Forest', 0.7579787234042553),
 ('Bag of N-grams with Decision Tree', 0.7553191489361702),
 ('Word2Vec With SVM', 0.7553191489361702),
 ('GloVe with SVM', 0.7486702127659575),
 ('Bag of N-grams with Random Forest', 0.7393617021276596),
 ('Bag of Words with Decision Tree', 0.7340425531914894),
 ('GloVe With Logistic Regression', 0.726063829787234),
 ('GloVe with Random Forest', 0.726063829787234),
 ('TFIDF with Decision Tree', 0.7194148936170213),
 ('Word2Vec With RandomForest', 0