In [72]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Loading first dataset

In [73]:
fake_news_path = 'datasets/raza/Fake.csv'
true_news_path = 'datasets/raza/True.csv'

fake_news_df = pd.read_csv(fake_news_path)
true_news_df = pd.read_csv(true_news_path)

In [74]:
fake_news_df['label'] = 'Fake'
true_news_df['label'] = 'True'

full_df = pd.concat([fake_news_df, true_news_df], ignore_index=True)
# remove subject and date columns
full_df = full_df.drop(columns=['subject', 'date'])

In [75]:
full_df['label'].value_counts()
full_df.head()

Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,Fake


## Loading second dataset

In [76]:
dataset_path = 'datasets/WELFake_Dataset.csv'
full_df2 = pd.read_csv(dataset_path)

In [77]:
full_df2['label'] = full_df2['label'].replace({0: 'Fake', 1: 'True'})
full_df2['label'].value_counts()
# drop rows with nAn values
full_df2 = full_df2.dropna()
# drop Unnamed: 0 column
full_df2 = full_df2.drop(columns=['Unnamed: 0'])

In [78]:
full_df2.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,True
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",True
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,Fake
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",True
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,True


## Combining datasets

In [79]:
combined_df = pd.concat([full_df, full_df2], ignore_index=True)
combined_df['label'].value_counts()


label
Fake    58509
True    57926
Name: count, dtype: int64

## Preparing vectorizer on combined dataset

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Combine 'title' and 'text' columns
combined_df['combined_text'] = combined_df['title'].fillna('') + ' ' + combined_df['text'].fillna('')

# Prepare the data
X = combined_df['combined_text']
y = combined_df['label']

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into numerical vectors using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = tfidf_vectorizer.fit(X)


In [81]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

## Training the first model

In [82]:
full_df['combined_text'] = full_df['title'].fillna('') + ' ' + full_df['text'].fillna('')

X = full_df['combined_text']
y = full_df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into numerical vectors using TF-IDF
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [83]:
# Train the Support Vector Machine (SVM) classifier
svm_classifier = LinearSVC()
svm_classifier.fit(X_train_tfidf, y_train)

In [84]:
svm_classifier.coef_.shape

(1, 243739)

## Train second model

In [85]:
full_df2['combined_text'] = full_df2['title'].fillna('') + ' ' + full_df2['text'].fillna('')

X2 = full_df2['combined_text']
y2 = full_df2['label']

# Split the data into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Convert text data into numerical vectors using TF-IDF
X_train_tfidf2 = tfidf_vectorizer.transform(X_train2)
X_test_tfidf2 = tfidf_vectorizer.transform(X_test2)

In [86]:
# Train the Support Vector Machine (SVM) classifier
svm_classifier2 = LinearSVC()
svm_classifier2.fit(X_train_tfidf2, y_train2)

## Compare coeficients

In [87]:
import numpy as np

In [88]:
c1 = svm_classifier.coef_
c2 = svm_classifier2.coef_

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Compute absolute difference
abs_diff = np.abs(c1 - c2)

# Flatten if needed (in case it's 2D with shape (1, n_features))
abs_diff = abs_diff.flatten()

# Get indices of top-k most different features
top_k = 20  # or any number you want
top_indices = np.argsort(abs_diff)[-top_k:][::-1]  # sort descending

# Get corresponding words and their differences
top_features = [(feature_names[i], abs_diff[i]) for i in top_indices]

# Print results
for word, diff in top_features:
    print(f"{word}: {diff:.4f}")



reuters: 38.5770
featured: 13.6529
image: 11.6087
getty: 10.0179
breaking: 9.5371
read: 9.3246
video: 9.1633
com: 7.9537
just: 6.1059
hillary: 6.0889
thursday: 5.9824
follow: 5.8351
wire: 5.8016
wednesday: 5.5066
flickr: 5.4739
watch: 5.4245
york: 5.1061
tuesday: 4.8989
washington: 4.7337
nyp: 4.7156
