In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [None]:
# Load the preprocessed news CSV and reduce it to the two columns used below.
import os

pre_file_path = os.path.join('preprocessed', 'pre-processed.csv')
pre_df = (
    pd.read_csv(pre_file_path, index_col=0)         # first CSV column becomes the index
    .rename_axis(None)                              # drop the index name
    .rename(columns={"preprocessed_news": "text"})  # shorter name for the text column
    .loc[:, ["text", "label"]]                      # keep only text and label
)
pre_df

In [None]:
# Partition the articles by their label into a fake subset and a true subset.
is_fake = pre_df["label"].eq('fake')
is_true = pre_df["label"].eq('true')
fake_df = pre_df.loc[is_fake]
true_df = pre_df.loc[is_true]

In [None]:
# Fit one default CountVectorizer per class so each learns its own vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

fake_bag = CountVectorizer()
fake_bag.fit(fake_df["text"])

true_bag = CountVectorizer()
true_bag.fit(true_df["text"])

In [None]:
# ``CountVectorizer.vocabulary_`` maps each term to its *column index* in the
# document-term matrix, not to how often the term occurs. The cells below
# subtract, sum and normalise the bag values as if they were occurrence
# counts, so build real ``term -> count`` dictionaries here instead.
def _term_counts(vectorizer, texts):
    """Return ``{term: total occurrences in texts}`` for a fitted vectorizer.

    Parameters
    ----------
    vectorizer : fitted CountVectorizer
        Supplies both the tokenisation and the term -> column mapping.
    texts : iterable of str
        Documents whose term occurrences are counted.

    Returns
    -------
    dict
        One entry per vocabulary term, in ``vocabulary_`` order, holding the
        number of times the term occurs across all ``texts``.
    """
    # Column j of the document-term matrix holds the per-document counts of the
    # term whose vocabulary_ index is j; summing over rows gives corpus totals.
    totals = np.asarray(vectorizer.transform(texts).sum(axis=0)).ravel()
    return {term: int(totals[col]) for term, col in vectorizer.vocabulary_.items()}


# NOTE: this cell rebinds fake_bag/true_bag from fitted vectorizers to plain
# dicts, so re-run the fitting cell above before running this one again.
fake_bag = _term_counts(fake_bag, fake_df["text"])
true_bag = _term_counts(true_bag, true_df["text"])

In [None]:
# Build a signed per-word score: the word's value in the true bag minus its
# value in the fake bag.

# Union of the words that appear in either bag
all_words = set([*fake_bag, *true_bag])

# A word missing from one bag contributes 0 on that side, so fake-only words
# get a negative score and true-only words keep their true-bag value.
bag = {
    word: true_bag.get(word, 0) - fake_bag.get(word, 0)
    for word in all_words
}


In [None]:
# Rank the (word, score) pairs in ascending order of score, then show the last
# pair, i.e. the word with the largest (true - fake) score.
worst_to_best = list(bag.items())
worst_to_best.sort(key=lambda pair: pair[1])
worst_to_best[-1]

In [None]:
# Number of ranked entries (one per key of `bag`, i.e. per distinct word).
len(worst_to_best)

In [None]:
# Number of distinct words (dictionary keys) in each bag, as (fake, true)
len(fake_bag), len(true_bag)



In [None]:
# Sum the values stored in each bag, then report each bag's share of the
# combined total.
# NOTE(review): these sums are word totals only if the bag values are
# occurrence counts - confirm what the bags hold at this point.
numberOfFakeWords, numberOfTrueWords = (
    sum(word_values.values()) for word_values in (fake_bag, true_bag)
)
numberOfWords = numberOfFakeWords + numberOfTrueWords

fake_share = numberOfFakeWords / numberOfWords
true_share = numberOfTrueWords / numberOfWords
print(fake_share, true_share)

In [None]:
# Collect the per-word (true - fake) scores into a single-column frame.
distribution_df = pd.DataFrame(bag.values(), columns=["Rate"])

# Histogram of the scores (100 bins) with a KDE overlay, drawn on an explicit
# axes object rather than through the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(distribution_df["Rate"], bins=100, kde=True, ax=ax)
plt.show()

In [None]:
# Scale every bag value by the summed values of its own bag so the two classes
# are comparable regardless of how large each bag's total is.
normalized_fake_bag = {
    word: value / numberOfFakeWords for word, value in fake_bag.items()
}
normalized_true_bag = {
    word: value / numberOfTrueWords for word, value in true_bag.items()
}

# Signed difference of the scaled values (true - fake). A word present in only
# one bag keeps that bag's scaled value, negated when it comes from the fake bag.
normalized_bag = {}
for word in all_words:
    fake_rate = normalized_fake_bag.get(word)
    if fake_rate is None:
        # true-only word
        normalized_bag[word] = normalized_true_bag[word]
        continue
    true_rate = normalized_true_bag.get(word)
    if true_rate is None:
        # fake-only word
        normalized_bag[word] = -fake_rate
    else:
        normalized_bag[word] = true_rate - fake_rate

In [None]:
# One row per word with its normalised (true - fake) rate.
distribution_df = pd.DataFrame(normalized_bag.items(), columns=["Word", "Rate"])

# Histogram of the normalised rates (100 bins) with a KDE overlay, drawn on an
# explicit axes object rather than through the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(distribution_df["Rate"], bins=100, kde=True, ax=ax)
plt.show()

In [None]:
# Keep the words whose normalised rate lies below the -5e-5 cut-off, i.e. words
# weighted more heavily in the fake bag than in the true bag.
is_fake_leaning = distribution_df["Rate"].lt(-5 * 1e-5)
worst_words_df = distribution_df[is_fake_leaning]
worst_words_df

In [None]:
# Pull the "Word" column of the selected rows out as an array so it can be
# iterated over once per text when counting matches.
check_worst_word =  worst_words_df["Word"].values

In [None]:
# Count how often the selected fake-leaning words occur in fake vs. true texts.
def _count_word_hits(texts, words, tokenize):
    """Count the (text, word) pairs in which ``word`` is a token of ``text``.

    Parameters
    ----------
    texts : iterable of str
        Documents to scan.
    words : iterable of str
        Words to look for in every document.
    tokenize : callable
        Maps one document to an iterable of its tokens.

    Returns
    -------
    int
        Number of (text, word) combinations where the word is one of the
        text's tokens. Each pair counts at most once, however often the word
        repeats inside the text.
    """
    hits = 0
    for text in texts:
        # Tokenise once per text; set membership is a whole-token match.
        tokens = set(tokenize(text))
        hits += sum(1 for word in words if word in tokens)
    return hits


# `word in text` on a raw string is a substring test, so a short vocabulary
# word would also match inside longer words and inflate the counts. The words
# come from a default CountVectorizer vocabulary, so split each text with that
# same default analyzer and compare whole tokens instead.
tokenize = CountVectorizer().build_analyzer()

countFake = _count_word_hits(fake_df["text"].values, check_worst_word, tokenize)
countTrue = _count_word_hits(true_df["text"].values, check_worst_word, tokenize)

countFake, countTrue


In [None]:
# Each hit count divided by the summed values of the corresponding bag, as (fake, true).
# NOTE(review): the numerators count (text, word) matches while the denominators
# are sums over bag values - confirm these units are meant to be compared.
(countFake / numberOfFakeWords, countTrue / numberOfTrueWords) 

In [None]:
# Number of distinct words in each bag, as (fake, true); len() of a dict is its key count.
len(fake_bag), len(true_bag)

In [None]:
# Ratio of the summed fake-bag values to the summed true-bag values.
numberOfFakeWords/numberOfTrueWords