1. Libraries

In [1]:
#Data Understanding and Preparation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')



In [3]:
len(fake)
len(true)

21417

In [4]:
fake['label'] = 1
true['label']=0

df_concated = pd.concat([fake, true])
permutation = np.random.permutation(len(df_concated))
df = df_concated.iloc[permutation]
df_concated


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [5]:
df.duplicated().sum()

209

In [6]:
df = df.drop_duplicates()

df.shape

(44689, 5)

In [7]:
nan_count_per_column = df.isnull().sum()

print(nan_count_per_column)

title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [8]:
df_no_date = df.drop(columns=['date'])

df_no_date

Unnamed: 0,title,text,subject,label
16308,Kenya opposition wants vote planned for Saturd...,NAIROBI (Reuters) - Kenya s main opposition pa...,worldnews,0
10449,#VeryFakeNews CNN Underestimated Their Opponen...,CNN is finally seeing the serious repercussion...,politics,1
7792,Trump hovers over tight U.S. House race in Flo...,ORLANDO (Reuters) - Ask John Mica why he stick...,politicsNews,0
7830,Latest Trump Campaign Ad Reveals His DISTURBI...,Donald Trump has released a campaign ad detail...,News,1
20588,Pope urges skeptical Colombians to accept peac...,"VILLAVICENCIO, Colombia (Reuters) - Pope Franc...",worldnews,0
...,...,...,...,...
10530,Protesters in Kentucky claim they were assault...,"LOUISVILLE, Ky. (Reuters) - Protesters at a Do...",politicsNews,0
2429,U.S. government delays Obama earnings-strippin...,WASHINGTON (Reuters) - The U.S. government on ...,politicsNews,0
10493,Republican Carson officially ends White House bid,WASHINGTON (Reuters) - Republican presidential...,politicsNews,0
2412,Trump tells Republicans to get back on healthc...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,0


Lets try to make text Preprocessing, where are going through these steps: 
* Lowercasing
* Removing Special Characters and Punctuation
* Tokenization
* Removing Stopwords
* Lemmatization (We'll prefer this over stemming as it's generally more effective for understanding the context of words).

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')  # for tokenization
nltk.download('stopwords')  # for stopwords
nltk.download('wordnet')  # for lemmatization



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/claramillekalo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/claramillekalo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/claramillekalo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and punctuation
    text = ''.join(char for char in text if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

def preprocess_subject(subject):
    # Convert text to lowercase
    subject = subject.lower()

    # Remove special characters and punctuation
    subject = ''.join(char for char in subject if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(subject)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

def preprocess_title(title):
    # Convert text to lowercase
    title = title.lower()

    # Remove special characters and punctuation
    title = ''.join(char for char in title if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(title)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

# Assuming 'df' is your DataFrame and it contains a column named 'text'
df_no_date['cleaned_text'] = df_no_date['text'].apply(preprocess_text)
df_no_date['cleaned_subject'] = df_no_date['subject'].apply(preprocess_subject)
df_no_date['cleaned_title'] = df_no_date['title'].apply(preprocess_title)


In [11]:
df_no_date.drop(columns=['title', 'subject', 'text'], inplace=True)


In [12]:
df_no_date

Unnamed: 0,label,cleaned_text,cleaned_subject,cleaned_title
3536,0,reuters zbigniew brzezinski served u president...,politicsnews,former u national security adviser brzezinski ...
21698,1,offensive choosing allow baby livethe prolife ...,leftnews,prolife license plate deemed patently offensiv...
12373,1,remember famous moment u national intelligence...,politics,slippery snake james clapper resigns time era ...
14273,1,come hell high water occupy movement going voi...,politics,obamas black life matter terrorist join people...
9293,1,president trump giving abc news pas fake news ...,politics,trump suggests people sue abc news massive sto...
...,...,...,...,...
2354,0,washington reuters u president donald trump cl...,politicsnews,trump close decision addressing chinese trade ...
16053,0,shanghai reuters day taking china leader 2012 ...,worldnews,xi top china party official make symbolic visi...
10003,1,woman shame1 muslim activist linda sarsour rem...,politics,muslim activist caught sending donation leftwi...
17089,0,brussels reuters catalonia regional government...,worldnews,catalan foreign affair chief say planning regi...


In [13]:
df_no_date.to_csv('Preprocessed.csv', index=False)

In [14]:
# Load your DataFrame
df = pd.read_csv('Preprocessed.csv')

# Count the number of missing values in each column
missing_values_count = df.isnull().sum()

# Print the number of missing values in each column
print("Missing values in each column:")
print(missing_values_count)

# Calculate the total number of missing values in the DataFrame
total_missing_values = missing_values_count.sum()
print("Total missing values in the dataset:", total_missing_values)


Missing values in each column:
label                0
cleaned_text       632
cleaned_subject      0
cleaned_title        0
dtype: int64
Total missing values in the dataset: 632


To ensure that our models and other features are able to be included and understood, we remove the rows with missing values below

In [15]:
# Remove rows with missing values in cleaned_text as seen from the printout above
df_cleaned = df.dropna()

# Save the cleaned DataFrame to a new file if needed
df_cleaned.to_csv('Preprocessed_clean.csv', index=False)

# Optionally, print the shape to see how many rows were dropped
print("Original DataFrame shape:", df.shape)
print("New DataFrame shape after removing rows with NaN:", df_cleaned.shape)


Original DataFrame shape: (44689, 4)
New DataFrame shape after removing rows with NaN: (44057, 4)


Below we vectorize the dataset to enable counting tokens in the texgt

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your dataset
subset_df = pd.read_csv('Preprocessed_clean.csv')

# Initialize a TF-IDF Vectorizer with max of 512 to fit the best practice of BERT while also decreasing the size of the dataset
vectorizer = TfidfVectorizer(max_features=512)  # You can adjust this if needed

# Apply TF-IDF to the 'cleaned_text' column
tfidf_matrix = vectorizer.fit_transform(subset_df['cleaned_text'])

# Convert TF-IDF matrix to a DataFrame to manipulate easily
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Count non-zero entries in each row, which represents the number of unique tokens used
subset_df['token_count'] = (tfidf_df != 0).sum(axis=1)

# Filter out rows where the number of tokens is greater than 512
filtered_df = subset_df[subset_df['token_count'] <= 512]

# Saving the filtered DataFrame
filtered_df.to_csv('filtered_dataset.csv', index=False)

# We then print the dataset to inspect whether this code removed any of the rows. 
print(filtered_df.tokens)


AttributeError: 'DataFrame' object has no attribute 'tokens'

In [None]:
print(filtered_df['token_count'])

0        49
1        68
2        54
3        47
4        33
         ..
44052    97
44053    59
44054    87
44055    70
44056    60
Name: token_count, Length: 44057, dtype: int64


As seen from the printed results none of the rows have been removed, as none of them are above 512
Below we examine the token_count, to determine wether we are able to use this as a way to create subsets and decrease the size of the dataset
We will examine it by looking at both the lowest and highest token_count to ensure we incorporate relevant articles


In [None]:
# Find the row with the maximum 'token_count'
max_token_row = filtered_df[filtered_df['token_count'] == filtered_df['token_count'].max()]

# Find the row with the minimum 'token_count'
min_token_row = filtered_df[filtered_df['token_count'] == filtered_df['token_count'].min()]

# Print the row with the maximum and minimum token count
print(max_token_row)
print(min_token_row)

       label                                       cleaned_text  \
29518      1  funny secret travel start believe bleed lyric ...   
36783      1  funny secret travel start believe bleed lyric ...   

      cleaned_subject                                      cleaned_title  \
29518      middleeast  medium tripwire ping pong pizza conspiracy pro...   
36783          usnews  medium tripwire ping pong pizza conspiracy pro...   

       token_count  
29518          360  
36783          360  
       label                                      cleaned_text  \
9          1             charlie leduff legend detroit classic   
448        1  httpswwwyoutubecomwatchtimecontinue2vijwclqckhd8   
731        1                httpswwwyoutubecomwatchvptbfkqk7gu   
1128       1                                           wow wow   
1299       1                                             enjoy   
...      ...                                               ...   
42080      1                                

As we can clearly see many rows seem to have 0 tokens, which means these are not understood well by the computer
This show a relevance in setting a minimum token_count and creating a subset based on this

In [None]:
# Define a minimum token count threshold
MIN_TOKENS = 70

# determine that the DataFrame should include only rows with 'token_count' being greater than or equal to MIN_TOKENS
filter_df_tokens = filtered_df[filtered_df['token_count'] >= MIN_TOKENS]

# Split the DataFrame into two based on the label
tokens_df_class_0 = filter_df_tokens[filter_df_tokens['label'] == 0]
tokens_df_class_1 = filter_df_tokens[filter_df_tokens['label'] == 1]

# Find the minimum count to balance the dataset
min_count = min(len(tokens_df_class_0), len(tokens_df_class_1))

# Randomly sample min_count rows from each DataFrame
subset_class_0 = tokens_df_class_0.sample(n=min_count, random_state=42)  # Ensures reproducibility
subset_class_1 = tokens_df_class_1.sample(n=min_count, random_state=42)

# Concatenate the two subsets to form a new balanced DataFrame
balanced_subset = pd.concat([subset_class_0, subset_class_1])

# Shuffle the rows to ensure random order
final_subset = balanced_subset.sample(frac=1, random_state=42).reset_index(drop=True)

# Optionally, save the subset to a new CSV file
final_subset.to_csv('balanced_subset.csv', index=False)

# Print some information about the subset
print("Subset shape:", final_subset.shape)
print("Class distribution in subset:\n", final_subset['label'].value_counts())



Subset shape: (13834, 5)
Class distribution in subset:
 1    6917
0    6917
Name: label, dtype: int64
