1. Libraries

In [1]:
#Data Understanding and Preparation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')



In [3]:
len(fake)
len(true)

21417

In [4]:
fake['label'] = 1
true['label']=0

df_concated = pd.concat([fake, true])
permutation = np.random.permutation(len(df_concated))
df = df_concated.iloc[permutation]
df_concated


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0


In [5]:
df.duplicated().sum()

209

In [6]:
df = df.drop_duplicates()

df.shape

(44689, 5)

In [7]:
nan_count_per_column = df.isnull().sum()

print(nan_count_per_column)

title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [8]:
df_no_date = df.drop(columns=['date'])

df_no_date

Unnamed: 0,title,text,subject,label
464,Dan Rather: Mueller’s Russia Investigation Ha...,Legendary journalist Dan Rather says that Robe...,News,1
17421,Death toll from blasts in Somalia's capital Mo...,MOGADISHU (Reuters) - More than 200 people wer...,worldnews,0
8239,U.S. House Republicans to discuss whether to i...,WASHINGTON (Reuters) - Members of the Republic...,politicsNews,0
14076,THIS IS BIG! Tennessee Votes To Sue The Feds O...,This is big! In the name of security and state...,politics,1
11886,HIGH ROAD! MARTIN LUTHER KING III MEETS WITH T...,Martin Luther King III met with president-elec...,politics,1
...,...,...,...,...
20185,VIDEO SHOWS SCARY TRUTH About What Decades Of ...,"We live near the city of Detroit, and anyone c...",left-news,1
14292,Rosneft's Sechin to miss hearing at ex-ministe...,MOSCOW (Reuters) - The head of Russian state o...,worldnews,0
509,Trump Supporter In Phoenix Just Threatened Jo...,"Donald Trump, in his ongoing effort to spur on...",News,1
8988,"For Donald Trump, going on about golf is par f...","TURNBERRY, Scotland (Reuters) - Donald Trump f...",politicsNews,0


Lets try to make text Preprocessing, where are going through these steps: 
* Lowercasing
* Removing Special Characters and Punctuation
* Tokenization
* Removing Stopwords
* Lemmatization (We'll prefer this over stemming as it's generally more effective for understanding the context of words).

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')  # for tokenization
nltk.download('stopwords')  # for stopwords
nltk.download('wordnet')  # for lemmatization



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/claramillekalo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/claramillekalo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/claramillekalo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and punctuation
    text = ''.join(char for char in text if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

def preprocess_subject(subject):
    # Convert text to lowercase
    subject = subject.lower()

    # Remove special characters and punctuation
    subject = ''.join(char for char in subject if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(subject)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

def preprocess_title(title):
    # Convert text to lowercase
    title = title.lower()

    # Remove special characters and punctuation
    title = ''.join(char for char in title if char.isalnum() or char.isspace())

    # Tokenization
    tokens = word_tokenize(title)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back to form the cleaned text
    return ' '.join(tokens)

# Assuming 'df' is your DataFrame and it contains a column named 'text'
df_no_date['cleaned_text'] = df_no_date['text'].apply(preprocess_text)
df_no_date['cleaned_subject'] = df_no_date['subject'].apply(preprocess_subject)
df_no_date['cleaned_title'] = df_no_date['title'].apply(preprocess_title)


In [11]:
df_no_date.drop(columns=['title', 'subject', 'text'], inplace=True)


In [12]:
df_no_date

Unnamed: 0,label,cleaned_text,cleaned_subject,cleaned_title
464,1,legendary journalist dan rather say robert mue...,news,dan rather muellers russia investigation trump...
17421,0,mogadishu reuters 200 people killed twin bomb ...,worldnews,death toll blast somalia capital mogadishu top...
8239,0,washington reuters member republican majority ...,politicsnews,u house republican discus whether impeach irs ...
14076,1,big name security state sovereignty tennessee ...,politics,big tennessee vote sue fed refugee resettlement
11886,1,martin luther king iii met presidentelect dona...,politics,high road martin luther king iii meet trump ti...
...,...,...,...,...
20185,1,live near city detroit anyone could literally ...,leftnews,video show scary truth decade democrat ruled d...
14292,0,moscow reuters head russian state oil giant ro...,worldnews,rosnefts sechin miss hearing exminister corrup...
509,1,donald trump ongoing effort spur civil war hol...,news,trump supporter phoenix threatened john mccain...
8988,0,turnberry scotland reuters donald trump flew n...,politicsnews,donald trump going golf par course


In [13]:
df_no_date.to_csv('Preprocessed.csv', index=False)

In [14]:
# Load your DataFrame
df = pd.read_csv('Preprocessed.csv')

# Count the number of missing values in each column
missing_values_count = df.isnull().sum()

# Print the number of missing values in each column
print("Missing values in each column:")
print(missing_values_count)

# Calculate the total number of missing values in the DataFrame
total_missing_values = missing_values_count.sum()
print("Total missing values in the dataset:", total_missing_values)


Missing values in each column:
label                0
cleaned_text       632
cleaned_subject      0
cleaned_title        0
dtype: int64
Total missing values in the dataset: 632


To ensure that our models and other features are able to be included and understood, we remove the rows with missing values below

In [15]:
# Remove rows with missing values in cleaned_text as seen from the printout above
df_cleaned = df.dropna()

# Save the cleaned DataFrame to a new file if needed
df_cleaned.to_csv('Preprocessed_clean.csv', index=False)

# Optionally, print the shape to see how many rows were dropped
print("Original DataFrame shape:", df.shape)
print("New DataFrame shape after removing rows with NaN:", df_cleaned.shape)


Original DataFrame shape: (44689, 4)
New DataFrame shape after removing rows with NaN: (44057, 4)


Below we vectorize the dataset to enable counting tokens in the text

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your dataset
subset_df = pd.read_csv('Preprocessed_clean.csv')

# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=512)  # You can adjust this if needed

# Apply TF-IDF to the 'cleaned_text' column
tfidf_matrix = vectorizer.fit_transform(subset_df['cleaned_text'])

# Convert TF-IDF matrix to a DataFrame to manipulate easily
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Count non-zero entries in each row, which represents the number of unique tokens used
subset_df['token_count'] = (tfidf_df != 0).sum(axis=1)

# Filter out rows where the number of tokens is greater than 512
filtered_df = subset_df[subset_df['token_count'] <= 512]

# Save the filtered DataFrame
filtered_df.to_csv('filtered_dataset.csv', index=False)

# Optionally, inspect the first few rows of the filtered DataFrame
print(filtered_df.head())



   label                                       cleaned_text cleaned_subject  \
0      1  legendary journalist dan rather say robert mue...            news   
1      0  mogadishu reuters 200 people killed twin bomb ...       worldnews   
2      0  washington reuters member republican majority ...    politicsnews   
3      1  big name security state sovereignty tennessee ...        politics   
4      1  martin luther king iii met presidentelect dona...        politics   

                                       cleaned_title  token_count  
0  dan rather muellers russia investigation trump...           59  
1  death toll blast somalia capital mogadishu top...           71  
2  u house republican discus whether impeach irs ...           74  
3    big tennessee vote sue fed refugee resettlement           45  
4  high road martin luther king iii meet trump ti...           30  


In [20]:
print(filtered_df['token_count'])

0        59
1        71
2        74
3        45
4        30
         ..
44052    42
44053    74
44054    52
44055    77
44056    75
Name: token_count, Length: 44057, dtype: int64


As seen from the printed results none of the rows have been removed, as none of them are above 512
Below we examine the token_count, to determine wether we are able to use this as a way to create subsets and decrease the size of the dataset
We will examine it by looking at both the lowest and highest token_count to ensure we incorporate relevant articles


In [21]:
# Find the row with the maximum 'token_count'
max_token_row = filtered_df[filtered_df['token_count'] == filtered_df['token_count'].max()]

# Find the row with the minimum 'token_count'
min_token_row = filtered_df[filtered_df['token_count'] == filtered_df['token_count'].min()]

# Print the row with the maximum and minimum token count
print(max_token_row)
print(min_token_row)

       label                                       cleaned_text  \
18670      1  funny secret travel start believe bleed lyric ...   
23908      1  funny secret travel start believe bleed lyric ...   

      cleaned_subject                                      cleaned_title  \
18670          usnews  medium tripwire ping pong pizza conspiracy pro...   
23908      middleeast  medium tripwire ping pong pizza conspiracy pro...   

       token_count  
18670          360  
23908          360  
       label                                       cleaned_text  \
635        1  httpstwittercomriggsreportstatus79868622488195...   
1502       1  httpsfedupwpenginecomwpcontentuploads201504hil...   
1755       1                httpswwwyoutubecomwatchvpjeoojypnck   
1777       1  inherent vice capitalism unequal sharing bless...   
2323       1                httpswwwyoutubecomwatchvswbypij7cq8   
...      ...                                                ...   
42166      1                         

As we can clearly see many rows seem to have 0 tokens, which means these are not understood well by the computer
This show a relevance in setting a minimum token_count and creating a subset based on this

In [22]:
# Define a minimum token count threshold
MIN_TOKENS = 70

# determine that the DataFrame should include only rows with 'token_count' being greater than or equal to MIN_TOKENS
filter_df_tokens = filtered_df[filtered_df['token_count'] >= MIN_TOKENS]

# Split the DataFrame into two based on the label
tokens_df_class_0 = filter_df_tokens[filter_df_tokens['label'] == 0]
tokens_df_class_1 = filter_df_tokens[filter_df_tokens['label'] == 1]

# Find the minimum count to balance the dataset
min_count = min(len(tokens_df_class_0), len(tokens_df_class_1))

# Randomly sample min_count rows from each DataFrame
subset_class_0 = tokens_df_class_0.sample(n=min_count, random_state=42)  # Ensures reproducibility
subset_class_1 = tokens_df_class_1.sample(n=min_count, random_state=42)

# Concatenate the two subsets to form a new balanced DataFrame
balanced_subset = pd.concat([subset_class_0, subset_class_1])

# Shuffle the rows to ensure random order
final_subset = balanced_subset.sample(frac=1, random_state=42).reset_index(drop=True)

# Optionally, save the subset to a new CSV file
final_subset.to_csv('balanced_subset.csv', index=False)

# Print some information about the subset
print("Subset shape:", final_subset.shape)
print("Class distribution in subset:\n", final_subset['label'].value_counts())



Subset shape: (13834, 5)
Class distribution in subset:
 1    6917
0    6917
Name: label, dtype: int64
