## Adding the necessary libraries 

In [1]:
import pandas as pd
import nltk
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package omw-1.4 to C:\Users\Tee Chang
[nltk_data]     Zen\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tee Chang
[nltk_data]     Zen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Tee Chang
[nltk_data]     Zen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Tee Chang
[nltk_data]     Zen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing the DataSets

In [2]:
# Load the two source CSVs. Both paths are relative, so the files are
# expected in the notebook's current working directory.
fake_df = pd.read_csv('Fake.csv')
real_df = pd.read_csv('True.csv')

## Preprocessing

In [3]:
# Preview the first rows of the frame loaded from Fake.csv
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
# Preview the first rows of the frame loaded from True.csv
real_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


## Merging the datasets

In [5]:
# Add the target label to each frame: 0 for rows from Fake.csv, 1 for rows from True.csv
fake_df['label'] = 0
real_df['label'] = 1

# Stack the two frames into one, renumbering the index from 0
data = pd.concat([fake_df, real_df], ignore_index=True)

# Shuffle the rows. A fixed random_state makes the order (and therefore every
# preview / downstream result) identical on each Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

data.head()

Unnamed: 0,title,text,subject,date,label
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0


## Cleaning the Merged Datasets

## Basic cleaning of text data

In [24]:
# Function to clean text data

# Lazily-filled cache for the resources clean_text needs. Loading the stop-word
# list and constructing the stemmer once, instead of on every call, avoids
# repeating that work for each row the function is applied to.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (stop-word set, stemmer) pair, creating it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['stemmer']


def clean_text(text):
    """Normalise a piece of text into a space-separated string of stemmed tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase,
      3. tokenize with ``nltk.word_tokenize``,
      4. drop English stop words,
      5. Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text. ``re.sub`` is called on it directly, so a non-string value
        (e.g. NaN) raises ``TypeError``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    stop_words, stemmer = _get_clean_text_resources()

    # Remove special characters and digits, then lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower()

    # Tokenize, filter stop words on the un-stemmed token, then stem
    tokens = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]

    # Join the cleaned tokens back into a string
    return ' '.join(tokens)

# Build cleaned versions of both the headline ('title') and the body ('text'),
# leaving the raw columns untouched
for raw_column, clean_column in (('title', 'title_clean'), ('text', 'text_clean')):
    data[clean_column] = data[raw_column].apply(clean_text)
data.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1,syria say militari jet down northern hama pilo...,amman reuter syria arm forc said insurg down m...
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...


## Removing StopWords

In [26]:
import string
# Vocabulary to drop: NLTK's English stop words plus every character in string.punctuation
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated word found in ``stop`` removed.

    Membership is tested on the lowercased word; surviving words keep their
    original casing and are re-joined with single spaces.
    """
    kept_words = [
        word.strip()
        for word in text.split()
        if word.strip().lower() not in stop
    ]
    return " ".join(kept_words)
# Run the stop-word filter over both cleaned columns.
# NOTE(review): these columns were already stemmed by clean_text, so any stem
# that happens to equal a stop word is removed here as well — confirm intended.
for column in ('title_clean', 'text_clean'):
    data[column] = data[column].apply(remove_stopwords)
data.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1,syria say militari jet northern hama pilot kill,amman reuter syria arm forc said insurg milita...
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...


## Strip html

In [27]:
from bs4 import BeautifulSoup

def strip_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Strip HTML markup from both cleaned columns.
# NOTE(review): clean_text has already replaced every non-letter character with
# a space, so no '<' or '>' can remain in these columns; this pass appears to be
# a no-op unless it is run on the raw columns / before clean_text.
for column in ('title_clean', 'text_clean'):
    data[column] = data[column].apply(strip_html)
data.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1,syria say militari jet northern hama pilot kill,amman reuter syria arm forc said insurg milita...
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...


## Removing square brackets

In [28]:
def remove_between_square_brackets(text):
    """Delete every ``[...]`` group (brackets and the text inside them) from ``text``.

    The pattern is written as a raw string so that ``\\[`` and ``\\]`` reach the
    regex engine as written instead of being treated as (invalid) Python string
    escapes, which newer Python versions warn about.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with each bracketed group removed; surrounding whitespace is
        left as is.
    """
    return re.sub(r'\[[^]]*\]', '', text)
# Remove bracketed groups from both cleaned columns.
# NOTE(review): clean_text has already replaced every non-letter character with
# a space, so no '[' or ']' can remain in these columns; this pass appears to be
# a no-op unless it is run on the raw columns / before clean_text.
for column in ('title_clean', 'text_clean'):
    data[column] = data[column].apply(remove_between_square_brackets)
data.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1,syria say militari jet northern hama pilot kill,amman reuter syria arm forc said insurg milita...
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...


## Lemmatisation

In [29]:
# Lemmatisation (mapping each token to its WordNet lemma)
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` with ``nltk.word_tokenize``, lemmatize each token, and re-join with spaces."""
    return ' '.join(map(lemmatizer.lemmatize, nltk.word_tokenize(text)))

# Lemmatize both cleaned columns.
# NOTE(review): these columns were already Porter-stemmed by clean_text —
# confirm that lemmatising stemmed tokens (rather than the original words) is intended.
for column in ('title_clean', 'text_clean'):
    data[column] = data[column].apply(lemmatize_text)
data.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1,syria say militari jet northern hama pilot kill,amman reuter syria arm forc said insurg milita...
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...


## Removing urls

In [30]:
# remove_urls
def remove_urls(text):
    """Return ``text`` with every ``http://`` / ``https://`` / ``www.`` URL deleted.

    A URL is taken to run up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Remove URLs from both cleaned columns.
# NOTE(review): clean_text has already replaced every non-letter character with
# a space, so neither '://' nor 'www.' can remain in these columns; this pass
# appears to be a no-op unless it is run on the raw columns / before clean_text.
for column in ('title_clean', 'text_clean'):
    data[column] = data[column].apply(remove_urls)
data.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1,syria say militari jet northern hama pilot kill,amman reuter syria arm forc said insurg milita...
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...


## Removing html tags

In [31]:
# remove html tags
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    Matching is non-greedy and does not cross newlines, so a '<' whose closing
    '>' is on another line is left untouched.
    """
    return re.sub(r'<.*?>', r'', text)

# Remove HTML tags from both cleaned columns.
# NOTE(review): clean_text has already replaced every non-letter character with
# a space, so no '<' or '>' can remain in these columns; this pass appears to be
# a no-op unless it is run on the raw columns / before clean_text.
for column in ('title_clean', 'text_clean'):
    data[column] = data[column].apply(remove_html_tags)
data.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,"February 19, 2017",0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,"December 26, 2017",1,syria say militari jet northern hama pilot kill,amman reuter syria arm forc said insurg milita...
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,"July 27, 2017",1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,"December 25, 2017",0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,"October 8, 2017",0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...


## Saving the cleaned dataset as cleaned_news_data.csv

In [32]:
import os

# Save the cleaned dataset.
# The path is relative, so the file lands in the current working directory;
# index=False keeps the row index out of the CSV.
output_file = 'cleaned_news_data.csv'
data.to_csv(output_file, index=False)

# Print the full path of the saved file (working directory + file name)
current_working_directory = os.getcwd()
output_file_path = os.path.join(current_working_directory, output_file)
print(f"Cleaned dataset saved at: {output_file_path}")

Cleaned dataset saved at: C:\Users\Tee Chang Zen\Documents\OneDrive\Sem 2 Mods\DSAI SC1015\Project\cleaned_news_data.csv


## Further cleaning: converting the date column of the cleaned dataset from free-text dates (e.g. "December 31, 2017") to YYYY-MM

In [33]:
# Reload the cleaned dataset from the CSV written earlier in the notebook.
# (pandas is re-imported here although it is already imported in the first cell.)
import pandas as pd
df = pd.read_csv('cleaned_news_data.csv')

In [34]:
# Parse the free-text dates; errors='coerce' turns anything unparseable into NaT
# instead of raising
parsed_dates = pd.to_datetime(df['date'], errors='coerce')

# Extract year and month into their own columns.
# NOTE(review): the preview shows these as floats (2017.0, 2.0) — presumably
# because some rows failed to parse and became NaN; confirm.
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month

# Overwrite the date column with "YYYY-MM" strings
df['date'] = parsed_dates.dt.strftime('%Y-%m')
df.head()

Unnamed: 0,title,text,subject,date,label,title_clean,text_clean,year,month
0,That Moment Conservative Star Milo Yiannopoul...,"If you have not yet heard, alt-Right (that mea...",News,2017-02,0,moment conserv star milo yiannopoulo said ok o...,yet heard alt right mean nazi scumbag milo yia...,2017.0,2.0
1,Syria says military jet downed in northern Ham...,AMMAN (Reuters) - Syria s armed forces said in...,worldnews,2017-12,1,syria say militari jet northern hama pilot kill,amman reuter syria arm forc said insurg milita...,2017.0,12.0
2,European businesses working in Russia slam pro...,MOSCOW (Reuters) - The Association of European...,politicsNews,2017-07,1,european busi work russia slam propos u sanction,moscow reuter associ european busi aeb said th...,2017.0,7.0
3,A POEM: ‘Twas The Night Before CNN’s Christmas…’,ACR s BOILER ROOM presents a Christmas poem Tw...,US_News,2017-12,0,poem twa night cnn christma,acr boiler room present christma poem twa nigh...,2017.0,12.0
4,A New Low: Trump Exploits Las Vegas Shooting ...,Donald Trump doesn t exactly hide the fact tha...,News,2017-10,0,new low trump exploit la vega shoot rais money...,donald trump exactli hide fact sociopath look ...,2017.0,10.0


In [35]:
import os

# Save the date-normalised frame.
# This uses the same file name as the earlier save, so it overwrites that file
# with `df` (which now also carries the 'year' and 'month' columns).
output_file = 'cleaned_news_data.csv'
df.to_csv(output_file, index=False)

# Print the full path of the saved file (working directory + file name)
current_working_directory = os.getcwd()
output_file_path = os.path.join(current_working_directory, output_file)
print(f"Cleaned dataset saved at: {output_file_path}")

Cleaned dataset saved at: C:\Users\Tee Chang Zen\Documents\OneDrive\Sem 2 Mods\DSAI SC1015\Project\cleaned_news_data.csv


## Checking for nan values in cleaned data set

In [36]:
# Count missing values per column.
# NOTE(review): this inspects the in-memory `data` frame, not `df` (the copy
# reloaded from cleaned_news_data.csv). The float check on `df` further down is
# what covers the reloaded data — confirm which frame was meant here.
data.isna().sum()

title          0
text           0
subject        0
date           0
label          0
title_clean    0
text_clean     0
dtype: int64

#### No NaN values present in the in-memory `data` frame (the copy reloaded from the CSV, `df`, is checked separately below)

## Checking for float values in text_clean

In [38]:
# Select the rows of the reloaded frame whose 'text_clean' entry is a Python float
text_clean_is_float = df['text_clean'].map(lambda value: isinstance(value, float))
float_rows = df[text_clean_is_float]
print(float_rows)

                                                   title text  \
43     ROB SCHNEIDER Nails The Russia Conspiracy Theo...        
122    YOU’LL LOVE MIKE ROWE’S Awesome Response To An...        
224    SHARE THIS EVERYWHERE! DISEASED REFUGEES Get S...        
395    HILARIOUS TRUMP CHRISTMAS VIDEO: “It’s The Mos...        
562    NANCY PELOSI Tries to Lead Democrats in Chants...        
...                                                  ...  ...   
44640  HYSTERICAL! TUCKER CARLSON Slams Geraldo For P...        
44690  VICIOUS! PORTLAND RIOTERS ATTACK Pregnant Woma...        
44746  DONALD TRUMP Sits On Obama’s Right…Why This Is...        
44793  CNN’S FAREED ZAKARIA Busts Out a Profanity Fil...        
44810  NIGEL FARAGE ON TRUMP/MERKEL POWWOW: Merkel’s ...        

               subject     date  label  \
43            politics  2017-03      0   
122          left-news  2016-12      0   
224           politics  2017-03      0   
395           politics  2016-12      0   
562      

## Remove rows with float value

In [47]:
import pandas as pd

# Read the cleaned CSV back from disk so the filtering below starts from the saved file
df = pd.read_csv('cleaned_news_data.csv')

# Function to check if a value can be interpreted as a float
def is_float(value):
    """Return True when ``float(value)`` succeeds, False otherwise.

    This is a convertibility test, not a type test: actual floats (including
    NaN) return True, and so do numeric-looking strings such as ``'3.5'``.
    Values that ``float()`` rejects — ordinary text (``ValueError``) as well as
    ``None`` or other non-scalar objects (``TypeError``) — return False instead
    of raising, so the function is safe to use with ``Series.apply``.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True

# Apply the is_float function to the 'text_clean' column to create a boolean mask
float_mask = df['text_clean'].apply(is_float)

# Keep only rows where the 'text_clean' column is not float-like
df_no_floats = df[~float_mask]

# Save ONLY the filtered frame. Writing the unfiltered `df` to the same path
# afterwards would overwrite this file and put the dropped rows back on disk.
# The file name is set here so the cell does not rely on a variable left over
# from an earlier cell.
output_file = 'cleaned_news_data.csv'
df_no_floats.to_csv(output_file, index=False)

# Print the full path of the saved file
current_working_directory = os.getcwd()
output_file_path = os.path.join(current_working_directory, output_file)
print(f"Cleaned dataset saved at: {output_file_path}")


Cleaned dataset saved at: C:\Users\Tee Chang Zen\Documents\OneDrive\Sem 2 Mods\DSAI SC1015\Project\cleaned_news_data.csv


In [48]:
# Verify the filtered frame: `df` still holds the unfiltered rows read from the
# CSV, so the check has to run on `df_no_floats` to confirm the removal.
float_rows = df_no_floats[df_no_floats['text_clean'].apply(lambda x: isinstance(x, float))]
print(float_rows)

Empty DataFrame
Columns: [title, text, subject, date, label, title_clean, text_clean, year, month]
Index: []


#### No rows with float values in text_clean column