# Import libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

# DATA COLLECTION

LOAD THE DATASETS FROM KAGGLE

In [2]:
data_true = pd.read_csv("Fake.csv")
data_fake = pd.read_csv("True.csv")

In [3]:
data_true.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
print(data_true.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
None


In [5]:
data_fake.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
print(data_fake.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
None


In [7]:
#Removing the location and source in the fake dataset
data_fake['text'] = data_fake['text'].apply(lambda x: x.split(') - ', 1)[-1])
data_fake.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",The head of a conservative Republican faction ...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,Transgender people will be allowed for the fir...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,The special counsel investigation of links bet...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,Trump campaign adviser George Papadopoulos tol...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,President Donald Trump called on the U.S. Post...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...",The White House said on Friday it was set to k...,politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...",President Donald Trump said on Thursday he bel...,politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,Alabama Secretary of State John Merrill said h...,politicsNews,"December 28, 2017"


# Merge the datasets

In [8]:
# Add a label column to each dataset
data_true['label'] = 'True'
data_fake['label'] = 'Fake'

# Merge 
merged_data = pd.concat([data_true, data_fake], axis=0).reset_index(drop=True)

merged_data.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",True
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",True
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",True
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",True
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",True


In [9]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


# DATA CLEANING

In [10]:
# Define a function to handle different date formats
def convert_date_format(date_str):
    # List of possible date formats
    formats = [
        "%B %d, %Y",   # e.g., December 31, 2017
        "%b %d, %Y",   # e.g., Dec 31, 2017
        "%d-%b-%y",    # e.g., 31-Dec-17
        "%d-%b-%Y",    # e.g., 31-Dec-2017
        "%d %B %Y",    # e.g., 31 December 2017
        "%d/%m/%Y",    # e.g., 31/12/2017
        "%m/%d/%Y",    # e.g., 12/31/2017
        "%Y-%m-%d",    # e.g., 2017-12-31
        "%d-%b-%y",    # e.g., 14-Feb-18
        "%d %b %y",    # e.g., 14 Feb 18
        "%b %d, %y"    # e.g., Feb 14, 18
    ]
    
    for fmt in formats:
        try:
            return datetime.strptime(date_str.strip(), fmt)
        except ValueError:
            continue
    return pd.NaT

# Apply the custom function to the date column
merged_data['date'] = merged_data['date'].apply(convert_date_format)

# Verify
print(merged_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   title    44898 non-null  object        
 1   text     44898 non-null  object        
 2   subject  44898 non-null  object        
 3   date     44888 non-null  datetime64[ns]
 4   label    44898 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 1.7+ MB
None


In [11]:
merged_data.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,True
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,True
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,True
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,True
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,True


In [12]:
# Get a count of each unique value in the 'subject' column
subject_counts = merged_data['subject'].value_counts()

# Display the counts
print(subject_counts)

subject
politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64


In [13]:
# Combine 'politics' and 'politicsNews' into a single 'Politics' category
merged_data['subject'] = merged_data['subject'].replace({'politicsNews': 'Politics', 'politics': 'Politics'})

# Verify
print(merged_data['subject'].value_counts())

subject
Politics           18113
worldnews          10145
News                9050
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: count, dtype: int64


In [14]:
# Filter for the 'Politics' category
politics_data = merged_data[merged_data['subject'] == 'Politics']

# Count the number of 'True' and 'False' labels in the 'Politics' category
label_counts = politics_data['label'].value_counts()

print(label_counts)

label
Fake    11272
True     6841
Name: count, dtype: int64


In [15]:
# Transform the 'label' column
politics_data['label'] = politics_data['label'].map({'True': 0, 'Fake': 1})

# Verify the transformation
print(politics_data['label'].value_counts())

label
1    11272
0     6841
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  politics_data['label'] = politics_data['label'].map({'True': 0, 'Fake': 1})


In [16]:
# Save the filtered dataset to a CSV file
politics_data.to_csv('cleaned_politics_data.csv', index=False)

print("Filtered dataset saved as 'cleaned_politics_data.csv'.")

Filtered dataset saved as 'cleaned_politics_data.csv'.
