In [1]:
import pandas as pd
import os

In [2]:
def get_base_path():
    """Return the parent of the directory this code is running from.

    When ``__file__`` is defined, the starting directory is the one holding
    that file. When it is not defined (looking it up raises ``NameError``),
    the current working directory is used instead.

    Returns:
        str: The directory one level above the starting directory.
    """
    try:
        source_file = __file__
    except NameError:
        # No __file__ available: fall back to wherever the process is running.
        start_dir = os.getcwd()
    else:
        start_dir = os.path.dirname(os.path.abspath(source_file))

    # Step up one level from the starting directory.
    return os.path.dirname(start_dir)

In [3]:
# Resolve the base directory once, then build both raw-data paths beneath
# <base>/data/raw.
base_path = get_base_path()
raw_dir = os.path.join(base_path, 'data', 'raw')
fake_path = os.path.join(raw_dir, 'politifact_fake.csv')
real_path = os.path.join(raw_dir, 'politifact_real.csv')

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"Base path: {base_path}")
print(f"Fake path: {fake_path}")
print(f"Real path: {real_path}")

Base path: C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector
Fake path: C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector\data\raw\politifact_fake.csv
Real path: C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector\data\raw\politifact_real.csv


In [4]:
# Read both raw CSVs and tag every row with its class: 0 = fake, 1 = real.
df_fake = pd.read_csv(fake_path).assign(label=0)
df_real = pd.read_csv(real_path).assign(label=1)

# Stack the two frames, then shuffle with a fixed seed so the classes are
# interleaved reproducibly; finally renumber the index from 0.
df = (
    pd.concat([df_fake, df_real], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [5]:
# Preview the first rows of the merged, shuffled frame.
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact14885,www.breakingnews247.net/59bc13819a723/world-s-...,World's most popular candy to be removed from ...,908193166632812545\t908318679821160450\t908327...,0
1,politifact1106,http://frwebgate.access.gpo.gov/cgi-bin/getdoc...,Browse Congressional Bills,2761266599\t3283658037\t3439787429\t5180526172...,1
2,politifact1678,http://abcnews.go.com/ThisWeek/video/supreme-c...,Supreme Court Vacancy Video,,1
3,politifact370,http://tonto.eia.doe.gov/dnav/pet/pet_move_wkl...,U.S. Imports & Exports,735229171086065664\t735591660080488449\t736678...,1
4,politifact15263,http://sciencevibe.com/2018/04/11/dying-78-yea...,Dying 78 Year Old CIA Agent Admits To Killing ...,588772842071666688\t588795790774140929\t588796...,0


In [6]:
# Column dtypes and non-null counts, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1056 non-null   object
 1   news_url   995 non-null    object
 2   title      1056 non-null   object
 3   tweet_ids  801 non-null    object
 4   label      1056 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 41.4+ KB


In [7]:
# Class balance: count the rows carrying each label value (0 = fake, 1 = real).
print("Fake news:", df['label'].eq(0).sum())
print("Real news:", df['label'].eq(1).sum())

Fake news: 432
Real news: 624


In [8]:
# Missing-value tally for every column, printed under a heading line.
print("Null values per column:", df.isna().sum(), sep="\n")

Null values per column:
id             0
news_url      61
title          0
tweet_ids    255
label          0
dtype: int64


In [9]:
# Keep only the rows that have a news_url; rows without one are discarded.
df = df[df['news_url'].notna()]

In [10]:
# Re-inspect the frame after dropping the rows that had no news_url.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 995 entries, 0 to 1055
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         995 non-null    object
 1   news_url   995 non-null    object
 2   title      995 non-null    object
 3   tweet_ids  762 non-null    object
 4   label      995 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 46.6+ KB


In [11]:
# Location for derived datasets: <base>/data/processed
processed_dir = os.path.join(base_path, 'data', 'processed')
print(processed_dir)

# Make the directory on first use and report which case applied.
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)
    print(f"Created new directory: {processed_dir}")
else:
    print(f"Directory already exists: {processed_dir}")

# Write the cleaned, merged frame (without the index column) to CSV.
output_path = os.path.join(processed_dir, 'combined_politifact.csv')
df.to_csv(output_path, index=False)
print(f"Saved merged dataset to: {output_path}")

C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector\data\processed
Directory already exists: C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector\data\processed
Saved merged dataset to: C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector\data\processed\combined_politifact.csv


In [12]:
# Reload the dataset from the CSV written above. Because it was saved with
# index=False, the reloaded frame gets a fresh default index.
df = pd.read_csv(output_path)

In [13]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact14885,www.breakingnews247.net/59bc13819a723/world-s-...,World's most popular candy to be removed from ...,908193166632812545\t908318679821160450\t908327...,0
1,politifact1106,http://frwebgate.access.gpo.gov/cgi-bin/getdoc...,Browse Congressional Bills,2761266599\t3283658037\t3439787429\t5180526172...,1
2,politifact1678,http://abcnews.go.com/ThisWeek/video/supreme-c...,Supreme Court Vacancy Video,,1
3,politifact370,http://tonto.eia.doe.gov/dnav/pet/pet_move_wkl...,U.S. Imports & Exports,735229171086065664\t735591660080488449\t736678...,1
4,politifact15263,http://sciencevibe.com/2018/04/11/dying-78-yea...,Dying 78 Year Old CIA Agent Admits To Killing ...,588772842071666688\t588795790774140929\t588796...,0


## Add the missing protocol to links that lack http:// or https://

In [14]:
# Trim surrounding whitespace *before* testing for a scheme: a value such as
# " http://example.com" would otherwise fail the prefix test and be turned
# into "http:// http://example.com". The scheme test is case-insensitive so
# that "HTTP://..." is not prefixed a second time, and missing values are
# passed through as missing instead of raising.
url_trimmed = df['news_url'].str.strip()
has_scheme = url_trimmed.str.match(r'https?://', case=False, na=False)
df['news_url'] = url_trimmed.where(has_scheme, 'http://' + url_trimmed)

In [15]:
# Show the news_url column to confirm every entry now starts with a protocol.
print(df['news_url'])

0      http://www.breakingnews247.net/59bc13819a723/w...
1      http://frwebgate.access.gpo.gov/cgi-bin/getdoc...
2      http://abcnews.go.com/ThisWeek/video/supreme-c...
3      http://tonto.eia.doe.gov/dnav/pet/pet_move_wkl...
4      http://sciencevibe.com/2018/04/11/dying-78-yea...
                             ...                        
990    http://breaking13news.com/malia-obama-arrested...
991                              http://www.bls.gov/ces/
992    https://web.archive.org/web/20180109225217/htt...
993    http://www.cq.com/doc/newsmakertranscripts-395...
994    https://web.archive.org/web/20080911193042/htt...
Name: news_url, Length: 995, dtype: object


In [16]:
# How many rows repeat a news_url that already appeared in an earlier row?
num_duplicates = df['news_url'].duplicated().sum()
print(f"Number of duplicate news_url entries: {num_duplicates}")

Number of duplicate news_url entries: 28


In [17]:
# Remove leading/trailing whitespace from URLs so that stray spaces do not
# make otherwise identical links look different in the duplicate check.
df['news_url'] = df['news_url'].str.strip()

In [18]:
# Count total duplicate URLs (excluding the first occurrence)
duplicate_count = df.duplicated(subset='news_url').sum()
total_rows = len(df)
print(f"Number of duplicate URLs: {duplicate_count}")
print(f"Total rows: {total_rows}")

Number of duplicate URLs: 28
Total rows: 995


In [19]:
# Keep one row per distinct news_url (the earliest one) and discard the repeats.
df = df.drop_duplicates(subset=['news_url'])
print(f"New dataset size after removing duplicates: {len(df)}")

New dataset size after removing duplicates: 967


In [20]:
import requests
# Request headers sent with every reachability probe. A browser-style
# User-Agent is used to mimic a real browser and avoid 403 Forbidden errors.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

# Function to check if URL is reachable
def is_url_reachable(url, timeout=5):
    """Classify how a URL responds to a single HTTP GET.

    The request is sent with the module-level ``HEADERS`` and follows
    redirects, so the status being classified is that of the final response.

    Only the status code is needed, so the response is opened in streaming
    mode and closed as soon as the status has been read. The page body is
    therefore never downloaded, which keeps a slow or very large body from
    turning an otherwise reachable URL into a timeout or error.

    Args:
        url: Address to probe, including its scheme.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).

    Returns:
        str: One of ``"OK"`` (200), ``"REDIRECT"`` (3xx), ``"FORBIDDEN"``
        (403), ``"NOT_FOUND"`` (404), ``"ERROR_<status code>"`` for any other
        status, or ``"SSL_ERROR"``, ``"TIMEOUT"``, ``"CONNECTION_ERROR"``,
        ``"ERROR_<exception class name>"`` when the request itself fails.
    """
    try:
        # stream=True defers the body download; the ``with`` block closes the
        # response (and releases its connection) once the status is captured.
        with requests.get(url, headers=HEADERS, timeout=timeout,
                          allow_redirects=True, stream=True) as response:
            status_code = response.status_code
    except requests.exceptions.SSLError:
        # Must come before ConnectionError: SSLError is a subclass of it.
        return "SSL_ERROR"
    except requests.exceptions.Timeout:
        # Must come before ConnectionError: ConnectTimeout derives from both.
        return "TIMEOUT"
    except requests.exceptions.ConnectionError:
        return "CONNECTION_ERROR"
    except Exception as e:
        # Best-effort probe: report any other failure (e.g. a malformed URL)
        # by exception type instead of aborting the whole run.
        return f"ERROR_{type(e).__name__}"

    named_statuses = {200: "OK", 403: "FORBIDDEN", 404: "NOT_FOUND"}
    if status_code in named_statuses:
        return named_statuses[status_code]
    if 300 <= status_code < 400:
        return "REDIRECT"
    return f"ERROR_{status_code}"

In [21]:
from tqdm import tqdm
# Register tqdm with pandas so that Series/DataFrame objects gain
# progress_apply, which is used for the URL check.
tqdm.pandas()  

In [26]:
# Probe every URL in news_url, one at a time, with a progress bar. This sends
# one HTTP request per row, so it is slow: the recorded run took about
# 27 minutes for 967 URLs.
df['url_status'] = df['news_url'].progress_apply(is_url_reachable)

100%|████████████████████████████████████████████████████████████████████████████████| 967/967 [27:15<00:00,  1.69s/it]


In [27]:
# Both result files live side by side in <base>/data/processed.
validated_urls_path, reachable_urls_path = (
    os.path.join(base_path, 'data', 'processed', file_name)
    for file_name in ('validated_urls.csv', 'reachable_urls.csv')
)
print(f"Validated URLs path: {validated_urls_path}")
print(f"Reachable URLs path: {reachable_urls_path}")

Validated URLs path: C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector\data\processed\validated_urls.csv
Reachable URLs path: C:\Users\KIIT\Documents\WORKSPACE\development\ML\ai-fake-news-detector\data\processed\reachable_urls.csv


In [28]:
# Rows whose probe returned "OK" make up the reachable subset.
df_valid = df.loc[df['url_status'].eq("OK")]

# Persist the full table (every URL with its status) first, then the
# reachable-only subset.
df.to_csv(validated_urls_path, index=False)
df_valid.to_csv(reachable_urls_path, index=False)

print(f"Saved {len(df)} total URLs to validated_urls.csv")
print(f"Saved {len(df_valid)} reachable URLs to reachable_urls.csv")

Saved 967 total URLs to validated_urls.csv
Saved 611 reachable URLs to reachable_urls.csv


In [29]:
# Final tally: number of URLs checked versus number kept as reachable.
print(f"Total URLs: {len(df)}", f"Reachable URLs: {len(df_valid)}", sep="\n")

Total URLs: 967
Reachable URLs: 611
