In [3]:
# Colab-only helper: `google.colab` is available inside a Colab runtime.
from google.colab import files
# Prompt for a file upload; the recorded output below shows the file being
# saved as fri_dataset.csv. The return value is kept in `uploaded`, which no
# other visible cell reads.
uploaded = files.upload()


Saving fri_dataset.csv to fri_dataset.csv


In [4]:
import os
import shutil

# Stage the uploaded CSV under data/ so the raw input lives in one folder.
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs("data", exist_ok=True)

if os.path.exists("fri_dataset.csv"):
    # A fresh upload is sitting in the working directory: move it into place
    # (this replaces any previously staged copy, as the plain move did).
    shutil.move("fri_dataset.csv", "data/fri_dataset.csv")
elif not os.path.exists("data/fri_dataset.csv"):
    # Neither the upload nor an already-staged copy exists, so there is
    # nothing to work with; fail with an actionable message.
    raise FileNotFoundError(
        "fri_dataset.csv not found in the working directory or in data/; "
        "upload the file before running this cell."
    )
# Otherwise the file was already moved by an earlier run of this cell, and
# re-running is a no-op instead of a FileNotFoundError.

'data/fri_dataset.csv'

### Import required libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

### Load dataset

In [3]:
# Read the article dataset from its CSV file, then preview the first rows
# (head() with no argument shows five).
cleaned_csv_path = '../dataset/fri_dataset_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,text,label
0,The nation witnessed yet another hit from Nax...,0
1,In an interview with ET Now Jim Rogers Chairma...,0
2,NEW YORK There has always been a great paradox...,0
3,NEW DELHI The CAG has rapped oil ministry and ...,0
4,NEW DELHI Having weighed in on the Pakistani s...,0


### Rename columns

In [4]:
# Map the source column names onto the names the later cells use.
# rename() leaves the frame unchanged for any key that is not a column.
column_aliases = {"ArticleBody": "text", "Label": "label"}
df = df.rename(columns=column_aliases)

## Clean the dataset
### Remove nulls and duplicates

In [5]:
# Clean the frame in one pass:
#   1. drop rows missing 'text' OR 'label' -- 'text' is the model input and
#      'label' is both the target and the stratification key of the later
#      split, so a row lacking either one cannot be used;
#   2. drop rows that are exact duplicates of an earlier row;
#   3. rebuild a contiguous 0..n-1 index (drop=True discards the old one).
df = (
    df.dropna(subset=["text", "label"])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Post-cleaning summary
print("🧼 Cleaned shape:", df.shape)
print("🧼 Missing values:\n", df.isnull().sum())
print("🧼 Duplicate rows:", df.duplicated().sum())
print("📊 Label distribution:\n", df['label'].value_counts())


🧼 Cleaned shape: (9828, 2)
🧼 Missing values:
 text     0
label    0
dtype: int64
🧼 Duplicate rows: 0
📊 Label distribution:
 label
0    4995
1    4833
Name: count, dtype: int64


### Basic data checks

In [6]:
# Sanity checks on the current frame. Each entry pairs the heading that is
# printed with the value shown after it.
sanity_checks = (
    ("Shape:", df.shape),
    ("\nMissing Values:\n", df.isnull().sum()),
    ("\nDuplicate rows:", df.duplicated().sum()),
    ("\nLabel distribution:\n", df['label'].value_counts()),
)
for heading, value in sanity_checks:
    print(heading, value)

Shape: (9828, 2)

Missing Values:
 text     0
label    0
dtype: int64

Duplicate rows: 0

Label distribution:
 label
0    4995
1    4833
Name: count, dtype: int64


### Save Cleaned Dataset

In [13]:
# Write the cleaned frame to a CSV in the working directory; index=False
# keeps the row index out of the file.
cleaned_csv_name = "fri_dataset_cleaned.csv"
df.to_csv(cleaned_csv_name, index=False)

### Preprocess Text and Split the Data

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: features are the article text, the target is the class label.
X, y = df['text'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Step 2: TF-IDF settings.
tfidf_options = {
    'stop_words': 'english',   # built-in English stop-word list
    'max_df': 0.7,             # ignore terms found in more than 70% of documents
    'min_df': 5,               # ignore terms found in fewer than 5 documents
    'ngram_range': (1, 2),     # unigrams and bigrams
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Report the (documents, features) shape of each matrix.
print("✅ X_train_tfidf shape:", X_train_tfidf.shape)
print("✅ X_test_tfidf shape:", X_test_tfidf.shape)


✅ X_train_tfidf shape: (7862, 47203)
✅ X_test_tfidf shape: (1966, 47203)


### Save the training and testing dataset

In [15]:
import pandas as pd


def _labelled_frame(texts, labels):
    """Pair a text Series with its label Series as a two-column DataFrame."""
    return pd.DataFrame({'text': texts, 'label': labels})


# Rebuild one frame per split from the raw (un-vectorized) text and its labels.
train_df = _labelled_frame(X_train, y_train)
test_df = _labelled_frame(X_test, y_test)

# Write each split to its own CSV, leaving the row index out of the file.
for split_frame, csv_name in (
    (train_df, 'train_data.csv'),
    (test_df, 'test_data.csv'),
):
    split_frame.to_csv(csv_name, index=False)
