## Load and Explore the Dataset

In [11]:
import pandas as pd
import numpy as np

# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/df_file.csv')

# Map numeric labels to category names
label_to_category = {
    0: 'Politics',
    1: 'Sport',
    2: 'Technology',
    3: 'Entertainment',
    4: 'Business',
    5: 'Other'  # NOTE(review): no 'Other' rows appear in the recorded class distribution
}

# Add a new column for category names
df['category'] = df['Label'].map(label_to_category)

# Series.map leaves NaN for any label that is missing from the dictionary, and
# value_counts() below drops NaN by default, so unmapped rows would silently
# disappear from the class distribution. Fail loudly instead.
unmapped_labels = sorted(df.loc[df['category'].isna(), 'Label'].unique())
if unmapped_labels:
    raise ValueError(f"Labels with no category name: {unmapped_labels}")

# Display basic info
print("Dataset Overview:")
print(df.head())

print("\nDataset Shape:", df.shape)
print("\nColumn Names and Data Types:")
print(df.dtypes)

# Display class distribution with category names
print("\nClass Distribution:")
print(df['category'].value_counts())

Dataset Overview:
                                                Text  Label  category
0  Budget to set scene for election\n \n Gordon B...      0  Politics
1  Army chiefs in regiments decision\n \n Militar...      0  Politics
2  Howard denies split over ID cards\n \n Michael...      0  Politics
3  Observers to monitor UK election\n \n Minister...      0  Politics
4  Kilroy names election seat target\n \n Ex-chat...      0  Politics

Dataset Shape: (2225, 3)

Column Names and Data Types:
Text        object
Label        int64
category    object
dtype: object

Class Distribution:
category
Sport            511
Business         510
Politics         417
Technology       401
Entertainment    386
Name: count, dtype: int64


### Text Preprocessing

In [14]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the NLTK stopword corpus is available locally before reading it
nltk.download('stopwords')
# Hold the English stopwords in a set so membership checks during cleaning are cheap
stop_words = {word for word in stopwords.words('english')}

# Text preprocessing
def preprocess_text(text, stopword_set=None):
    """Normalise a raw document into space-separated lowercase word tokens.

    Steps: lowercase the text, drop every character that is neither a letter
    nor whitespace (punctuation, digits, symbols), split on whitespace, and
    discard tokens found in the stopword collection.

    Punctuation is deleted rather than replaced, so "ex-chat" becomes
    "exchat" and "don't" becomes "dont".

    Parameters
    ----------
    text : str
        The raw document.
    stopword_set : collection of str, optional
        Lowercase words to remove. When omitted, the module-level
        ``stop_words`` is used, so ``df['Text'].apply(preprocess_text)``
        keeps working unchanged.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers. Every whitespace character (newline,
    # tab, ...) is kept, not just ' ', so that words separated only by a line
    # break are not glued together; split() below treats them all as separators.
    text = ''.join(char for char in text if char.isalpha() or char.isspace())
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every article with preprocess_text and store the result next to the raw text
df['cleaned_text'] = df['Text'].map(preprocess_text)

# Integer class labels are the prediction target
y = df['Label']

# TF-IDF features over the cleaned text; max_features caps the vocabulary size at 5000 terms
# NOTE(review): the vectorizer is fitted on the full corpus here — if a train/test
# split follows later in the notebook, confirm this does not leak test vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
