In [27]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used below: the tokenizer models ('punkt',
# 'punkt_tab') and the stopword corpus ('stopwords').
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Per-language cache of stopword sets, filled lazily by _get_stop_words().
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it only once."""
    cached = _STOP_WORDS_CACHE.get(language)
    if cached is None:
        cached = frozenset(stopwords.words(language))
        _STOP_WORDS_CACHE[language] = cached
    return cached


def preprocess_text(text):
    """
    Clean and preprocess the text.

    Steps, in order: lowercase, strip URLs, strip punctuation, strip digits,
    tokenize, drop English stopwords, stem the remaining tokens.

    Args:
        text: The raw text to clean. Must be a string (``.lower()`` is
            called on it).

    Returns:
        The stemmed, stopword-free tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove punctuation (anything that is neither a word character nor
    # whitespace; underscores count as word characters and are kept)
    text = re.sub(r"[^\w\s]", '', text)
    # Remove numbers
    text = re.sub(r"\d+", '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords. The set is cached because this function is applied
    # once per row; rebuilding it on every call would reload the corpus
    # each time.
    stop_words = _get_stop_words('english')
    words = [word for word in words if word not in stop_words]

    # apply stemming
    words = stem_words(words)

    # Join words back into a single string
    return " ".join(words)

def encode_labels(df, label_mapping):
    """
    Replace the values of the 'Category' column with their mapped codes.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. With a plain dict mapping, labels that have
    no entry in `label_mapping` end up as NaN.
    """
    encoded = df['Category'].map(label_mapping)
    df['Category'] = encoded
    return df

def remove_null_values(df):
    """
    Return a copy of `df` without the rows whose 'Discussion' or
    'Category' value is missing. Other columns are not inspected.
    """
    required_columns = ['Discussion', 'Category']
    return df.dropna(subset=required_columns)

def preprocess_dataset(dataset_path, label_mapping):
    """
    Load a CSV dataset and run the full preprocessing pipeline on it.

    The file at `dataset_path` is read with pandas, rows with a missing
    'Discussion' or 'Category' are dropped, every 'Discussion' entry is
    cleaned with `preprocess_text`, and 'Category' is encoded through
    `label_mapping`. The resulting DataFrame is returned.
    """
    # Load the raw data and discard incomplete rows.
    raw = pd.read_csv(dataset_path)
    cleaned = remove_null_values(raw)

    # Normalize the free-text column row by row.
    cleaned['Discussion'] = cleaned['Discussion'].apply(preprocess_text)

    # Turn the category names into their numeric codes.
    return encode_labels(cleaned, label_mapping)


def stem_words(words):
    """
    Return a list with the Porter stem of every token in `words`.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, words))
# Example usage
if __name__ == "__main__":
    # Category names listed in the order of their integer codes (0-4).
    category_names = ["Politics", "Sports", "Media", "Market & Economy", "STEM"]
    label_mapping = {name: code for code, name in enumerate(category_names)}

    dataset_path = "train.csv"
    preprocessed_data = preprocess_dataset(dataset_path, label_mapping)

    # Write the cleaned dataset next to the input, without the index column.
    preprocessed_data.to_csv("preprocessed_dataset.csv", index=False)

    print("Preprocessing complete. Preprocessed data saved to 'preprocessed_dataset.csv'.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ATN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ATN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ATN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessing complete. Preprocessed data saved to 'preprocessed_dataset.csv'.
