In [11]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [12]:
# Download necessary NLTK data
# Each call fetches one named NLTK package into the local nltk_data
# directory and reports "already up-to-date" when it is present.
# 'stopwords' backs the stopwords.words('english') call in preprocess_text;
# 'punkt' / 'punkt_tab' are presumably the tokenizer models that
# word_tokenize relies on -- TODO confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ATN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ATN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ATN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
# Prepare the data and preprocess
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop-word set, fetching it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """
    Clean and preprocess the text.

    Steps, in order: lowercase, strip URLs, strip punctuation, strip digits,
    tokenize with NLTK, drop English stop words, stem each remaining token.

    Args:
        text (str): Raw text of a single document.

    Returns:
        str: The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when nothing survives the filtering).
    """
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r"[^\w\s]", '', text)
    # Remove numbers
    text = re.sub(r"\d+", '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords. The set is cached at module level because this
    # function runs once per row; rebuilding it each call is wasted work.
    stop_words = _english_stop_words()
    words = [word for word in words if word not in stop_words]

    # apply stemming
    words = stem_words(words)

    # Join words back into a single string
    return " ".join(words)

def encode_labels(df, label_mapping):
    """
    Encode the labels in the dataset based on the provided mapping.

    Args:
        df (pandas.DataFrame): Frame containing a 'Category' column.
        label_mapping (dict): Maps each category value to its encoded label.

    Returns:
        pandas.DataFrame: A new frame whose 'Category' column holds the
        mapped values. The input frame is left untouched, so re-running the
        call on the same input gives the same result.

    Note:
        Categories missing from ``label_mapping`` become NaN (standard
        ``Series.map`` behaviour with a dict); callers should check for them
        before training.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Category=df['Category'].map(label_mapping))

def remove_null_values(df):
    """
    Keep only the rows where both 'Discussion' and 'Category' are present.

    Returns the filtered frame; nulls in any other column are ignored.
    """
    required_columns = ['Discussion', 'Category']
    return df.dropna(subset=required_columns)

def preprocess_dataset(dataset_path, label_mapping):
    """
    Load a CSV dataset and run the full preprocessing pipeline on it.

    The pipeline reads the file at ``dataset_path``, drops rows with a missing
    'Discussion' or 'Category', cleans every 'Discussion' entry with
    ``preprocess_text``, and encodes 'Category' through ``label_mapping``.
    Returns the resulting DataFrame.
    """
    # Load, then discard incomplete rows before any text processing.
    frame = remove_null_values(pd.read_csv(dataset_path))

    # Clean each discussion text in turn.
    cleaned_text = frame['Discussion'].apply(preprocess_text)
    frame['Discussion'] = cleaned_text

    # Finally turn category names into their encoded labels.
    return encode_labels(frame, label_mapping)

def stem_words(words):
    """Return a list with the Porter stem of every token in ``words``."""
    to_stem = PorterStemmer().stem
    return list(map(to_stem, words))


In [14]:
def apply_bert_model(df, test_size=0.2, seed=42):
    """
    Function to apply BERT for classification.

    Fine-tunes 'bert-base-uncased' on the 'Discussion' texts of ``df`` using
    the integer class ids in 'Category', evaluates on a held-out split, prints
    the evaluation metrics and saves the model to './trained_bert_model'.

    Args:
        df (pandas.DataFrame): Must contain a text column 'Discussion' and a
            'Category' column of integer class ids starting at 0.
        test_size (float): Fraction of rows held out for evaluation.
        seed (int): Seed for the train/test split so runs are reproducible.

    Returns:
        BertForSequenceClassification: The fine-tuned model.

    Raises:
        ValueError: If 'Category' contains missing values (for example labels
            that were not covered by the label mapping).
    """
    # Fail early with a clear message: NaN labels would turn the column into
    # floats and break the classification loss further down.
    label_values = df['Category']
    if label_values.isna().any():
        raise ValueError(
            "apply_bert_model: 'Category' contains missing values; "
            "every row needs an integer class id before training."
        )
    label_values = label_values.astype(int)

    # Size the classification head from the largest class id rather than the
    # number of distinct ids, so a class absent from this data cannot push a
    # label outside the head's range.
    num_labels = int(label_values.max()) + 1

    # Load the pre-trained BERT tokenizer and model for classification
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    # Tokenize the 'Discussion' column
    def tokenize_function(examples):
        return tokenizer(examples['Discussion'], padding="max_length", truncation=True)

    # Prepare dataset in Hugging Face format. The pandas index is dropped so
    # it does not become an extra '__index_level_0__' column.
    dataset = Dataset.from_pandas(df.assign(Category=label_values), preserve_index=False)
    # Trainer only forwards columns that the model's forward() accepts, and
    # the model computes a loss only when it receives 'labels'. Without this
    # rename 'Category' is discarded and training fails with no loss.
    dataset = dataset.rename_column('Category', 'labels')
    dataset = dataset.map(tokenize_function, batched=True)

    # Split the dataset into training and testing
    split = dataset.train_test_split(test_size=test_size, seed=seed)
    train_dataset, test_dataset = split['train'], split['test']

    # 'evaluation_strategy' was renamed 'eval_strategy' in newer transformers
    # releases; use whichever keyword the installed version accepts.
    import inspect
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    strategy_kwarg = 'eval_strategy' if 'eval_strategy' in accepted_args else 'evaluation_strategy'

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        weight_decay=0.01,
        **{strategy_kwarg: "epoch"},
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

    # Save the trained model
    model.save_pretrained('./trained_bert_model')

    return model

In [None]:
if __name__ == "__main__":
    # Category names paired with consecutive class ids, starting at 0.
    label_mapping = dict(zip(
        ("Politics", "Sports", "Media", "Market & Economy", "STEM"),
        range(5),
    ))

    # Stage 1: clean the raw CSV and encode its labels.
    dataset_path = "train.csv"
    preprocessed_data = preprocess_dataset(dataset_path, label_mapping)

    # Keep a copy of the cleaned data on disk.
    preprocessed_data.to_csv("preprocessed_dataset.csv", index=False)
    print("Preprocessing complete. Preprocessed data saved to 'preprocessed_dataset.csv'.")

    # Stage 2: fine-tune BERT on the cleaned data.
    model = apply_bert_model(preprocessed_data)
    print("BERT model training complete.")

Preprocessing complete. Preprocessed data saved to 'preprocessed_dataset.csv'.


ImportError: 
BertForSequenceClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
