## Importing Libraries

In [51]:
# Core data processing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Machine learning utilities
from sklearn.model_selection import train_test_split

# Set seaborn style
sns.set_style('whitegrid')  # or other valid seaborn styles like 'white', 'dark', 'darkgrid', 'ticks'
sns.set_palette('husl')  # Set seaborn color palette

In [52]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Loading the data

In [68]:
df = pd.read_csv('Data/raw_data.txt', sep='\t', header=None, names=['label', 'message'])
df.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [13]:
df.to_csv('Data/raw_data.csv', index=False)

## Initialize dvc

In [37]:
!dvc init --subdir -f

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

In [38]:
!git add .dvc/config -f

In [42]:
!git rm -r --cached 'Data/raw_data.csv'
!dvc add Data/raw_data.csv 
!git commit -m "Added raw_data.csv"

fatal: pathspec 'Data/raw_data.csv' did not match any files
[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in Data/raw_data.csv |0.00 [00:00,     ?fi[A
                                                                                [A
![A
  0% Checking cache in '/root/AppliedMachineLearning/Assignment 2/.dvc/cache/fil[A
                                                                                [A
![A
  0%|          |Checking out /root/AppliedMachineLearn0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 21.76file/s][A

To track the changes with git, run:

	git add Data/raw_data.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0mOn branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [43]:
!dvc status

Data and pipelines are up to date.                                              
[0m

## Text Preprocessing

In [69]:
class TextPreprocessor:
    """Text preprocessing pipeline for SMS messages."""
    
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
    
    def preprocess(self, text):
        """Preprocess a single text message.
        
        Args:
            text (str): Input text message
            
        Returns:
            list: Preprocessed tokens
        """
        # Tokenization
        tokens = word_tokenize(text)
        
        # Convert to lowercase and remove non-alphabetic tokens
        tokens = [token.lower() for token in tokens if token.isalpha()]
        
        # Remove stopwords
        tokens = [token for token in tokens if token not in self.stop_words]
        
        # Lemmatization
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        
        return tokens

# Encoding ham as 0 and spam as 1
def encode(text):
    if text == 'spam':
        return 1
    elif text == 'ham':
        return 0

# Initialize preprocessor and process messages
preprocessor = TextPreprocessor()
df['processed_message'] = df['message'].apply(preprocessor.preprocess)

# Add token count feature
df['token_count'] = df['processed_message'].apply(len)

# Display example
print("Example preprocessing:")
example_idx = 0
print(f"Original: {df['message'].iloc[example_idx]}")
print(f"Processed: {df['processed_message'].iloc[example_idx]}")

Example preprocessing:
Original: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Processed: ['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


## Data Splitting

In [80]:
def prepare_splits(df, train_size=0.7, val_size=0.15, random_state=42):
    """Split data into training, validation, and test sets.
    
    Args:
        df (pd.DataFrame): Input DataFrame
        train_size (float): Proportion for training set
        val_size (float): Proportion for validation set
        random_state (int): Random seed for reproducibility
        
    Returns:
        tuple: (train_df, val_df, test_df)
    """
    # Apply the encoding to the 'label' column
    df['label'] = df['label'].apply(encode)
    
    # First split: separate training set
    train_df, temp_df = train_test_split(
        df,
        train_size=train_size,
        stratify=df['label'],
        random_state=random_state
    )
    
    # Second split: separate validation and test sets
    val_df, test_df = train_test_split(
        temp_df,
        train_size=val_size/(1-train_size),
        stratify=temp_df['label'],
        random_state=random_state
    )
    
    print(f"Split sizes: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")
    return train_df, val_df, test_df

## Splitting into Train, Validation and Test data

### Seeds

In [81]:
seed_1 = 42
seed_2 = 1234

## Save Processed Datasets

In [77]:
def save_datasets(train_df, val_df, test_df, output_dir='Data'):
    """Save processed datasets to CSV files.
    
    Args:
        train_df (pd.DataFrame): Training data
        val_df (pd.DataFrame): Validation data
        test_df (pd.DataFrame): Test data
        output_dir (str): Output directory
    """
    import os
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Save datasets
    for name, dataset in [('train', train_df), ('validation', val_df), ('test', test_df)]:
        filepath = os.path.join(output_dir, f'{name}.csv')
        dataset.to_csv(filepath, index=False)
        print(f"Saved {name} dataset to {filepath}")

save_datasets(train_df, val_df, test_df)

Saved train dataset to Data/train.csv
Saved validation dataset to Data/validation.csv
Saved test dataset to Data/test.csv
