# Assignment by Atul Anant (MDS202314)

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lalsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Loading the dataset

In [2]:
def load_from_path(path : str) -> pd.DataFrame:
    """Read the tab-separated SMS collection at ``path`` into a DataFrame.

    Explicit column names are supplied, so the first line of the file is
    read as data rather than as a header.

    Args:
        path: Location of the tab-separated file.

    Returns:
        A frame with two columns, ``label`` and ``message``.
    """
    column_names = ['label', 'message']
    return pd.read_csv(path, names=column_names, sep='\t')

In [3]:
# NOTE: absolute path on the author's machine; point this at your own copy of
# the SMSSpamCollection file before running the notebook elsewhere.
path_sms_data = r'D:\sem4\Aml\Assignment1\SMSSpamCollection'
# Raw (unprocessed) messages with their 'ham'/'spam' labels.
data = load_from_path(path_sms_data)

In [4]:
data.shape

(5572, 2)

In [5]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
data.head(1)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."


In [8]:
data.columns

Index(['label', 'message'], dtype='object')

In [9]:
data.shape[0]

5572

In [None]:
# Encode the labels as integers (ham -> 0, anything else -> 1) and count each
# class. This uses whole-column operations instead of per-row
# `data['label'][i] = ...` chained assignment, which triggers
# SettingWithCopyWarning and does not write through under copy-on-write.
# Accepting 0 alongside 'ham' keeps the cell idempotent: re-running it on
# already-encoded labels gives the same counts and the same column.
is_ham = data['label'].isin(['ham', 0])
ham_counter = int(is_ham.sum())
spam_counter = int((~is_ham).sum())
data['label'] = (~is_ham).astype(int)

In [11]:
ham_counter, spam_counter

(4825, 747)

In [12]:
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Pre-processing

In [29]:
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lalsa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lalsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Function to pre-process each message in the dataframe

In [21]:
def preprocess_text(text: str) -> str:
    """Normalise one message into a space-separated string of lemmas.

    Steps: lowercase, replace punctuation with spaces, delete digits,
    collapse whitespace, tokenize, drop English stopwords, lemmatize, and
    re-join the surviving tokens with single spaces.

    Punctuation is replaced by a space rather than deleted so that words
    separated only by punctuation stay separate ("crazy..available" becomes
    "crazy available", not "crazyavailable").

    Args:
        text: Raw message text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty message.

    Returns:
        The cleaned message, or an empty string if no token survives.
    """
    # Guard against missing cells, which would otherwise fail on .lower()
    if not isinstance(text, str):
        return ''

    # 1. Convert text to lowercase
    text = text.lower()

    # 2. Replace every punctuation character with a space (keeps word boundaries)
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)

    # 3. Remove digits
    text = re.sub(r'\d+', '', text)

    # 4. Collapse runs of whitespace left behind by the steps above
    text = ' '.join(text.split())

    # 5. Tokenize the text
    tokens = word_tokenize(text)

    # 6-7. Drop stopwords, then lemmatize what remains
    lemmatizer = WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

    # 8. Skip empty tokens (just in case) and join back into a single string
    return ' '.join(lemma for lemma in lemmas if lemma)
    

### Function to pre-process the entire message column using the above function

In [22]:
#preprocess_data = lambda data, text_col : data[text_col].apply(preprocess_text)

def preprocess_data(df: pd.DataFrame, text_col: str = 'message') -> pd.DataFrame:
    """Return a copy of ``df`` whose ``text_col`` has been cleaned.

    Each value in ``text_col`` is passed through ``preprocess_text``. The
    input frame is not modified, so the raw text remains available and
    re-running the calling cell does not preprocess already-cleaned text.

    Args:
        df: Frame containing the text column.
        text_col: Name of the column to clean.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    cleaned = df.copy()
    cleaned[text_col] = df[text_col].apply(preprocess_text)
    return cleaned

In [23]:
data_preprocessed = preprocess_data(data, 'message')

### Splitting the data and saving it locally for further use

In [24]:
len(data_preprocessed['message'])

5572

In [25]:
from sklearn.model_selection import train_test_split
import os

In [26]:
def split_and_save_data(
        data: pd.DataFrame, 
        output_path: str = '.',
        train_ratio:float = 0.8,
        val_ratio:float = 0.1,
        label_col: str = "label", random_state: int=42 
        ) -> None:
    """Split ``data`` into train/validation/test sets and write them as CSV.

    Both splits are stratified on ``label_col``. The test share is whatever
    remains after the train and validation shares:
    ``1 - train_ratio - val_ratio``. The files ``train.csv``, ``val.csv``
    and ``test.csv`` are written to ``output_path`` (created if missing),
    each with the label column first and without the index.

    Args:
        data: Frame containing ``label_col`` plus the feature column(s).
        output_path: Directory that receives the three CSV files.
        train_ratio: Fraction of rows for the training set.
        val_ratio: Fraction of rows for the validation set.
        label_col: Name of the column to stratify on.
        random_state: Seed passed to both ``train_test_split`` calls.

    Raises:
        ValueError: If the ratios do not leave a non-empty share for each
            of the three splits.
    """
    test_ratio = 1 - train_ratio - val_ratio
    if not 0 < train_ratio < 1 or val_ratio <= 0 or test_ratio <= 0:
        raise ValueError(
            "train_ratio and val_ratio must be positive and sum to less than 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    os.makedirs(output_path, exist_ok=True)

    y = data[label_col]
    X = data.drop(columns=[label_col])

    # First cut: training set versus everything else.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y,
        test_size=(1-train_ratio),
        stratify=y,
        random_state=random_state
    )

    # Second cut: divide the remainder between validation and test according
    # to the requested ratios (previously hard-coded to an even split).
    # Rounding removes floating-point noise such as 0.4999999999999998, so the
    # default ratios still give exactly 0.5.
    test_fraction_of_rest = round(test_ratio / (val_ratio + test_ratio), 10)

    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest,
        test_size=test_fraction_of_rest,
        stratify=y_rest,
        random_state=random_state
    )

    # One table drives both saving and reporting, so the printed path is
    # always the path actually written.
    splits = [
        ('Train', 'train.csv', pd.concat([y_train, X_train], axis=1)),
        ('Validation', 'val.csv', pd.concat([y_val, X_val], axis=1)),
        ('Test', 'test.csv', pd.concat([y_test, X_test], axis=1)),
    ]

    saved = []
    for split_name, file_name, frame in splits:
        csv_path = os.path.join(output_path, file_name)
        frame.to_csv(csv_path, index=False)
        saved.append((split_name, len(frame), csv_path))

    print("Data has been split and saved to CSV files:")
    for split_name, n_rows, csv_path in saved:
        print(f"  {split_name}: {n_rows} rows -> {csv_path}")

In [27]:
# NOTE: absolute output directory on the author's machine; change it before
# running the notebook elsewhere.
output_path = r'D:\sem4\Aml\Assignment1\data_splits'
# Uses the default 0.8 / 0.1 train / validation ratios.
split_and_save_data(
    data_preprocessed,
    output_path=output_path
)

Data has been split and saved to CSV files:
  Train: 4457 rows -> D:\sem4\Aml\Assignment1\data_splits\train.csv
  Validation: 557 rows -> D:\sem4\Aml\Assignment1\data_splits\validation.csv
  Test: 558 rows -> D:\sem4\Aml\Assignment1\data_splits\test.csv
