### Arka Roy- MDS202311

### Prepare.ipynb

### Function to load data from a given file path

In [1]:
import pandas as pd
import numpy as np
import os
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Ensure required NLTK resources are available
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arkaroy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/arkaroy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/arkaroy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Define function to load dataset 
def load_sms_spam_dataset(file_path):
   
    try:
        data = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'], encoding='utf-8')
        print(f"Dataset loaded successfully with {data.shape[0]} rows and {data.shape[1]} columns.")
        return data
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
    except pd.errors.ParserError:
        print("Error: Parsing issue encountered. Ensure the file format is correct.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None

### Function for text pre-processing

In [3]:
# Define text preprocessing function
def preprocess_text(text: str) -> str:
    
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = ' '.join(text.split())  # Remove extra whitespace

    tokens = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words('english'))  # Load stop words
    tokens = [token for token in tokens if token not in stop_words]  # Remove stop words

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatization

    return ' '.join(tokens)

### Function for pre-processing the entire dataset

In [4]:
# Function to preprocess the entire dataset
def preprocess_dataset(data: pd.DataFrame) -> pd.DataFrame:
    
    data = data.copy()
    data['message'] = data['message'].apply(preprocess_text)

    # Convert labels to numeric (0 for ham, 1 for spam)
    data['label'] = data['label'].map({'ham': 0, 'spam': 1})

    return data

### Function to split and save the dataset

In [10]:

def split_and_save_data(data: pd.DataFrame, output_dir: str, train_ratio=0.8, val_ratio=0.1, random_state=42):
   
    os.makedirs(output_dir, exist_ok=True)

    # Splitting into train and temp set (val + test)
    y = data['label']
    X = data.drop(columns=['label'])

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(1 - train_ratio), stratify=y, random_state=random_state)

    # Splitting temp set into validation and test
    test_ratio = 1 - train_ratio - val_ratio
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=random_state)

    # Saving datasets
    pd.concat([y_train, X_train], axis=1).to_csv(os.path.join(output_dir, 'train.csv'), index=False)
    pd.concat([y_val, X_val], axis=1).to_csv(os.path.join(output_dir, 'validation.csv'), index=False)
    pd.concat([y_test, X_test], axis=1).to_csv(os.path.join(output_dir, 'test.csv'), index=False)

    print(f"Data split and saved in '{output_dir}':")
    print(f"  - Train: {len(y_train)} samples")
    print(f"  - Validation: {len(y_val)} samples")
    print(f"  - Test: {len(y_test)} samples")


In [11]:
# Define file paths (Modify as needed)
sms_data_file_path = "/Users/arkaroy/Downloads/sms+spam+collection/SMSSpamCollection"
output_directory = "/Users/arkaroy/Downloads/sms+spam+collection/data_splits"

# Load dataset
sms_raw_data = load_sms_spam_dataset(sms_data_file_path)

# Process dataset
if sms_raw_data is not None:
    sms_processed_data = preprocess_dataset(sms_raw_data)
    split_and_save_data(sms_processed_data, output_directory)


Dataset loaded successfully with 5572 rows and 2 columns.
Data split and saved in '/Users/arkaroy/Downloads/sms+spam+collection/data_splits':
  - Train: 4457 samples
  - Validation: 557 samples
  - Test: 558 samples
