In [1]:
import csv
import functools
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
def load_data(filepath):
    """Load a headerless, tab-separated ``label<TAB>message`` file.

    Args:
        filepath: Path (or any source accepted by ``pandas.read_csv``) of the
            tab-separated file. Each line holds a label and a message.

    Returns:
        A DataFrame with two columns: ``label`` (``'ham'`` mapped to 0,
        ``'spam'`` mapped to 1; any other label value becomes NaN) and
        ``message`` (the raw message text).
    """
    df = pd.read_csv(
        filepath,
        sep='\t',
        header=None,
        names=['label', 'message'],
        # Messages are free text and may contain unbalanced double quotes.
        # With the default quoting, a message starting with '"' is parsed as
        # a quoted field that runs on until the next '"', silently merging
        # several lines into one row. Treat quote characters as plain text.
        quoting=csv.QUOTE_NONE,
    )
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    return df

In [3]:
# Path of the tab-separated spam/ham message file read by load_data.
filepath = '/content/SMSSpamCollection'
raw_messages = load_data(filepath)

In [4]:
# Peek at the first five loaded rows (label already mapped to 0/1).
raw_messages.head(n=5)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Class balance: count rows per label with boolean masks, then report each
# class as a percentage of the rows labelled 0 or 1.
spam_n = int((raw_messages["label"] == 1).sum())
ham_n = int((raw_messages["label"] == 0).sum())
print(f"Percentage of spam emails = {round(spam_n / (spam_n + ham_n) * 100, 2)}%")
print(f"Percentage of ham emails = {round(ham_n / (spam_n + ham_n) * 100, 2)}%")

Percentage of spam emails = 13.41%
Percentage of ham emails = 86.59%


In [7]:
@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text, stop_words=None):
    """Normalise one message: lowercase, strip punctuation, drop stop words.

    Args:
        text: The raw message string.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; the ``stopwords``
            corpus must already be available (see ``nltk.download``).

    Returns:
        The remaining words, in their original order, joined by single
        spaces. An empty or all-stop-word message yields ``''``.
    """
    if stop_words is None:
        # The corpus list used to be re-read and linearly scanned for every
        # single word; fetch it once and test membership against a set.
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    text = text.lower()
    # Keep only ASCII letters, digits and whitespace.
    # NOTE(review): apostrophes are removed here, so a contraction such as
    # "don't" reaches the filter below as "dont" and is only dropped if the
    # stop-word collection contains that stripped form.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [8]:
def preprocess_data(df):
    """Clean every entry of the ``message`` column with ``preprocess_text``.

    Downloads the NLTK ``stopwords`` corpus first, then overwrites
    ``df['message']`` with the cleaned text. The frame passed in is modified
    in place and the same object is returned.

    Args:
        df: DataFrame with a ``message`` column of strings.

    Returns:
        ``df`` itself, with ``message`` replaced by the cleaned text.
    """
    # Ensure the stop-word corpus is available before it is first used.
    nltk.download('stopwords')
    cleaned_messages = df['message'].apply(preprocess_text)
    df['message'] = cleaned_messages
    return df

In [9]:
def split_and_store(df, output_dir='.', holdout_size=0.3, test_share=0.5,
                    random_state=42, stratify_col=None):
    """Split ``df`` into train/validation/test sets and write each to CSV.

    The frame is first split into a training part and a hold-out part; the
    hold-out part is then split again into validation and test. With the
    defaults this gives a 70/15/15 split written to ``train.csv``,
    ``validation.csv`` and ``test.csv`` in the current directory.

    Args:
        df: DataFrame to split.
        output_dir: Directory that receives the three CSV files. It is
            created if it does not exist.
        holdout_size: ``test_size`` of the first split, i.e. the share of
            ``df`` kept out of the training set.
        test_share: ``test_size`` of the second split, i.e. the share of the
            hold-out part that becomes the test set.
        random_state: Seed passed to both splits for reproducibility.
        stratify_col: Optional column name. When given, both splits are
            stratified on that column so each file keeps the same class
            proportions (useful when the labels are imbalanced). ``None``
            keeps the plain random split.

    Returns:
        None. The function only writes files.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    train, holdout = train_test_split(
        df,
        test_size=holdout_size,
        random_state=random_state,
        stratify=df[stratify_col] if stratify_col is not None else None,
    )
    validation, test = train_test_split(
        holdout,
        test_size=test_share,
        random_state=random_state,
        stratify=holdout[stratify_col] if stratify_col is not None else None,
    )

    splits = (('train', train), ('validation', validation), ('test', test))
    for split_name, split_frame in splits:
        split_frame.to_csv(out_dir / f'{split_name}.csv', index=False)

In [11]:
# Clean the messages, then persist splits of the cleaned frame.
# Pass the value returned by preprocess_data rather than `raw_messages`, so
# the stored files visibly come from the cleaned data and do not depend on
# preprocess_data having modified its argument in place.
data = preprocess_data(raw_messages)
split_and_store(data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Peek at the first five rows after text cleaning.
data.head(n=5)

Unnamed: 0,label,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though
