# Loading data

In [2]:
import pandas as pd
# Read the sheet with header=None so the first spreadsheet row is kept as data,
# then name the two columns. This assumes column 0 holds the label and
# column 1 holds the message text.
df = pd.read_excel('/appliedml1.xlsx', header=None).set_axis(
    ['label', 'message'], axis=1
)


In [3]:
# Preview the first five rows to check the 'label' / 'message' column names.
df.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Number of rows (messages) loaded.
len(df.index)

5572

# Preprocessing

In [5]:
# Count missing values per column (axis=0 sums down the rows of each column).
print(df.isnull().sum(axis=0))


label      0
message    0
dtype: int64


In [7]:
# Class balance: number of messages per label value, most frequent first.
print(df.loc[:, 'label'].value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


In [8]:
import re
def clean_text(text):
    """Normalise a message to lowercase words separated by single spaces.

    Every run of non-word characters (anything other than letters, digits
    and underscore, so punctuation and whitespace alike) is collapsed into
    one space, leading/trailing spaces are removed and the result is
    lowercased. Apostrophes count as punctuation, so "don't" becomes
    "don t".

    Args:
        text: The raw message. Non-string values (e.g. numeric spreadsheet
            cells) are converted with str(); None and float NaN are treated
            as an empty message.

    Returns:
        The cleaned string, or '' when the input was missing or contained
        nothing but punctuation/whitespace.
    """
    # Missing cells: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Whitespace is itself matched by \W, so a single \W+ pass both removes
    # punctuation and collapses repeated spaces.
    text = re.sub(r'\W+', ' ', text)
    return text.strip().lower()

# Cast every entry of 'message' to str, then run clean_text on each one.
df['message'] = df['message'].astype(str).apply(clean_text)



In [9]:
# Show the first ten rows to inspect the cleaned message text.
df.head(n=10)

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...
5,spam,freemsg hey there darling it s been 3 week s n...
6,ham,even my brother is not like to speak with me t...
7,ham,as per your request melle melle oru minnaminun...
8,spam,winner as a valued network customer you have b...
9,spam,had your mobile 11 months or more u r entitled...


In [10]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
# Series.map turns every value that is missing from the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# wipe all labels. Skip the mapping when the column is already encoded, and
# fail loudly on unexpected label values instead of silently writing NaN.
label_map = {'spam': 1, 'ham': 0}
if not df['label'].isin(list(label_map.values())).all():
    encoded = df['label'].map(label_map)
    if encoded.isna().any():
        unexpected = sorted(df.loc[encoded.isna(), 'label'].astype(str).unique())
        raise ValueError(f"Unexpected values in 'label': {unexpected}")
    df['label'] = encoded.astype('int64')


In [11]:
# Preview again: 'label' should now hold 0 (ham) / 1 (spam).
df.head(n=5)

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i don t think he goes to usf he lives arou...


# Splitting and saving to CSV

In [12]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split: hold out 20% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets. The fixed
# random_state makes both splits repeatable.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42,
)

# Write each split to its own CSV file, leaving out the DataFrame index.
for csv_path, split_df in (
    ("train.csv", train_df),
    ("validation.csv", val_df),
    ("test.csv", test_df),
):
    split_df.to_csv(csv_path, index=False)

In [13]:
# Inspect the training split; rows keep the index labels they had in df.
train_df.head(n=5)

Unnamed: 0,label,message
184,0,he will you guys close
2171,0,can i please come up now imin town dontmatter ...
5422,0,ok k sry i knw 2 siva tats y i askd
4113,0,i ll see but prolly yeah
4588,0,i ll see if i can swing by in a bit got some t...
