# Exploration

## Agenda
1. Split the raw data into its initial features (classification label and message content)
2. View summary statistics of spam and non-spam SMS content
3. Convert the raw data into a tabular format and save it as a CSV file

In [3]:
import pandas as pd

In [4]:
# Split the raw data file into two parallel lists: one boolean spam flag and
# one message body per non-blank input line.
spam: list[bool] = []
text: list[str] = []

# Pin the encoding so parsing does not depend on the platform default.
# NOTE(review): assumes the data file is UTF-8 encoded — confirm against the source.
with open("SMSSpamCollection", encoding="utf-8") as file:
    # Iterate the file lazily instead of loading every line with readlines()
    for line in file:
        # Blank lines carry no label; skipping them avoids an IndexError below
        if not line.strip():
            continue

        # Extract classification prefix (first whitespace-delimited token)
        prefix: str = line.split()[0]

        # Drop the prefix plus the single separator character that follows it,
        # then strip the trailing newline only when one is present so a final
        # line without a newline keeps its last character.
        body: str = line.removeprefix(prefix)[1:].removesuffix("\n")

        # Convert the two class names into boolean format where "spam" is True
        spam.append(prefix == "spam")
        text.append(body)

In [8]:
# Combine the label and message lists into a two-column DataFrame
# ("Spam", "Text") used by the cells that follow.
spam_df = pd.DataFrame({"Spam": spam, "Text": text})

In [10]:
# Report the (rows, columns) dimensions, then preview the first five rows
print(spam_df.shape)
spam_df.head(n=5)

(5574, 2)


Unnamed: 0,Spam,Text
0,False,"Go until jurong point, crazy.. Available only ..."
1,False,Ok lar... Joking wif u oni...
2,True,Free entry in 2 a wkly comp to win FA Cup fina...
3,False,U dun say so early hor... U c already then say...
4,False,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Summary statistics of the message text, computed separately for each class
spam_df.groupby(by="Spam").describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
False,4827,4518,"Sorry, I'll call later",30
True,747,653,Please call our customer service representativ...,4


In [8]:
# Persist the tabular data to disk, omitting the positional row index
spam_df.to_csv("spam.csv", index=False)