In [1]:
import pandas as pd

In [2]:
sms_spam_collection = pd.read_csv('smsspamcollection\SMSSpamCollection', sep='\t', header = None, names = ['Label', 'SMS'])
sms_spam_collection.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
sms_spam_collection.Label.value_counts(normalize = True) * 100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

# Training and Test Set

In [4]:
randomized_dataset = sms_spam_collection.sample(frac = 1, random_state = 1)

training_index = round(len(randomized_dataset) * 0.8)

training_set = randomized_dataset[:training_index].reset_index(drop = 'index')
test_set = randomized_dataset[training_index:].reset_index(drop = 'index')

In [5]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,"Yep, by the pretty sculpture"
1,ham,"Yes, princess. Are you going to make me moan?"
2,ham,Welp apparently he retired
3,ham,Havent.
4,ham,I forgot 2 ask ü all smth.. There's a card on ...


In [6]:
training_set.Label.value_counts(normalize = True) * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [7]:
test_set.head()

Unnamed: 0,Label,SMS
0,ham,Later i guess. I needa do mcat study too.
1,ham,But i haf enuff space got like 4 mb...
2,spam,Had your mobile 10 mths? Update to latest Oran...
3,ham,All sounds good. Fingers . Makes it difficult ...
4,ham,"All done, all handed in. Don't know if mega sh..."


In [8]:
test_set.Label.value_counts(normalize = True) * 100

ham     86.804309
spam    13.195691
Name: Label, dtype: float64

# Letter Case and Punctuation 

In [9]:
training_set.SMS = training_set.SMS.str.replace('\W', ' ').str.lower()

In [10]:
test_set.SMS = test_set.SMS.str.replace('\W', ' ').str.lower()

In [11]:
training_set.head()

Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [12]:
test_set.head()

Unnamed: 0,Label,SMS
0,ham,later i guess i needa do mcat study too
1,ham,but i haf enuff space got like 4 mb
2,spam,had your mobile 10 mths update to latest oran...
3,ham,all sounds good fingers makes it difficult ...
4,ham,all done all handed in don t know if mega sh...


# Creating the Vocabulary

In [24]:
# For training_set

training_SMS_list = training_set.SMS.str.split().tolist()
training_SMS_list[:5]

[['yep', 'by', 'the', 'pretty', 'sculpture'],
 ['yes', 'princess', 'are', 'you', 'going', 'to', 'make', 'me', 'moan'],
 ['welp', 'apparently', 'he', 'retired'],
 ['havent'],
 ['i',
  'forgot',
  '2',
  'ask',
  'ü',
  'all',
  'smth',
  'there',
  's',
  'a',
  'card',
  'on',
  'da',
  'present',
  'lei',
  'how',
  'ü',
  'all',
  'want',
  '2',
  'write',
  'smth',
  'or',
  'sign',
  'on',
  'it']]

In [25]:
vocabulary = set()

for sms in training_SMS_list:
    for word in sms:
        if word not in vocabulary:
            vocabulary.add(word)

vocabulary = list(vocabulary)
vocabulary

['needs',
 'txtstar',
 'maybe',
 'perhaps',
 'avoiding',
 '47',
 'expired',
 'broad',
 'meow',
 'bill',
 'juliana',
 'service',
 'audrey',
 'chit',
 'jolly',
 'archive',
 'allday',
 'gbp4',
 'lambda',
 'bam',
 'amazing',
 'pairs',
 'jul',
 'results',
 'cool',
 'poker',
 'statements',
 '48',
 'finally',
 'making',
 'days',
 'steal',
 'fell',
 'trade',
 'bfore',
 'n8',
 'sextextuk',
 'headstart',
 'textand',
 'recovery',
 'lane',
 'copied',
 'heading',
 'tips',
 'olage',
 'alive',
 'vijaykanth',
 'elsewhere',
 'jane',
 'necesity',
 'behalf',
 'prove',
 'regular',
 'eng',
 'sk3',
 'reassurance',
 'type',
 'yowifes',
 'pound',
 'afternoons',
 'strips',
 'ac',
 'browsin',
 'except',
 'shindig',
 'lazy',
 'nange',
 'zed',
 'nitz',
 'southern',
 'opened',
 'settings',
 'bother',
 'arab',
 'gone',
 'gas',
 'redeemed',
 '2007',
 'forevr',
 'dancing',
 'clubmoby',
 'virgil',
 'chill',
 'goin',
 '4041',
 'burgundy',
 'ignorant',
 'whether',
 'hittng',
 'fed',
 'everyone',
 'couple',
 'trek',
 'se

# The Final Training Set 

In [16]:
word_count_per_sms = {unique_word : [0] * len(training_set) for unique_word in vocabulary}

for index, sms in enumerate(training_SMS_list):
    for word in sms:
        word_count_per_sms[word][index] += 1

In [17]:
word_count_per_sms_df = pd.DataFrame(word_count_per_sms)

In [18]:
word_count_per_sms_df.head()

Unnamed: 0,needs,txtstar,maybe,perhaps,avoiding,47,expired,broad,meow,bill,...,harri,savamob,canary,ko,suzy,walls,shouted,forms,fakeye,somewheresomeone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
training_set_word_count = pd.concat([training_set, word_count_per_sms_df], axis = 1)
training_set_word_count.head()

Unnamed: 0,Label,SMS,needs,txtstar,maybe,perhaps,avoiding,47,expired,broad,...,harri,savamob,canary,ko,suzy,walls,shouted,forms,fakeye,somewheresomeone
0,ham,yep by the pretty sculpture,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,yes princess are you going to make me moan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,welp apparently he retired,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,havent,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,i forgot 2 ask ü all smth there s a card on ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Calculating Constants 

In [30]:
p_ham = training_set.Label.value_counts(normalize = True)[0]
p_spam = training_set.Label.value_counts(normalize = True)[1]

In [31]:
p_ham

0.8654104979811574

In [32]:
p_spam

0.13458950201884254

In [43]:
training_set_word_count.iloc[0,1]

'yep  by the pretty sculpture'

In [45]:
training_set_word_count['words_per_sms'] = 
training_set_word_count['words_per_sms']

0       7789
1       7793
2       7788
3       7785
4       7810
        ... 
4453    7801
4454    7818
4455    7812
4456    7811
4457    7788
Name: words_per_sms, Length: 4458, dtype: int64

In [23]:
n_ham = 
n_spam = 
n_vocabulary = len(vocabulary)
alpha = 1 

SyntaxError: invalid syntax (<ipython-input-23-7f160530010b>, line 1)