In [1]:
from __future__ import division, print_function

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Read the raw SMS corpus into a list with one '<label>\t<message>' record per line.
# The encoding is pinned explicitly: the messages contain non-ASCII characters
# (e.g. the pound sign), so relying on the platform default encoding can raise
# UnicodeDecodeError or silently mis-decode text on some systems.
# NOTE(review): assumes the file on disk is UTF-8 encoded - confirm.
with open('smsspamcollection/SMSSpamCollection.txt', 'r', encoding='utf-8') as f:
    sms = f.read().splitlines()

In [18]:
# Peek at the first ten raw records to confirm the '<label>\t<message>' layout.
sms[:10]

['ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham\tOk lar... Joking wif u oni...',
 "spam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham\tU dun say so early hor... U c already then say...',
 "ham\tNah I don't think he goes to usf, he lives around here though",
 "spam\tFreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
 'ham\tEven my brother is not like to speak with me. They treat me like aids patent.',
 "ham\tAs per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 'spam\tWINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Cl

In [9]:
# Splitting a record on the tab separates it into [label, message].
sms[0].split('\t')

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [10]:
# Separate every record into its label and its message in one pass.
# - maxsplit=1 splits only on the first tab, so a message that itself contains
#   a tab is kept whole instead of being cut off at that tab.
# - Blank lines (e.g. a trailing empty line) are skipped rather than raising.
records = [line.split('\t', 1) for line in sms if line.strip()]

# Message bodies, in file order.
text = [message for label, message in records]
# Binary target aligned with `text`: 1 for 'spam', 0 for every other label.
labels = [1 if label == 'spam' else 0 for label, message in records]

In [19]:
# First few message bodies, with the label prefix stripped.
text[:4]

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...']

In [20]:
# Matching targets for the messages above (1 = spam, 0 = not spam).
labels[:4]

[0, 0, 1, 0]

In [21]:
# Sanity check: vectorize the whole corpus once with default settings and
# look at the size of the resulting document-term matrix.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(text)
X_counts.shape  # (number of messages, number of distinct tokens)

(5574, 8713)

In [23]:
# -- 1
# Baseline: raw token counts fed into logistic regression. Both steps live in
# one Pipeline, so cross-validation fits the vectorizer together with the model.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer()),
    ('log_reg', LogisticRegression()),
])

# Mean F1 across 10 cross-validation folds for the baseline.
fold_scores = cross_val_score(pipeline, text, labels, scoring='f1', cv=10)
score = fold_scores.mean()
print('Cross_val_score f1: %.4f' % score)

Cross_val_score f1: 0.9333


In [24]:
# Persist the baseline F1, rounded to one decimal place, as the first answer.
# The string is built before the file is opened: opening with 'w' truncates
# the file immediately, so computing afterwards could leave an empty ans1.txt
# behind if `score` is missing or invalid.
ans1 = str(round(score, 1))
with open('ans1.txt', 'w') as f:
    f.write(ans1)

3

In [25]:
# Hand-written messages used to probe the fitted classifier.
test_text = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$",
]

In [26]:
# Train the pipeline on the complete labelled corpus.
# NOTE(review): this uses whatever object `pipeline` is currently bound to;
# several later cells rebind that name, so run the notebook top to bottom.
pipeline.fit(text, labels)

Pipeline(memory=None,
     steps=[('count_vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [27]:
# Classify the hand-written probe messages with the fitted pipeline.
pred_labels = pipeline.predict(test_text)

In [28]:
# Predicted classes for test_text, in order (1 = spam, 0 = not spam).
pred_labels

array([1, 1, 0, 0, 0])

In [29]:
# Save the predictions as space-separated integers.
# The answer is assembled before the file is opened: mode 'w' truncates on
# open, so a failure while formatting would otherwise leave an empty ans2.txt.
ans2 = ' '.join(str(label) for label in pred_labels)
with open('ans2.txt', 'w') as f:
    f.write(ans2)

9

In [32]:
# Compare logistic regression across n-gram settings:
# bigrams only, trigrams only, and unigrams through trigrams.
ngram_ranges = [(2, 2), (3, 3), (1, 3)]
f_values = []

for ngram in ngram_ranges:
    # Use a dedicated name instead of rebinding `pipeline`: the fitted model
    # stored under `pipeline` is still needed by the prediction cell, and
    # overwriting it here would break that cell on a partial re-run.
    ngram_pipeline = Pipeline([
        ('count_vect', CountVectorizer(ngram_range=ngram)),
        ('log_reg', LogisticRegression()),
    ])
    mean_f1 = cross_val_score(ngram_pipeline, text, labels, scoring='f1', cv=10).mean()
    # Keep two decimals, the precision the answer file is written with.
    f_values.append(round(mean_f1, 2))

In [33]:
# Write the per-n-gram F1 values, two decimals each, separated by spaces.
# The string is built before opening the file so a formatting error cannot
# leave a truncated ans3.txt; the former map(str, ...) was a no-op because
# '%.2f' % sc already yields strings.
ans3 = ' '.join('%.2f' % sc for sc in f_values)
with open('ans3.txt', 'w') as f:
    f.write(ans3)

14

In [34]:
# Same n-gram comparison as for logistic regression, but with multinomial
# naive Bayes as the classifier inside the pipeline.
ngram_ranges = [(2, 2), (3, 3), (1, 3)]
f_values = []

for ngram in ngram_ranges:
    # Use a dedicated name instead of rebinding `pipeline`, so the fitted
    # model used by the prediction cell is not clobbered by this loop.
    ngram_pipeline = Pipeline([
        ('count_vect', CountVectorizer(ngram_range=ngram)),
        ('mn_nb', MultinomialNB()),
    ])
    mean_f1 = cross_val_score(ngram_pipeline, text, labels, scoring='f1', cv=10).mean()
    # Keep two decimals, the precision the answer file is written with.
    f_values.append(round(mean_f1, 2))

In [35]:
# Write the naive Bayes F1 values obtained through the Pipeline.
# NOTE(review): a later cell writes ans4.txt again from `scores`, so this
# version of the file is superseded when the notebook is run top to bottom.
# The string is built before opening the file so a formatting error cannot
# leave a truncated ans4.txt; '%.2f' % sc already yields strings, so no
# extra str() conversion is needed.
ans4 = ' '.join('%.2f' % sc for sc in f_values)
with open('ans4.txt', 'w') as f:
    f.write(ans4)

14

In [36]:
# Show the rounded mean F1 values currently held in f_values
# (one entry per element of ngram_ranges).
f_values

[0.93, 0.87, 0.95]

In [38]:
# Multinomial naive Bayes evaluated on a count matrix that is built once,
# outside of cross-validation, for each n-gram setting.
# NOTE(review): the vectorizer here is fitted on the full corpus, so the
# vocabulary also covers the validation folds; the results differ noticeably
# from the Pipeline-based run. Confirm which protocol is the intended one.
scores = []
for ngram in ngram_ranges:
    # Loop-specific names: reusing `X_counts` / `score` would overwrite the
    # values produced by earlier cells and make partial re-runs misleading.
    ngram_counts = CountVectorizer(ngram_range=ngram).fit_transform(text)
    mn_nb = MultinomialNB()
    ngram_score = cross_val_score(mn_nb, ngram_counts, labels, scoring='f1', cv=10).mean()
    scores.append(ngram_score)
    print('Ngram_range: ', ngram)
    print('Cross-val-score f1: %.4f\n' % ngram_score)

Ngram_range:  (2, 2)
Cross-val-score f1: 0.6455

Ngram_range:  (3, 3)
Cross-val-score f1: 0.3786

Ngram_range:  (1, 3)
Cross-val-score f1: 0.8879



In [39]:
# Final ans4.txt: naive Bayes F1 per n-gram setting from the pre-vectorized
# run, two decimals each, separated by spaces.
# The string is built before opening the file so a formatting error cannot
# leave a truncated ans4.txt; '%.2f' % sc already yields strings, so no
# extra str() conversion is needed.
ans4 = ' '.join('%.2f' % sc for sc in scores)
with open('ans4.txt', 'w') as f:
    f.write(ans4)

14

In [40]:
# Logistic regression on tf-idf features instead of raw counts:
# mean F1 over 10 cross-validation folds.
tfidf_steps = [
    ('tf_idf', TfidfVectorizer()),
    ('log_reg', LogisticRegression()),
]
pipeline = Pipeline(tfidf_steps)
fold_scores = cross_val_score(pipeline, text, labels, scoring='f1', cv=10)
score = fold_scores.mean()
print('Cross-val-score f1: %.4f\n' % score)

Cross-val-score f1: 0.8785



In [41]:
# Fifth answer is the fixed value -1.
# NOTE(review): presumably encodes that tf-idf scored lower than raw counts
# in the cells above - confirm against the assignment statement.
ans5 = str(-1)
with open('ans5.txt', 'w') as f:
    f.write(ans5)

2