In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so pandas
# labels the columns with integers: column 0 is later compared against
# 'spam', and column 1 is fed to the text vectorizers.
# NOTE(review): default quote handling is left on; if any message contains an
# unbalanced double quote, neighbouring rows could be merged -- confirm the
# row count against the dataset description.
data = pd.read_csv(
    filepath_or_buffer='SMSSpamCollection.txt',
    sep='\t',
    header=None,
)

In [3]:
# Encode the label column as integers: 1 where the value is 'spam', else 0.
# Column 0 is overwritten in place, so re-running this cell without reloading
# `data` would compare integers to 'spam' and turn every label into 0.
data[0] = (data[0] == 'spam').astype('int64')

# CountVectorizer

In [4]:
# Bag-of-words vectorizer with all-default settings.
countVec = CountVectorizer()

In [5]:
# Learn the vocabulary from every row of column 1 and build the count matrix.
# The vectorizer is fitted on the whole column before any cross-validation
# split, so held-out folds also contribute to the vocabulary.
X_countVec = countVec.fit_transform(data[1])

In [6]:
# Logistic regression with default hyperparameters; the seed is pinned so
# repeated runs use the same random state.
clfLog = LogisticRegression(random_state=2)

In [7]:
def scorer(clf, X, y):
    """Score a fitted classifier by F1 on the given samples.

    Has the ``(estimator, X, y)`` call shape, which is how it is passed as
    ``scoring=`` to ``cross_val_score`` in this notebook.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : feature matrix to predict on.
    y : true labels for the rows of ``X``.

    Returns
    -------
    The result of ``f1_score`` on ``y`` versus the predictions, using that
    function's default settings.
    """
    predicted = clf.predict(X)
    return f1_score(y_true=y, y_pred=predicted)

In [8]:
# 10-fold cross-validation of the logistic regression on the count features,
# scored per fold with the custom F1 `scorer`.
scores = cross_val_score(
    estimator=clfLog,
    X=X_countVec,
    y=data[0],
    scoring=scorer,
    cv=10,
    n_jobs=-1,
)

In [9]:
# Show the per-fold F1 values (one entry per cross-validation fold).
scores

array([ 0.95890411,  0.89855072,  0.91549296,  0.95833333,  0.93706294,
        0.91304348,  0.94444444,  0.92753623,  0.92198582,  0.95104895])

In [10]:
# Show the F1 averaged over the folds.
scores.mean()

0.9326402983610631

In [11]:
# Save the first answer: the mean cross-validated F1, rounded to one decimal.
# The value is taken from `scores` rather than typed in as a literal, so the
# file cannot silently disagree with the computed result. For the recorded
# mean (0.9326...) this writes exactly '0.9'.
with open('ans1.txt', 'w') as answer_file:
    answer_file.write(str(round(scores.mean(), 1)))

# Predict

In [12]:
# Fit the logistic regression on the full count matrix and all labels so it
# can classify new messages. `fit` returns the estimator, so the cell
# displays its repr.
clfLog.fit(X_countVec, data[0])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Hand-written messages to classify with the fitted model: some are variations
# of one another (a full text and a shortened or extended version of it).
tests = [
    'FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB',
    'FreeMsg: Txt: claim your reward of 3 hours talk time',
    'Have you visited the last lecture on physics?',
    'Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$',
    'Only 99$'
]

In [14]:
# Vectorize the new messages with the already-fitted vectorizer: `transform`
# (not `fit_transform`) keeps the vocabulary learned from the training corpus.
tests_countVec = countVec.transform(tests)

In [15]:
# Predict a label for each test message (labels were encoded as 1 = spam, 0 = not spam).
predictions = clfLog.predict(tests_countVec)

In [16]:
# Show the predicted labels, in the same order as `tests`.
predictions

array([1, 1, 0, 0, 0])

In [17]:
# Save the second answer: the predicted labels, separated by single spaces.
with open('ans2.txt', 'w') as answer_file:
    answer_file.write(' '.join(map(str, predictions)))

# n-gram

In [18]:
# N-gram ranges to compare, each passed as `ngram_range` to CountVectorizer:
# bigrams only, trigrams only, and unigrams through trigrams.
n_gram_tuples = [(2,2), (3,3), (1,3)]

In [19]:
# Mean 10-fold F1 of the logistic regression for each n-gram range, collected
# in the same order as `n_gram_tuples`.
#
# Loop-local names are used deliberately: rebinding the notebook-level
# `countVec` / `X_countVec` / `scores` here would leave them pointing at the
# last n-gram configuration, so re-running the earlier prediction cells would
# transform text with a vocabulary the fitted `clfLog` was never trained on.
ans = []
for ngram_range in n_gram_tuples:
    ngram_vec = CountVectorizer(ngram_range=ngram_range)
    X_ngram = ngram_vec.fit_transform(data[1])

    fold_scores = cross_val_score(estimator=clfLog, X=X_ngram, y=data[0], cv=10, n_jobs=-1, scoring=scorer)
    ans.append(fold_scores.mean())

In [20]:
# Show the mean F1 per n-gram range, in the order of `n_gram_tuples`.
ans

[0.82242206641871329, 0.72501615554673771, 0.92513825586488374]

In [21]:
# Save the third answer: each mean F1 rounded to two decimals, space-separated.
with open('ans3.txt', 'w') as answer_file:
    answer_file.write(' '.join(str(round(score, 2)) for score in ans))

# MultinomialNB

In [22]:
# Mean 10-fold F1 of multinomial naive Bayes for each n-gram range, collected
# in the same order as `n_gram_tuples`. Note that `ans` is rebound here, so it
# no longer holds the logistic-regression results from the previous section.
#
# The estimator is created once outside the loop because its construction
# does not depend on the n-gram range. Loop-local names keep the notebook-level
# `countVec` / `X_countVec` / `scores` from being overwritten by the last
# iteration.
clfNB = MultinomialNB()

ans = []
for ngram_range in n_gram_tuples:
    ngram_vec = CountVectorizer(ngram_range=ngram_range)
    X_ngram = ngram_vec.fit_transform(data[1])

    fold_scores = cross_val_score(estimator=clfNB, X=X_ngram, y=data[0], cv=10, n_jobs=-1, scoring=scorer)
    ans.append(fold_scores.mean())

In [23]:
# Show the naive Bayes mean F1 per n-gram range, in the order of `n_gram_tuples`.
ans

[0.64550151779854426, 0.37871948524573595, 0.88848596560610016]

In [24]:
# Save the fourth answer: each mean F1 rounded to two decimals, space-separated.
with open('ans4.txt', 'w') as answer_file:
    answer_file.write(' '.join(str(round(score, 2)) for score in ans))

# TfidfVectorizer

In [25]:
# TF-IDF vectorizer with all-default settings.
tfidf_Vec = TfidfVectorizer()

In [26]:
# Fit the TF-IDF vectorizer on every row of column 1 and build the feature
# matrix. As with the count features, fitting happens on the whole column
# before cross-validation splits it.
X_tfidf = tfidf_Vec.fit_transform(data[1])

In [27]:
# 10-fold cross-validation of the logistic regression on the TF-IDF features,
# scored per fold with the custom F1 `scorer`. Rebinds `scores`.
scores = cross_val_score(
    estimator=clfLog,
    X=X_tfidf,
    y=data[0],
    scoring=scorer,
    cv=10,
    n_jobs=-1,
)

In [28]:
# Show the per-fold F1 values for the TF-IDF features.
scores

array([ 0.87407407,  0.84210526,  0.83076923,  0.88888889,  0.85496183,
        0.84615385,  0.82170543,  0.8372093 ,  0.8372093 ,  0.89552239])

In [29]:
# Show the TF-IDF F1 averaged over the folds.
scores.mean()

0.85285995541724557

In [30]:
# Save the fifth answer as the literal string '-1'.
# NOTE(review): presumably -1 encodes "quality dropped" -- the TF-IDF mean F1
# (~0.853) is lower than the CountVectorizer one (~0.933) -- confirm against
# the task statement.
with open('ans5.txt', 'w') as answer_file:
    answer_file.write('-1')