# Binary classification

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn import model_selection

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

### Home task: Spam detection

Для заданной тестовой выборки построить модель для предсказания, является ли sms сообщение спамом.  
На заданном разбиении (df_train, df_test) ваша модель должна превзойти baseline'ы, приведенные ниже.  

Чем больше baseline'ов вы превзойдете, тем выше ваша оценка.  
Метрика качества: F1.


baseline 1: 0.9444      bag of words + Multinomial Naive Bayes  
baseline 2: 0.9490      symbol 3-grams with IDF and l2-norm + Logistic Regression  
baseline 3: 0.9636      text stemming + baseline 2  


! Your results must be reproducible. Если ваша модель стохастическая (как, например, LogisticRegression), то вы должны явно задавать все seed и random_state в параметрах моделей.  
! Вы должны использовать df_test только для измерения качества конечной обученной модели. 

In [4]:
# Load the raw SMS corpus, keep only the two columns that are used below
# and rename them to `target` and `text`.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'target', 'v2': 'text'})
)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Dataset size as a (rows, columns) tuple.
df.shape

(5572, 2)

In [6]:
# Relative frequency of each class label (fractions sum to 1),
# to see how imbalanced the dataset is.
df['target'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: target, dtype: float64

In [7]:
# Fixed train / hold-out split -- this splitting must not be changed.
#   * df_train: use for model training.
#   * df_test:  hold-out dataset, use only to estimate the performance
#               of the final model.
# The seed is fixed so that all results are reproducible.
SEED = 1337
df_train, df_test = model_selection.train_test_split(
    df,
    test_size=0.4,
    shuffle=True,
    stratify=df['target'],
    random_state=SEED,
)
print('train size %d, test size %d' % (len(df_train), len(df_test)))

train size 3343, test size 2229


## Baseline 4

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [47]:
# Word-level TF-IDF features with L2-normalised rows; text is lower-cased first.
# The vectorizer is fitted on the training split only.
tfidf = TfidfVectorizer(analyzer='word', lowercase=True, norm='l2')
X_train = tfidf.fit_transform(df_train['text'])

# Encode the string labels as integers. The fitted encoder is kept so that
# the same mapping can be applied to other splits later.
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(df_train['target'])

In [48]:
# Use the standard-library module directly: `warnings` is not a documented
# attribute of the sklearn package, so `from sklearn import warnings` is fragile.
import warnings

from sklearn.svm import LinearSVC

# Hyper-parameter search for a linear SVM on the TF-IDF training features.
param_grid = {
    'loss': ('hinge', 'squared_hinge'),
    'C': [1, 3, 5, 7],
}

with warnings.catch_warnings():
    # Silence library warnings only for the duration of the search;
    # the previous warning filters are restored when the block exits.
    warnings.simplefilter("ignore")
    clf = GridSearchCV(
        LinearSVC(random_state=SEED),
        param_grid,
        # Select by F1 of the positive class (label 1), the metric the task
        # is graded on, instead of the estimator's default accuracy score.
        scoring='f1',
        # Pin the fold count: the recorded run used 3 folds, and the
        # GridSearchCV default changed to 5 in scikit-learn 0.22.
        cv=3,
        verbose=2,
    )
    clf.fit(X_train, y_train)
    print(clf.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] C=1, loss=hinge .................................................
[CV] .................................. C=1, loss=hinge, total=   0.0s
[CV] C=1, loss=hinge .................................................
[CV] .................................. C=1, loss=hinge, total=   0.0s
[CV] C=1, loss=hinge .................................................
[CV] .................................. C=1, loss=hinge, total=   0.0s
[CV] C=1, loss=squared_hinge .........................................
[CV] .......................... C=1, loss=squared_hinge, total=   0.0s
[CV] C=1, loss=squared_hinge .........................................
[CV] .......................... C=1, loss=squared_hinge, total=   0.0s
[CV] C=1, loss=squared_hinge .........................................
[CV] .......................... C=1, loss=squared_hinge, total=   0.0s
[CV] C=3, loss=hinge .................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV] C=5, loss=squared_hinge .........................................
[CV] .......................... C=5, loss=squared_hinge, total=   0.0s
[CV] C=7, loss=hinge .................................................
[CV] .................................. C=7, loss=hinge, total=   0.0s
[CV] C=7, loss=hinge .................................................
[CV] .................................. C=7, loss=hinge, total=   0.0s
[CV] C=7, loss=hinge .................................................
[CV] .................................. C=7, loss=hinge, total=   0.0s
[CV] C=7, loss=squared_hinge .........................................
[CV] .......................... C=7, loss=squared_hinge, total=   0.0s
[CV] C=7, loss=squared_hinge .........................................
[CV] .......................... C=7, loss=squared_hinge, total=   0.0s
[CV] C=7, loss=squared_hinge .........................................
[CV] .......................... C=7, loss=squared_hinge, total=   0.0s
{'C':

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    0.3s finished


In [49]:
# Final model: linear SVM with fixed hyper-parameters.
# NOTE(review): C=3 / loss='hinge' are presumably the values selected by the
# grid search above -- confirm against clf.best_params_.
model = LinearSVC(C=3, loss='hinge', random_state=SEED)
model.fit(X_train, y_train)

# The hold-out split is only transformed with the already-fitted vectorizer
# and label encoder; nothing is fitted on df_test.
X_test = tfidf.transform(df_test.text)
y_test = label_enc.transform(df_test.target)

# Predict once and reuse the result for scoring (the prediction was previously
# computed twice and the stored copy was never used).
y_pred = model.predict(X_test)
print('test', metrics.f1_score(y_test, y_pred))

test 0.949740034662
