In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
sms_data = pd.read_table("sms.tsv", names=['label', 'message'])
sms_data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
sms_data.shape

(5572, 2)

In [6]:
# Quick way of converting string label to numeric targets
sms_data['target'] = sms_data.label.map({'ham':0,'spam':1})

In [7]:
sms_data.head()

Unnamed: 0,label,message,target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
X = sms_data.message
y = sms_data.target

In [9]:
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,stratify = y,random_state = 42)

In [16]:
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [17]:
X_train_dtm = vect.transform(X_train)

In [18]:
vect.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '0789xxxxxxx',
 '0796xxxxxx',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700469649',
 '08700621170150p',
 '08701213186',
 '08701237397',
 '08701417012',
 '087014

In [19]:
len(vect.get_feature_names())

7668

In [20]:
X_train_dtm.shape

(4457, 7668)

In [21]:
X_test_dtm = vect.transform(X_test)

In [23]:
X_test_dtm.shape

(1115, 7668)

# logistic regression

In [24]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()

In [25]:
%time lr_clf.fit(X_train_dtm,y_train)



CPU times: user 176 ms, sys: 12 ms, total: 188 ms
Wall time: 355 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
score_reg = lr_clf.score(X_test_dtm,y_test)

In [29]:
score_reg

0.9820627802690582

In [30]:
y_pred = lr_clf.predict(X_test_dtm)

In [31]:
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score

In [32]:
confusion_matrix(y_true=y_test,y_pred = y_pred)

array([[966,   0],
       [ 20, 129]])

In [33]:
print("precision:", precision_score(y_true=y_test, y_pred=y_pred))
print("recall:", recall_score(y_true=y_test, y_pred=y_pred))
print("f1-score:", f1_score(y_true=y_test, y_pred=y_pred))

precision: 1.0
recall: 0.8657718120805369
f1-score: 0.9280575539568345


# using 2 grams

In [35]:
vect2g = CountVectorizer(ngram_range=(1,2))
vect2g.fit(X_train)
vect2g.get_feature_names()

['00',
 '00 in',
 '00 per',
 '00 sub',
 '00 subs',
 '000',
 '000 bonus',
 '000 cash',
 '000 homeowners',
 '000 pounds',
 '000 price',
 '000 prize',
 '000 xmas',
 '000pes',
 '000pes so',
 '008704050406',
 '008704050406 sp',
 '0089',
 '0089 my',
 '0121',
 '0121 2025050',
 '01223585236',
 '01223585236 xx',
 '01223585334',
 '01223585334 to',
 '02',
 '02 06',
 '02 09',
 '02 claimcode',
 '02 user',
 '0207',
 '0207 083',
 '0207 153',
 '02072069400',
 '02072069400 bx',
 '02073162414',
 '02073162414 now',
 '02085076972',
 '02085076972 reply',
 '021',
 '021 3680',
 '03',
 '03 05',
 '03 is',
 '03 our',
 '03 this',
 '04',
 '04 call',
 '0430',
 '0430 jul',
 '05',
 '05 05',
 '05 or',
 '050703',
 '050703 csbcm4235wc1n3xx',
 '0578',
 '0578 now',
 '06',
 '06 03',
 '06 05',
 '06 11',
 '06 good',
 '07',
 '07 11',
 '07008009200',
 '07046744435',
 '07046744435 now',
 '07099833605',
 '07099833605 to',
 '07123456789',
 '07123456789 to',
 '0721072',
 '0721072 to',
 '07732584351',
 '07732584351 rodger',
 '0773

In [37]:
X_train_2g = vect2g.transform(X_train)

In [38]:
X_train_2g.shape

(4457, 42696)

In [39]:
len(vect2g.get_feature_names())

42696

In [40]:
X_test_2g = vect2g.transform(X_test)

In [42]:
X_test_2g.shape

(1115, 42696)

# logistic regression 2gram

In [44]:
from sklearn.linear_model import LogisticRegression

In [46]:
lr = LogisticRegression()
%time lr.fit(X_train_2g,y_train)

CPU times: user 416 ms, sys: 8 ms, total: 424 ms
Wall time: 310 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [48]:
score_lr = lr.score(X_test_2g,y_test)
score_lr

0.97847533632287

In [51]:
yy_pred = lr.predict(X_test_2g)

In [52]:
yy_pred[:10]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [53]:
confusion_matrix(y_true = y_test,y_pred=yy_pred)

array([[966,   0],
       [ 24, 125]])

In [65]:
print("precision:", precision_score(y_true=y_test, y_pred=yy_pred))
print("recall:", recall_score(y_true=y_test, y_pred=yy_pred))
print("f1-score:", f1_score(y_true=y_test, y_pred=yy_pred))

precision: 1.0
recall: 0.8389261744966443
f1-score: 0.9124087591240876


# random forest 2 grams

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
rf = RandomForestClassifier(random_state=42)

In [59]:
rf_clf = rf.fit(X_train_2g,y_train)

In [60]:
rf_score = rf_clf.score(X_test_2g,y_test)

In [61]:
rf_score

0.9668161434977578

In [62]:
yyy_pred = rf_clf.predict(X_test_2g)

In [63]:
confusion_matrix(y_true=y_test,y_pred=yyy_pred)

array([[966,   0],
       [ 37, 112]])

In [64]:
print("precision:", precision_score(y_true=y_test, y_pred=yyy_pred))
print("recall:", recall_score(y_true=y_test, y_pred=yyy_pred))
print("f1-score:", f1_score(y_true=y_test, y_pred=yyy_pred))

precision: 1.0
recall: 0.7516778523489933
f1-score: 0.8582375478927203


# using upsampling and downsampling techniques

# upsampling

In [67]:
from imblearn.over_sampling import SMOTE

In [68]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

Before OverSampling, counts of label '1': 598
Before OverSampling, counts of label '0': 3859 



In [71]:
sm = SMOTE(random_state=42)
X_train_up,y_train_up = sm.fit_sample(X_train_dtm,y_train.ravel())

In [73]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_up.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_up.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_up==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_up==0)))

After OverSampling, the shape of train_X: (7718, 7668)
After OverSampling, the shape of train_y: (7718,) 

After OverSampling, counts of label '1': 3859
After OverSampling, counts of label '0': 3859


 # logistic regression after upsampling

In [78]:
log_reg_up = LogisticRegression()

In [80]:
lr_ups = log_reg_up.fit(X_train_up,y_train_up) 

In [81]:
lr_score = lr_ups.score(X_test_dtm,y_test)

In [82]:
lr_score

0.9461883408071748

In [83]:
y_test.shape

(1115,)

In [84]:
X_test_dtm.shape

(1115, 7668)

In [85]:
y_pred_up = lr_ups.predict(X_test_dtm)

In [86]:
confusion_matrix(y_true=y_test,y_pred=y_pred_up)

array([[919,  47],
       [ 13, 136]])

In [87]:
print("precision:", precision_score(y_true=y_test, y_pred=y_pred_up))
print("recall:", recall_score(y_true=y_test, y_pred=y_pred_up))
print("f1-score:", f1_score(y_true=y_test, y_pred=y_pred_up))

precision: 0.7431693989071039
recall: 0.912751677852349
f1-score: 0.819277108433735


# random forest after upsampling

In [88]:
rnd_up = RandomForestClassifier(random_state =42)

In [89]:
rnd_ups = rnd_up.fit(X_train_up,y_train_up)



In [91]:
rnd_ups_score = rnd_ups.score(X_test_dtm,y_test)

In [92]:
rnd_ups_score

0.9273542600896861

In [93]:
y_pred_rnd = rnd_ups.predict(X_test_dtm)

In [94]:
confusion_matrix(y_true = y_test,y_pred = y_pred_rnd)

array([[911,  55],
       [ 26, 123]])

In [95]:
print("precision:", precision_score(y_true=y_test, y_pred=y_pred_rnd))
print("recall:", recall_score(y_true=y_test, y_pred=y_pred_rnd))
print("f1-score:", f1_score(y_true=y_test, y_pred=y_pred_rnd))

precision: 0.6910112359550562
recall: 0.825503355704698
f1-score: 0.7522935779816513


# downsampling

In [141]:
sms_data_ds = sms_data.sample(frac = 1,random_state = 42)

In [142]:
spam = sms_data_ds.loc[sms_data_ds["target"]==1]
ham = sms_data_ds.loc[sms_data_ds["target"]==0][:747]

In [143]:
normal_distributed_data = pd.concat([spam,ham])

In [144]:
sms_data_dsg = normal_distributed_data.sample(frac=1,random_state = 42)

In [145]:
sms_data_dsg.head()

Unnamed: 0,label,message,target
3979,ham,Reason is if the team budget is available at l...,0
3803,ham,No da. I am happy that we sit together na,0
2306,ham,Still in the area of the restaurant. Ill try t...,0
3552,ham,I love you both too :-),0
297,ham,Unless it's a situation where YOU GO GURL woul...,0


In [146]:
sms_data_dsg.shape

(1494, 3)

In [147]:
X_dsg = sms_data_dsg.message
y_dsg = sms_data_dsg.target

In [169]:
print(X_dsg.shape)
print(y_dsg.shape)

(1494,)
(1494,)


In [149]:
len(spam)

747

In [150]:
len(ham)

747

In [151]:
X_train_ds,X_test_ds,y_train_ds,y_test_ds = train_test_split(X_dsg,y_dsg,test_size = 0.2,stratify = y_dsg,random_state = 42)

In [152]:
print(X_train_ds.shape)
print(X_test_ds.shape)
print(y_train_ds.shape)
print(y_test_ds.shape)

(1195,)
(299,)
(1195,)
(299,)


In [153]:
ds_vect = CountVectorizer()
ds_vect.fit(X_train_ds)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [154]:
X_train_ds = ds_vect.transform(X_train_ds) 

In [155]:
X_test_ds = ds_vect.transform(X_test_ds)

In [156]:
print(X_train_ds.shape)
print(X_test_ds.shape)

(1195, 4063)
(299, 4063)


# logistic regression downsampling

In [157]:
log_reg_dsg = LogisticRegression()
log_reg_dsg.fit(X_train_ds,y_train_ds)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [158]:
log_ds_score = log_reg_dsg.score(X_test_ds,y_test_ds)

In [159]:
log_ds_score

0.9531772575250836

In [160]:
y_pred_ds = log_reg_dsg.predict(X_test_ds)

In [161]:
confusion_matrix(y_true = y_test_ds,y_pred = y_pred_ds)

array([[147,   3],
       [ 11, 138]])

In [162]:
print("precision:", precision_score(y_true=y_test_ds, y_pred=y_pred_ds))
print("recall:", recall_score(y_true=y_test_ds, y_pred=y_pred_ds))
print("f1-score:", f1_score(y_true=y_test_ds, y_pred=y_pred_ds))

precision: 0.9787234042553191
recall: 0.9261744966442953
f1-score: 0.9517241379310345


# random forest downsampling

In [163]:
rnd_ds = RandomForestClassifier(random_state=42)

In [164]:
rnd_ds.fit(X_train_ds,y_train_ds)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [165]:
rnd_ds_score = rnd_ds.score(X_test_ds,y_test_ds)
rnd_ds_score

0.9464882943143813

In [166]:
y_pred_rnd_ds = rnd_ds.predict(X_test_ds) 

In [167]:
confusion_matrix(y_true = y_test_ds,y_pred = y_pred_rnd_ds)

array([[148,   2],
       [ 14, 135]])

In [168]:
print("precision:", precision_score(y_true=y_test_ds, y_pred=y_pred_rnd_ds))
print("recall:", recall_score(y_true=y_test_ds, y_pred=y_pred_rnd_ds))
print("f1-score:", f1_score(y_true=y_test_ds, y_pred=y_pred_rnd_ds))

precision: 0.9854014598540146
recall: 0.9060402684563759
f1-score: 0.944055944055944
