In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the tab-separated SMS corpus into a two-column frame.
# Column names are supplied explicitly, so the first line of the file is
# treated as data (header=None) rather than as a header row.
# `pd.read_csv(..., sep="\t")` is equivalent to `pd.read_table` but does not
# emit the deprecation warning some pandas releases raise for `read_table`.
sms_data = pd.read_csv("sms.tsv", sep="\t", header=None, names=["label", "message"])
sms_data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Encode the string labels as integer targets: ham -> 0, spam -> 1.
sms_data['target'] = sms_data['label'].map({'ham': 0, 'spam': 1})

In [5]:
# Display the (rows, columns) shape of the frame after adding `target`.
sms_data.shape

(5572, 3)

In [6]:
# Preview the first rows to confirm the new `target` column.
sms_data.head()

Unnamed: 0,label,message,target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# Features are the raw message text; the target is the 0/1 spam indicator.
X, y = sms_data['message'], sms_data['target']

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the messages for testing. Stratifying on y keeps the
# ham/spam ratio the same in both splits; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Print the shape of each split, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(4457,)
(1115,)
(4457,)
(1115,)


# count vectorization

In [11]:
# Bag-of-words vectorizer with the library's default settings.
vect = CountVectorizer()

In [12]:
# Learn the vocabulary from the training messages only and build their
# document-term count matrix.
X_train_vect = vect.fit_transform(X_train)

In [13]:
# Encode the test messages with the vocabulary fitted on the training set
# (transform only, no refit).
X_test_vect = vect.transform(X_test)

In [14]:
# Optional: inspect the learned vocabulary.
# NOTE(review): the original line referenced `vect_obj`, which is not defined
# in the visible cells; the fitted vectorizer here is `vect`.
# vect.get_feature_names()

# tf-idf transformation

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

In [16]:
# TF-IDF re-weighting: IDF enabled and smoothed, rows L2-normalised.
tf_idf = TfidfTransformer(use_idf=True, smooth_idf=True, norm='l2')

In [17]:
# Fit the TF-IDF transformer on the training counts only.
tf_idf.fit(X_train_vect)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [18]:
# Apply the training-fitted TF-IDF weights to both count matrices.
X_train_tf, X_test_tf = tf_idf.transform(X_train_vect), tf_idf.transform(X_test_vect)

# logistic regression tf idf

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Baseline logistic regression with default hyper-parameters.
log_reg = LogisticRegression()

In [21]:
# Train on the unigram TF-IDF features of the training split.
log_reg_clf = log_reg.fit(X_train_tf,y_train)



In [22]:
# Accuracy on the held-out test split.
log_reg_clf_score = log_reg_clf.score(X_test_tf,y_test)

In [23]:
# Display the test accuracy.
log_reg_clf_score

0.9730941704035875

In [24]:
# Class predictions for the test split, used by the metrics below.
y_pred_lreg = log_reg_clf.predict(X_test_tf)

In [25]:
from sklearn.metrics import confusion_matrix,recall_score,f1_score,precision_score

In [26]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test, y_pred_lreg)

array([[966,   0],
       [ 30, 119]])

In [27]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test, y_pred_lreg))
print("recall:", recall_score(y_test, y_pred_lreg))
print("f1-score:", f1_score(y_test, y_pred_lreg))

precision: 1.0
recall: 0.7986577181208053
f1-score: 0.8880597014925372


# random forest tf idf

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Random forest with default hyper-parameters and a fixed seed.
rnd_tf = RandomForestClassifier(random_state=42)

In [30]:
# Train on the unigram TF-IDF features of the training split.
rnd_tf_clf = rnd_tf.fit(X_train_tf,y_train)



In [31]:
# Accuracy on the held-out test split.
rnd_tf_clf_score = rnd_tf_clf.score(X_test_tf,y_test)

In [32]:
# Display the test accuracy.
rnd_tf_clf_score

0.9641255605381166

In [33]:
# Class predictions for the test split, used by the metrics below.
y_pred_rnd = rnd_tf_clf.predict(X_test_tf)

In [34]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test, y_pred_rnd)

array([[966,   0],
       [ 40, 109]])

In [35]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test, y_pred_rnd))
print("recall:", recall_score(y_test, y_pred_rnd))
print("f1-score:", f1_score(y_test, y_pred_rnd))

precision: 1.0
recall: 0.7315436241610739
f1-score: 0.8449612403100776


# using 1- to 3-grams (unigrams, bigrams and trigrams)

In [36]:
# Count vectorizer over word n-grams of length 1 through 3
# (ngram_range is inclusive on both ends).
vect_3g = CountVectorizer(ngram_range=(1,3))

In [37]:
# Learn the n-gram vocabulary from the training messages and count them.
X_train_3g = vect_3g.fit_transform(X_train)

In [38]:
# Encode the test messages with the training-fitted n-gram vocabulary.
X_test_3g = vect_3g.transform(X_test)

In [39]:
# Separate TF-IDF transformer for the n-gram counts, same settings as before:
# IDF enabled and smoothed, rows L2-normalised.
tf_idf_3g = TfidfTransformer(use_idf=True, smooth_idf=True, norm='l2')

In [40]:
# Fit the TF-IDF transformer on the training n-gram counts only.
tf_idf_3g.fit(X_train_3g)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [41]:
# TF-IDF weighted n-gram features for the training split.
X_train_tf3g = tf_idf_3g.transform(X_train_3g)

In [42]:
# TF-IDF weighted n-gram features for the test split, using the weights
# fitted on the training split.
X_test_tf3g = tf_idf_3g.transform(X_test_3g)

In [43]:
# Print the shapes of the count and TF-IDF matrices, one per line; train and
# test must agree on the number of columns.
print(
    X_train_3g.shape,
    X_test_3g.shape,
    X_train_tf3g.shape,
    X_test_tf3g.shape,
    sep="\n",
)

(4457, 87201)
(1115, 87201)
(4457, 87201)
(1115, 87201)


# logistic regression 3 grams

In [44]:
# Logistic regression with default hyper-parameters for the n-gram features.
lreg_3g = LogisticRegression()

In [45]:
# Train on the n-gram TF-IDF features of the training split.
lreg_3g_clf = lreg_3g.fit(X_train_tf3g,y_train)



In [46]:
# Sanity check: display the shape of the test feature matrix before scoring.
X_test_tf3g.shape

(1115, 87201)

In [47]:
# Accuracy on the held-out test split.
lreg_3g_score = lreg_3g_clf.score(X_test_tf3g,y_test)

In [48]:
# Display the test accuracy.
lreg_3g_score

0.9524663677130045

In [49]:
# Class predictions of the n-gram logistic regression on the test split.
y_pred_reg = lreg_3g_clf.predict(X_test_tf3g)

In [50]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test, y_pred_reg)

array([[966,   0],
       [ 53,  96]])

In [51]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test, y_pred_reg))
print("recall:", recall_score(y_test, y_pred_reg))
print("f1-score:", f1_score(y_test, y_pred_reg))

precision: 1.0
recall: 0.6442953020134228
f1-score: 0.7836734693877551


# random forest 3 grams

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
# Random forest with default hyper-parameters and a fixed seed for the
# n-gram features.
rnd_3g = RandomForestClassifier(random_state=42)

In [54]:
# Train on the n-gram TF-IDF features of the training split.
rnd_3g_clf = rnd_3g.fit(X_train_tf3g,y_train)



In [55]:
# Accuracy on the held-out test split.
rnd_3g_score = rnd_3g_clf.score(X_test_tf3g,y_test)

In [56]:
# Display the test accuracy.
rnd_3g_score

0.95695067264574

In [58]:
# Class predictions of the n-gram random forest on the test split.
y_pred_rnd3g = rnd_3g_clf.predict(X_test_tf3g)

In [59]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test, y_pred_rnd3g)

array([[966,   0],
       [ 48, 101]])

In [60]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test, y_pred_rnd3g))
print("recall:", recall_score(y_test, y_pred_rnd3g))
print("f1-score:", f1_score(y_test, y_pred_rnd3g))

precision: 1.0
recall: 0.6778523489932886
f1-score: 0.808


# upsampling and downsampling techniques

# upsampling

In [61]:
from imblearn.over_sampling import SMOTE

In [62]:
# Class balance of the training labels before resampling.
n_spam_msg = "Before OverSampling, counts of label '1': {}"
n_ham_msg = "Before OverSampling, counts of label '0': {} \n"
print(n_spam_msg.format((y_train == 1).sum()))
print(n_ham_msg.format((y_train == 0).sum()))

Before OverSampling, counts of label '1': 598
Before OverSampling, counts of label '0': 3859 



In [63]:
# Balance the training set by oversampling the minority class with SMOTE.
# Only the training features are resampled; the test split is left untouched.
# `fit_resample` replaces `fit_sample`, which was deprecated and later removed
# from imbalanced-learn, so the old call fails on current releases.
# `.values` passes the labels as a plain NumPy array (as `.ravel()` did)
# without relying on `Series.ravel`, which recent pandas deprecates.
sm = SMOTE(random_state=42)
X_train_up, y_train_up = sm.fit_resample(X_train_tf, y_train.values)

In [64]:
# Shapes of the resampled training data.
print('After OverSampling, the shape of train_X: {}'.format(X_train_up.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_up.shape))

# Class balance after resampling; both counts should now match.
print("After OverSampling, counts of label '1': {}".format((y_train_up == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_up == 0).sum()))

After OverSampling, the shape of train_X: (7718, 7668)
After OverSampling, the shape of train_y: (7718,) 

After OverSampling, counts of label '1': 3859
After OverSampling, counts of label '0': 3859


# logistic regression after upsampling

In [65]:
# Logistic regression with default hyper-parameters for the oversampled data.
log_reg_up = LogisticRegression()

In [66]:
# Train on the SMOTE-balanced training set.
lr_ups = log_reg_up.fit(X_train_up,y_train_up) 



In [67]:
# Accuracy on the original (not resampled) test split; the bare name on the
# last line displays it.
lr_score = lr_ups.score(X_test_tf,y_test)
lr_score

0.9838565022421525

In [68]:
# Class predictions on the original test split.
y_pred_up = lr_ups.predict(X_test_tf)

In [69]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test, y_pred_up)

array([[961,   5],
       [ 13, 136]])

In [70]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test, y_pred_up))
print("recall:", recall_score(y_test, y_pred_up))
print("f1-score:", f1_score(y_test, y_pred_up))

precision: 0.9645390070921985
recall: 0.912751677852349
f1-score: 0.9379310344827586


# random forest upsampling

In [71]:
# Random forest with default hyper-parameters and a fixed seed for the
# oversampled data.
rnd_up = RandomForestClassifier(random_state=42)

In [72]:
# Train on the SMOTE-balanced training set.
rnd_ups = rnd_up.fit(X_train_up,y_train_up)



In [74]:
# Accuracy on the original (not resampled) test split; the bare name on the
# last line displays it.
rnd_ups_score = rnd_ups.score(X_test_tf,y_test)
rnd_ups_score

0.9730941704035875

In [75]:
# Class predictions on the original test split.
y_pred_rnd_up = rnd_ups.predict(X_test_tf)

In [76]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test, y_pred_rnd_up)

array([[966,   0],
       [ 30, 119]])

In [77]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test, y_pred_rnd_up))
print("recall:", recall_score(y_test, y_pred_rnd_up))
print("f1-score:", f1_score(y_test, y_pred_rnd_up))

precision: 1.0
recall: 0.7986577181208053
f1-score: 0.8880597014925372


# downsampling

In [78]:
# Shuffle the whole dataset (frac=1 draws every row) with a fixed seed, so the
# ham rows sliced off in the next step are a reproducible random subset.
sms_data_ds = sms_data.sample(frac = 1,random_state = 42)

In [79]:
# Undersample the majority class: keep every spam row and an equally sized
# slice of the already-shuffled ham rows. The ham count is derived from the
# spam count rather than the hard-coded 747, so the two classes stay balanced
# if the dataset changes.
# NOTE(review): this balances the full dataset before the train/test split, so
# the downsampled test set is balanced too. Its metrics are therefore not
# directly comparable with the earlier sections, which are evaluated on the
# original class mix.
spam = sms_data_ds.loc[sms_data_ds["target"] == 1]
ham = sms_data_ds.loc[sms_data_ds["target"] == 0].iloc[:len(spam)]

In [80]:
# Stack the spam rows and the sampled ham rows into one frame.
# NOTE(review): despite the name, nothing here is normally distributed; the
# frame is class-balanced (equal numbers of spam and ham rows).
normal_distributed_data = pd.concat([spam,ham])

In [81]:
# Shuffle again so spam and ham rows are interleaved rather than stacked.
sms_data_dsg = normal_distributed_data.sample(frac=1,random_state = 42)

In [82]:
# Preview the first rows of the downsampled, shuffled frame.
sms_data_dsg.head()

Unnamed: 0,label,message,target
3979,ham,Reason is if the team budget is available at l...,0
3803,ham,No da. I am happy that we sit together na,0
2306,ham,Still in the area of the restaurant. Ill try t...,0
3552,ham,I love you both too :-),0
297,ham,Unless it's a situation where YOU GO GURL woul...,0


In [83]:
# Display the (rows, columns) shape of the downsampled frame.
sms_data_dsg.shape

(1494, 3)

In [84]:
# Message text and 0/1 target from the downsampled frame.
X_dsg, y_dsg = sms_data_dsg['message'], sms_data_dsg['target']

In [85]:
# Print the shapes of the downsampled features and targets, one per line.
print(X_dsg.shape, y_dsg.shape, sep="\n")

(1494,)
(1494,)


In [86]:
# 80/20 stratified split of the downsampled data with a fixed seed.
X_train_ds, X_test_ds, y_train_ds, y_test_ds = train_test_split(
    X_dsg,
    y_dsg,
    test_size=0.2,
    stratify=y_dsg,
    random_state=42,
)

In [87]:
# Fresh bag-of-words vectorizer, fitted on the downsampled training messages
# only; the last line displays the fitted estimator.
ds_vect = CountVectorizer()
ds_vect.fit(X_train_ds)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [88]:
# Replace the raw training messages with their count matrix.
# NOTE(review): this rebinds `X_train_ds` from text to counts, so the cell is
# not idempotent; re-running it would feed the count matrix back into the
# vectorizer. Re-run the split cell first if this cell must be executed again.
X_train_ds = ds_vect.transform(X_train_ds) 

In [89]:
# Replace the raw test messages with their count matrix, using the vocabulary
# fitted on the training messages.
# NOTE(review): this rebinds `X_test_ds` from text to counts, so the cell is
# not idempotent; re-run the split cell before executing it again.
X_test_ds = ds_vect.transform(X_test_ds)

In [90]:
# TF-IDF transformer for the downsampled data, same settings as before:
# IDF enabled and smoothed, rows L2-normalised.
tf_idf_ds = TfidfTransformer(use_idf=True, smooth_idf=True, norm='l2')

In [91]:
# Fit the TF-IDF transformer on the downsampled training counts only.
tf_idf_ds.fit(X_train_ds)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [92]:
# Apply the training-fitted TF-IDF weights to both downsampled count matrices.
X_train_tf_ds, X_test_tf_ds = tf_idf_ds.transform(X_train_ds), tf_idf_ds.transform(X_test_ds)

In [93]:
# Print the shapes of the downsampled count matrices, one per line.
print(X_train_ds.shape, X_test_ds.shape, sep="\n")

(1195, 4063)
(299, 4063)


# logistic regression downsampling

In [97]:
# Logistic regression with default hyper-parameters, trained on the
# downsampled TF-IDF features; the last line displays the fitted estimator.
log_reg_dsg = LogisticRegression()
log_reg_dsg.fit(X_train_tf_ds,y_train_ds)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [98]:
# Accuracy on the downsampled (class-balanced) test split.
log_reg_dsg_score = log_reg_dsg.score(X_test_tf_ds,y_test_ds)

In [99]:
# Display the test accuracy.
log_reg_dsg_score

0.959866220735786

In [100]:
# Class predictions on the downsampled test split.
y_pred_ds = log_reg_dsg.predict(X_test_tf_ds)

In [103]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test_ds, y_pred_ds)

array([[147,   3],
       [  9, 140]])

In [104]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test_ds, y_pred_ds))
print("recall:", recall_score(y_test_ds, y_pred_ds))
print("f1-score:", f1_score(y_test_ds, y_pred_ds))

precision: 0.9790209790209791
recall: 0.9395973154362416
f1-score: 0.958904109589041


# random forest downsampling

In [105]:
# Random forest with default hyper-parameters and a fixed seed for the
# downsampled data.
rnd_ds = RandomForestClassifier(random_state=42)

In [107]:
# Train on the downsampled TF-IDF features; the cell displays the fitted
# estimator.
rnd_ds.fit(X_train_tf_ds,y_train_ds)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [108]:
# Accuracy on the downsampled (class-balanced) test split.
rnd_ds_score = rnd_ds.score(X_test_tf_ds,y_test_ds)

In [109]:
# Display the test accuracy.
rnd_ds_score

0.9464882943143813

In [110]:
# Class predictions on the downsampled test split.
y_pred_rnd_ds = rnd_ds.predict(X_test_tf_ds)

In [111]:
# Confusion matrix: rows are true classes (0 = ham, 1 = spam), columns are
# predicted classes.
confusion_matrix(y_test_ds, y_pred_rnd_ds)

array([[147,   3],
       [ 13, 136]])

In [112]:
# Precision, recall and F1 for the positive class (spam = 1).
print("precision:", precision_score(y_test_ds, y_pred_rnd_ds))
print("recall:", recall_score(y_test_ds, y_pred_rnd_ds))
print("f1-score:", f1_score(y_test_ds, y_pred_rnd_ds))

precision: 0.9784172661870504
recall: 0.912751677852349
f1-score: 0.9444444444444444
