In [2]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_table('data/SMS.tsv', sep='\t', header=None, names=['label', 'sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['sms_message'])
features = vectorizer.get_feature_names_out()


In [5]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
y = df['label']
df

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [6]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)

important_features = {'RFC': pd.DataFrame(clf.feature_importances_, features,
                                          columns=['importance']).sort_values('importance', ascending=False).head(30)}
important_features['RFC']

Unnamed: 0,importance
call,0.026573
txt,0.022648
claim,0.02077
free,0.018687
www,0.017882
mobile,0.016837
150p,0.01506
stop,0.012711
prize,0.012244
service,0.011833


In [7]:
from sklearn.linear_model import LogisticRegression

weight_features = []
for j in range(0, len(features)):
    if j % 100 == 0:
        print(j)
    X_copy = X.copy()
    X_copy[:, j] = np.random.permutation(X_copy[:, j].toarray().ravel())
    lr = LogisticRegression()
    lr.fit(X_copy, y)
    weight_features.append(lr.score(X_copy, y))
important_features['PI'] = pd.DataFrame(weight_features, features, columns=['importance']).sort_values('importance', ascending=False).head(30)
important_features['PI']

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700


Unnamed: 0,importance
now,0.978105
to,0.977925
contact,0.977746
or,0.977746
but,0.977566
name,0.977566
we,0.977566
love,0.977566
www,0.977566
as,0.977566


In [8]:
from scipy.stats import chi2_contingency

weight_features = []
for i in range(len(features)):
    if i % 100 == 0:
        print(i)
    contingency_table = pd.crosstab(y, X[:, i].toarray().ravel())
    chi2, p, dof, expected = chi2_contingency(contingency_table.values)
    weight_features.append(chi2)
important_features['CHI'] = pd.DataFrame(weight_features, features, columns=['importance']).sort_values('importance', ascending=False).head(30)
important_features['CHI']

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700


Unnamed: 0,importance
to,3341.289567
call,2263.869865
you,1773.752468
your,1559.972139
now,1298.950637
for,1240.911568
the,1214.578614
or,1202.939664
free,1138.476927
is,1014.672791


In [22]:
from sklearn.feature_selection import SelectKBest, f_classif

algorithm = SelectKBest(f_classif, k=30)
X_new = algorithm.fit_transform(X, y)
words = [features[i] for i in algorithm.get_support(indices=True)]
important_features['SKB'] = pd.DataFrame(words, words, columns=['importance']).sort_values('importance', ascending=False)
print(important_features['SKB'])

            importance
www                www
won                won
win                win
urgent          urgent
uk                  uk
txt                txt
tone              tone
stop              stop
service        service
reply            reply
prize            prize
nokia            nokia
mobile          mobile
guaranteed  guaranteed
free              free
cs                  cs
contact        contact
co                  co
claim            claim
cash              cash
call              call
awarded        awarded
500                500
50                  50
18                  18
16                  16
150ppm          150ppm
150p              150p
1000              1000
100                100


In [23]:
from sklearn.feature_selection import RFE

clf = RandomForestClassifier(n_estimators=100, random_state=0)
model = RFE(clf, n_features_to_select=30, step=0.1)
model.fit(X, y)
words = [features[i] for i in model.get_support(indices=True)]
important_features['RFE'] = pd.DataFrame(words, words, columns=['importance']).sort_values('importance', ascending=False)
print(important_features['RFE'])

         importance
your           your
you             you
www             www
won             won
win             win
urgent       urgent
uk               uk
txt             txt
to               to
text           text
stop           stop
service     service
ringtone   ringtone
reply         reply
prize         prize
or               or
now             now
nokia         nokia
mobile       mobile
me               me
free           free
contact     contact
com             com
co               co
claim         claim
call           call
50               50
18               18
16               16
150p           150p


In [25]:
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel

lr = LogisticRegression()
sfm_lr = SelectFromModel(lr, max_features=30)
sfm_lr.fit(X, y)
words = [features[i] for i in sfm_lr.get_support(indices=True)]
important_features['SFM'] = pd.DataFrame(words, words, columns=['importance']).sort_values('importance', ascending=False)
print(important_features['SFM'])

         importance
your           your
www             www
won             won
win             win
urgent       urgent
uk               uk
txt             txt
to               to
text           text
stop           stop
service     service
ringtone   ringtone
reply         reply
prize         prize
or               or
new             new
my               my
mobile       mobile
me               me
from           from
free           free
com             com
co               co
claim         claim
chat           chat
cash           cash
call           call
50               50
18               18
150p           150p


In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def compare_methods(model):
    print(model)
    model.fit(X_train, y_train)
    print(model.score(X_test, y_test))

    for method in important_features:
        best_features = important_features[method].index.values
        X_train_new = X_train[:, [np.where(features == feature)[0][0] for feature in best_features]]
        X_test_new = X_test[:, [np.where(features == feature)[0][0] for feature in best_features]]
        model.fit(X_train_new, y_train)
        print(method, " : ", model.score(X_test_new, y_test))

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

compare_methods(KNeighborsClassifier())
compare_methods(SVC())
compare_methods(MultinomialNB())



KNeighborsClassifier()
0.9147982062780269
RFC  :  0.9623318385650225
PI  :  0.9228699551569507
CHI  :  0.9533632286995516
SKB  :  0.9659192825112107
RFE  :  0.9695067264573991
SFM  :  0.9766816143497757
SVC()
0.9829596412556054
RFC  :  0.9632286995515695
PI  :  0.9192825112107623
CHI  :  0.9605381165919282
SKB  :  0.9730941704035875
RFE  :  0.9721973094170404
SFM  :  0.9757847533632287
MultinomialNB()
0.9668161434977578
RFC  :  0.8708520179372198
PI  :  0.8663677130044843
CHI  :  0.8663677130044843
SKB  :  0.8663677130044843
RFE  :  0.8780269058295964
SFM  :  0.8717488789237668
