In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
df = pd.read_csv("../inputs/spam_data.csv",encoding='ISO-8859-1')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df = df[['v1','v2']]

In [None]:
df.head()

In [None]:
df['v1'].value_counts()

In [None]:
df.isna().sum()

In [None]:
df['v1'] = df['v1'].apply(lambda x: 1 if x=='spam' else 0)

In [None]:
df['v1'].value_counts().plot(kind='bar')

In [None]:
spam_words = ' '.join(list(df[df['v1']==1]['v2']))
spam_wc = WordCloud(width=500,height=500).generate(spam_words)
plt.figure(figsize=(10,10))
plt.imshow(spam_wc)
plt.show()

In [None]:
not_spam_words = ' '.join(list(df[df['v1']==0]['v2']))
spam_wc = WordCloud(width=500,height=500).generate(not_spam_words)
plt.figure(figsize=(10,10))
plt.imshow(spam_wc)
plt.show()

## NO UPSAMPLING OR DOWNSAMPLING, PREPROCESSING + BOW + NB + CV (GRID, RANDOM, BAYESIAN OPTIMISATION) 

In [None]:
import pandas as pd
import re
import string
import time

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn import naive_bayes
from sklearn.model_selection import GridSearchCV

In [None]:
df = pd.read_csv("../inputs/spam_data.csv",encoding='ISO-8859-1')
df = df[['v1','v2']]
df['v1'] = df['v1'].apply(lambda x: 1 if x=='spam' else 0)
df.columns = ['label','document']

In [None]:
df = df.sample(frac=1).reset_index(drop=True)
df_train = df.loc[:4500,:].reset_index(drop=True)
df_test = df.loc[4500:,:].reset_index(drop=True)

PREPROCESSING STEPS:<BR> 1) REMOVE ANYTHING THAT IS NOT A-Z AND 0-9<BR>
                     2) LOWER CASE<BR>
                     3) STOP WORDS REMOVAL<BR>
                     4) STEMMING

In [None]:
stop = list(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def clean_text(sentence):
    final_string = ''
    sentence = re.sub(f'[{re.escape(string.punctuation)}]','',' '.join(word_tokenize(sentence))).lower()
    sentence = ' '.join(word_tokenize(sentence))
    for word in sentence.split():
        word = stemmer.stem(word)
        if word not in stop:
            final_string = final_string + word +' '
    return final_string.rstrip().lstrip()

In [None]:
df_train['document'] = df_train['document'].apply(lambda x: clean_text(x))

In [None]:
df_train['kfold'] = -1
kf = StratifiedKFold(n_splits=5)
for fold_,(t_,v_) in enumerate(kf.split(X=df_train,y=df_train['label'])):
    df_train.loc[v_,'kfold'] = fold_

In [None]:
for fold in range(5):
    train = df_train[df_train['kfold']!=fold].reset_index(drop=True)
    valid = df_train[df_train['kfold']==fold].reset_index(drop=True)
    
    bow = CountVectorizer(binary=True)
    bow.fit(train['document'])
    
    x_train = bow.transform(train['document'])
    y_train = train['label']
    
    x_valid = bow.transform(valid['document'])
    
    NB = naive_bayes.MultinomialNB()
    NB.fit(x_train,y_train)
    
    preds = NB.predict(x_valid)
    
    auc = metrics.roc_auc_score(valid['label'],preds)
    
    print(f'For fold = {fold}, AUC = {auc}')
    

In [None]:
bow = CountVectorizer(binary=True)
bow.fit(df_train['document'])
x_train = bow.transform(df_train['document'])
y_train = df_train['label']

df_test['document'] = df_test['document'].apply(lambda x: clean_text(x))

x_test = bow.transform(df_test['document'])
y_test = df_test['label']

NB = naive_bayes.MultinomialNB()
NB.fit(x_train,y_train)

preds = NB.predict(x_test)

auc = metrics.roc_auc_score(y_test,preds)

print(f'AUC = {auc}')


In [None]:
# GRID SEARCH
X = df_train['document']
X = X.apply(clean_text)
bow = CountVectorizer(binary=True)
bow.fit(X)
X = bow.transform(X)
y = df_train['label']

classifier = naive_bayes.MultinomialNB()
param_grid = {'alpha':[0.0001,0.001,0.01,0.1,1,10,100,1000]}
model = GridSearchCV(estimator = classifier, param_grid=param_grid,verbose=10,scoring='roc_auc',cv=5)
model.fit(X,y)

In [None]:
model.best_params_

In [None]:
X_test = df_test['document']
X_test = X_test.apply(clean_text)
X_test = bow.transform(X_test)
y_test = df_test['label']

NB = naive_bayes.MultinomialNB(alpha=0.1)
NB.fit(X,y)

preds = NB.predict(X_test)

print(metrics.roc_auc_score(y_test,preds))
print(metrics.precision_score(y_test,preds))

In [None]:
preds = NB.predict(X)

print(metrics.roc_auc_score(y,preds))
print(metrics.precision_score(y,preds))

In [None]:
X_test = df_test['document']
X_test = X_test.apply(clean_text)
X_test = bow.transform(X_test)
y_test = df_test['label']

train_auc = []
test_auc = []
for alpha in [0.0001,0.001,0.01,0.1,1,10,100,1000]:
    NB = naive_bayes.MultinomialNB(alpha=alpha)
    NB.fit(X,y)

    preds_test = NB.predict(X_test)
    preds_train = NB.predict(X)
    auc_test = metrics.roc_auc_score(y_test,preds_test)
    auc_train = metrics.roc_auc_score(y,preds_train)
    
    train_auc.append(auc_train)
    test_auc.append(auc_test)
    #print('ALPHA: '+ str(alpha) + ' TRAINING AUC: '+ str(round(auc_train,2))+' TEST AUC: '+ str(round(auc_test,2)))


In [None]:
alpha = [0.0001,0.001,0.01,0.1,1,10,100,1000]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
temp_df = pd.DataFrame({'alpha':alpha,'train_auc':train_auc,'test_auc':test_auc})

In [None]:
fig,ax = plt.subplots(figsize=(7,7))
temp_df.plot(x = 'alpha',y='train_auc',ax=ax)
temp_df.plot(x = 'alpha',y='test_auc',ax=ax)
#plt.xlim(0,1)

In [None]:
bow.vocabulary_

## THIS TIME WE WILL OVERSAMPLE THE DATA 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import re
import string

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from sklearn import metrics
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("../inputs/spam_data.csv",encoding='ISO-8859-1')
df = df[['v1','v2']]
df['v1'] = df['v1'].apply(lambda x: 1 if x=='spam' else 0)
df.columns = ['label','document']

In [3]:
df.shape

(5572, 2)

In [4]:
df.head()

Unnamed: 0,label,document
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df = df.sample(frac=1).reset_index(drop=True)
df_train = df.loc[:4500,:].reset_index(drop=True)
df_test = df.loc[4500:,:].reset_index(drop=True)

In [6]:
df_train['label'].value_counts(normalize=True)

0    0.862253
1    0.137747
Name: label, dtype: float64

In [7]:
df_train_spam = df_train[df_train['label']==1]

In [8]:
df_train_spam.shape

(620, 2)

In [9]:
df_train_not_spam = df_train[df_train['label']!=1]

In [10]:
df_train_not_spam.shape

(3881, 2)

In [11]:
df_train_spam = df_train_spam.sample(df_train_not_spam.shape[0],replace=True) 

In [12]:
df_train_spam.shape

(3881, 2)

In [13]:
df_train = pd.concat([df_train_spam,df_train_not_spam]).reset_index(drop=True)

In [14]:
df_train = df_train.sample(frac=1).reset_index(drop=True)

In [15]:
stop = list(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def clean_text(sentence):
    final_string = ''
    sentence = re.sub(f'[{re.escape(string.punctuation)}]','',' '.join(word_tokenize(sentence))).lower()
    sentence = ' '.join(word_tokenize(sentence))
    for word in sentence.split():
        word = stemmer.stem(word)
        if word not in stop:
            final_string = final_string + word +' '
    return final_string.rstrip().lstrip()

In [16]:
df_train['document'] = df_train['document'].apply(clean_text)

In [17]:
df_train.head()

Unnamed: 0,label,document
0,1,urgent call 09061749602 landlin complimentari ...
1,1,hot live fantasi call 08707500020 20p per min ...
2,0,ãã come lt 25 n pass lar
3,0,wonder okor great month cherish guy wish well ...
4,1,hot live fantasi call 08707500020 20p per min ...


In [18]:
bow = CountVectorizer()
bow.fit(df_train['document'])
X = bow.transform(df_train['document'])
y = df_train['label']

In [19]:
NB = naive_bayes.MultinomialNB()
params_grid = {'alpha':[0.01,0.1,1,10,100,1000]}
model = GridSearchCV(estimator=NB,param_grid=params_grid,cv=5,scoring='roc_auc',verbose=10)
model.fit(X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START alpha=0.01..................................................
[CV 1/5; 1/6] END ...................alpha=0.01;, score=1.000 total time=   0.0s
[CV 2/5; 1/6] START alpha=0.01..................................................
[CV 2/5; 1/6] END ...................alpha=0.01;, score=0.999 total time=   0.0s
[CV 3/5; 1/6] START alpha=0.01..................................................
[CV 3/5; 1/6] END ...................alpha=0.01;, score=0.998 total time=   0.0s
[CV 4/5; 1/6] START alpha=0.01..................................................
[CV 4/5; 1/6] END ...................alpha=0.01;, score=0.999 total time=   0.0s
[CV 5/5; 1/6] START alpha=0.01..................................................
[CV 5/5; 1/6] END ...................alpha=0.01;, score=0.998 total time=   0.0s
[CV 1/5; 2/6] START alpha=0.1...................................................
[CV 1/5; 2/6] END ....................alpha=0.1;,

GridSearchCV(cv=5, estimator=MultinomialNB(),
             param_grid={'alpha': [0.01, 0.1, 1, 10, 100, 1000]},
             scoring='roc_auc', verbose=10)

In [20]:
model.best_score_

0.9988055805813405

In [21]:
model.best_params_

{'alpha': 0.01}

In [22]:
df_test['document'] = df_test['document'].apply(clean_text)

In [25]:
X_test = bow.transform(df_test['document'])
y_test = df_test['label']

NB = naive_bayes.MultinomialNB(alpha=0.01)
NB.fit(X,y)

train_auc = round(metrics.roc_auc_score(y,NB.predict(X)),2)
test_auc = round(metrics.roc_auc_score(y_test,NB.predict(X_test)),2)

print(f'For Alpha = 0.01, train auc = {train_auc} and test auc = {test_auc}')

For Alpha = 0.0001, train auc = 0.99 and test auc = 0.94


In [39]:
NB.coef_[0]

array([ -8.75283438,  -9.26299423,  -9.44498306, ...,  -9.26299423,
        -9.10908128, -15.66158916])

In [40]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [45]:
X.toarray().shape

(7762, 6829)

In [49]:
y

0       1
1       1
2       0
3       0
4       1
       ..
7757    1
7758    1
7759    1
7760    1
7761    1
Name: label, Length: 7762, dtype: int64

In [48]:
chi2_features = SelectKBest(chi2, k = 500)

In [51]:
chi2_features.fit_transform(X,y).toarray().shape

(7762, 500)

In [69]:
chi_scores = chi2(X,y)

In [70]:
chi_scores

(array([10.,  6.,  5., ...,  6.,  7.,  1.]),
 array([0.0015654 , 0.01430588, 0.02534732, ..., 0.01430588, 0.00815097,
        0.31731051]))

In [71]:
chi_values = chi_scores[0]
chi_values 

array([10.,  6.,  5., ...,  6.,  7.,  1.])

In [72]:
len(chi_values)

6829

In [73]:
len(bow.get_feature_names())

6829

In [74]:
chi_values = pd.Series(chi_values,index=bow.get_feature_names()).sort_values(ascending=False)
chi_values

call     1350.955699
free     1073.898894
txt       799.574140
mobil     740.417526
claim     627.000000
            ...     
havin       0.000000
door        0.000000
child       0.000000
eye         0.000000
darl        0.000000
Length: 6829, dtype: float64

In [75]:
chi_values[:500]

call         1350.955699
free         1073.898894
txt           799.574140
mobil         740.417526
claim         627.000000
                ...     
liverpool      24.000000
8552           24.000000
juz            24.000000
87021          24.000000
k52            24.000000
Length: 500, dtype: float64

In [78]:
print(list(chi_values.index))

['call', 'free', 'txt', 'mobil', 'claim', 'text', 'repli', 'stop', 'prize', 'tone', 'ur', 'servic', 'nokia', 'award', 'cash', 'urgent', 'week', '16', 'contact', 'collect', 'guarante', 'gt', 'lt', 'win', '1000', 'new', 'offer', '500', '150', 'cs', 'min', '150ppm', 'per', 'custom', '2000', 'voucher', '18', 'draw', 'chat', 'landlin', '100', 'latest', 'come', 'box', 'rington', 'po', 'holiday', 'phone', 'receiv', 'camera', 'code', 'show', 'appli', 'mob', 'ok', '150p', 'rate', 'video', '5000', 'onli', '1st', 'everi', 'await', 'attempt', 'poli', 'entri', 'got', 'select', 'uk', 'sae', 'lor', 'nt', '8007', 'orang', 'pleas', 'pic', 'network', '800', 'line', 'live', 'privat', 'expir', 'valid', 'wk', 'date', 'home', 'bonus', 'later', 'say', 'da', 'unredeem', 'identifi', 'chanc', 'cost', 'colour', 'love', 'statement', 'music', '750', '08000930705', 'camcord', 'messag', 'pobox', 'send', 'oper', 'doubl', 'tri', '2003', '250', '0800', 'pound', '08000839402', 'game', 'deliveri', 'mobileupd8', '2nd', '1

In [80]:
p_values = chi_scores[1]
p_values

array([0.0015654 , 0.01430588, 0.02534732, ..., 0.01430588, 0.00815097,
       0.31731051])

In [82]:
p_values = pd.Series(p_values, index = bow.get_feature_names())

In [84]:
p_values[p_values < 0.05]

008704050406    1.565402e-03
0089            1.430588e-02
0121            2.534732e-02
01223585334     5.320055e-04
02              1.565402e-03
                    ...     
zebra           4.677735e-03
zed             4.238055e-10
zouk            8.150972e-03
âªv             1.430588e-02
â¼120           8.150972e-03
Length: 2767, dtype: float64

In [99]:
# Feature selection using the model
NB.feature_log_prob_[0] # This gives array of prob of words to classify result as 0

array([-14.92315824, -14.92315824, -14.92315824, ..., -14.92315824,
       -14.92315824, -10.30803772])

In [100]:
NB.feature_log_prob_[1] # This gives array of prob of words to classify result as 1

array([ -8.75283438,  -9.26299423,  -9.44498306, ...,  -9.26299423,
        -9.10908128, -15.66158916])

In [105]:
top_20_pos_words = pd.Series(NB.feature_log_prob_[1],index=bow.get_feature_names()).sort_values(ascending=False)[:20]

In [106]:
top_20_pos_words

call     -3.486486
free     -3.953089
txt      -4.332575
ur       -4.366807
text     -4.379323
mobil    -4.413919
claim    -4.615456
repli    -4.679675
stop     -4.748302
get      -4.874313
prize    -4.949374
week     -4.958322
tone     -5.020914
servic   -5.108358
onli     -5.118856
cash     -5.129466
new      -5.159238
nokia    -5.159238
award    -5.221579
urgent   -5.230389
dtype: float64

In [107]:
top_20_neg_words = pd.Series(NB.feature_log_prob_[0],index=bow.get_feature_names()).sort_values(ascending=False)[:20]

In [111]:
top_20_neg_words.index

Index(['go', 'nt', 'get', 'gt', 'lt', 'come', 'call', 'ok', 'got', 'know',
       'like', 'ur', 'love', 'day', 'good', 'time', 'want', 'one', 'need',
       'home'],
      dtype='object')