In [2]:
import pandas as pd
import numpy as np

In [None]:
dataframe = pd.read_csv("main_data/spam.csv", encoding='latin-1')
dataframe.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df = dataframe.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.rename(columns={'v1':"label", 'v2':'text'}, inplace=True)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.shape

(5572, 2)

In [8]:
df.isnull().sum()

Unnamed: 0,0
label,0
text,0


In [9]:
df.duplicated().sum()

np.int64(403)

In [10]:
df.drop_duplicates(keep='first', inplace=True)
df.shape

(5169, 2)

In [11]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4516
spam,653


In [12]:
import re
from gensim.utils import simple_preprocess


In [13]:
def preprocess_text(text):
  review = re.sub(r'[^a-zA-Z0-9\s]+', '', text)
  review = review.lower()
  fnl_review = simple_preprocess(review)
  return fnl_review

In [14]:
df['processed_text'] = df['text'].apply(preprocess_text)

In [15]:
df.head()

Unnamed: 0,label,text,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, he, goes, to, usf, he, live..."


In [16]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [17]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

snow_stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

In [18]:
def preprocess_text(text):

  if not isinstance(text, str):
    return []

  review = re.sub(r'[^a-zA-Z0-9\s]', '', text)
  review = review.lower()
  review = word_tokenize(review)
  tokens = [snow_stemmer.stem(word) for word in review if word not in stop_words]
  return " ".join(tokens)

In [19]:
df['pre_processed_text'] = df['text'].apply(preprocess_text)

In [20]:
df.head()

Unnamed: 0,label,text,processed_text,pre_processed_text
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]",ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup...",free entri 2 wkli comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]",u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, he, goes, to, usf, he, live...",nah dont think goe usf live around though


In [21]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(
    sentences=df['pre_processed_text'].apply(lambda x: x.split()),
    vector_size=100,
    min_count=1,
    window=5,
    workers=4,
    sg=1,
    epochs=20
)

In [22]:
word2vec_model.wv['joke']

array([ 0.080337  ,  0.08431076, -0.02361621, -0.09059858,  0.17348906,
       -0.2792862 ,  0.18971288,  0.47106102, -0.2083538 , -0.13308397,
        0.12491082, -0.25253722,  0.02743793,  0.07642177,  0.10964472,
       -0.17264016,  0.07724196, -0.11868825, -0.09102071, -0.35803884,
       -0.19809061, -0.01724925,  0.17035949, -0.18944569,  0.02437369,
        0.0111752 , -0.4452615 , -0.31328285, -0.18572016,  0.01796582,
        0.39250547,  0.05736278,  0.21554537, -0.28052896,  0.03729115,
        0.43326584, -0.21970756, -0.1736843 , -0.23213077, -0.45015588,
       -0.16667114, -0.07775938, -0.03479343,  0.03872067,  0.07586601,
       -0.2292701 ,  0.10566537, -0.17190129, -0.02038435,  0.0961208 ,
        0.06522729, -0.21930028,  0.04478997,  0.22516966, -0.1805509 ,
        0.12197752,  0.19172373,  0.0173922 , -0.3680367 ,  0.31490126,
        0.4712034 ,  0.08283363,  0.09962101,  0.0348433 , -0.24262452,
        0.32064953,  0.14901116,  0.14386247, -0.30636665,  0.09

In [23]:
word2vec_model.wv.most_similar('joke')

[('avoid', 0.9121329188346863),
 ('expos', 0.9055027365684509),
 ('everybodi', 0.9044913649559021),
 ('twice', 0.9032043218612671),
 ('amus', 0.8984502553939819),
 ('throw', 0.8949748277664185),
 ('lie', 0.8939349055290222),
 ('none', 0.8933122754096985),
 ('nyc', 0.8924438953399658),
 ('bottl', 0.8921034336090088)]

In [24]:
len(word2vec_model.wv.index_to_key)

8006

In [25]:
def avg_word_vectors(tokens):
  """Compute the average Word2Vec embedding for a list of tokens."""

  word_vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
  if word_vectors:
    avg_vectors = np.mean(word_vectors, axis=0)
  else:
    avg_vectors = np.zeros(word2vec_model.vector_size)
  return avg_vectors

In [26]:
df['avg_word_vectors'] = df['pre_processed_text'].apply(avg_word_vectors)

In [27]:
df['avg_word_vectors'].shape

(5169,)

In [28]:
word2vec_model.wv.key_to_index

{'u': 0,
 'call': 1,
 'im': 2,
 '2': 3,
 'go': 4,
 'get': 5,
 'ur': 6,
 'come': 7,
 'dont': 8,
 '4': 9,
 'ltgt': 10,
 'know': 11,
 'ok': 12,
 'like': 13,
 'free': 14,
 'got': 15,
 'want': 16,
 'time': 17,
 'love': 18,
 'good': 19,
 'day': 20,
 'ill': 21,
 'text': 22,
 'need': 23,
 'send': 24,
 'one': 25,
 'see': 26,
 'think': 27,
 'lor': 28,
 'take': 29,
 'home': 30,
 'today': 31,
 'tell': 32,
 'still': 33,
 'back': 34,
 'stop': 35,
 'repli': 36,
 'make': 37,
 'txt': 38,
 'r': 39,
 'week': 40,
 'well': 41,
 'mobil': 42,
 'say': 43,
 'ask': 44,
 'phone': 45,
 'hi': 46,
 'new': 47,
 'n': 48,
 'da': 49,
 'pleas': 50,
 'sorri': 51,
 'hope': 52,
 'work': 53,
 'meet': 54,
 'miss': 55,
 'night': 56,
 'hey': 57,
 'thing': 58,
 'wait': 59,
 'much': 60,
 'give': 61,
 'cant': 62,
 'oh': 63,
 'tri': 64,
 'na': 65,
 'happi': 66,
 'wat': 67,
 'great': 68,
 'thank': 69,
 'number': 70,
 'later': 71,
 'that': 72,
 'claim': 73,
 'way': 74,
 'dear': 75,
 'wan': 76,
 'messag': 77,
 'friend': 78,
 'alreadi

In [29]:
df.head()

Unnamed: 0,label,text,processed_text,pre_processed_text,avg_word_vectors
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o...",go jurong point crazi avail bugi n great world...,"[0.041321594, 0.3557889, -0.13920489, 0.121806..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]",ok lar joke wif u oni,"[0.05544707, 0.32845622, 0.025683489, 0.137503..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup...",free entri 2 wkli comp win fa cup final tkts 2...,"[-0.044723656, 0.39788422, -0.11872493, 0.0706..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]",u dun say earli hor u c alreadi say,"[0.026679058, 0.30046755, -0.24037561, 0.13367..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, he, goes, to, usf, he, live...",nah dont think goe usf live around though,"[0.0493192, 0.26027393, 0.029878747, 0.1179631..."


In [30]:
avg_vectors = np.stack(df['avg_word_vectors'])
len(avg_vectors)

5169

In [31]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

target = le.fit_transform(df['label'])

In [32]:
target

array([0, 0, 1, ..., 0, 0, 0])

In [33]:
X = avg_vectors
y = target

In [34]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

X_train.shape, y_train.shape,X_test.shape,y_test.shape


((4135, 100), (4135,), (1034, 100), (1034,))

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

models = [LogisticRegression(), SVC(), KNeighborsClassifier(), DecisionTreeClassifier(),
            RandomForestClassifier(), GradientBoostingClassifier()]

In [36]:
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score

In [37]:
for model in models:

  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)
  # y_pred_prob = model.predict_proba(X_test)

  print("-"*20)
  print(model.__class__.__name__)
  print(accuracy_score(y_test, y_pred))
  # print(y_pred_prob)
  print(classification_report(y_test, y_pred))
  print("="*20)


--------------------
LogisticRegression
0.9342359767891683
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       889
           1       0.83      0.66      0.74       145

    accuracy                           0.93      1034
   macro avg       0.89      0.82      0.85      1034
weighted avg       0.93      0.93      0.93      1034

--------------------
SVC
0.9545454545454546
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       889
           1       0.90      0.76      0.82       145

    accuracy                           0.95      1034
   macro avg       0.93      0.87      0.90      1034
weighted avg       0.95      0.95      0.95      1034

--------------------
KNeighborsClassifier
0.9555125725338491
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       889
           1       0.86      0.81      0.84       145

    accuracy      

In [38]:
model = RandomForestClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

print(classification_report(y_test, y_pred))
print(y_pred_prob)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       889
           1       0.96      0.75      0.84       145

    accuracy                           0.96      1034
   macro avg       0.96      0.87      0.91      1034
weighted avg       0.96      0.96      0.96      1034

[[0.95 0.05]
 [1.   0.  ]
 [1.   0.  ]
 ...
 [0.15 0.85]
 [1.   0.  ]
 [1.   0.  ]]


In [39]:
print(roc_auc_score(y_test, y_pred_prob[:,1]))

0.9540669485279858


In [40]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [41]:
RandomForestClassifier()

In [42]:
rf_params = {
    'n_estimators':[100,150,200,300],
    'criterion':["gini", "entropy", "log_loss"],
    'max_depth':[2,3,5,7],
    'class_weight':['balanced']
}

In [43]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

rscv = RandomizedSearchCV(
    estimator=model,
    param_distributions=rf_params,
    cv=5,
    verbose=2,
    random_state=42,
    return_train_score=True
)

In [44]:
rscv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END class_weight=balanced, criterion=entropy, max_depth=5, n_estimators=300; total time=   6.2s
[CV] END class_weight=balanced, criterion=entropy, max_depth=5, n_estimators=300; total time=   6.7s
[CV] END class_weight=balanced, criterion=entropy, max_depth=5, n_estimators=300; total time=   6.2s
[CV] END class_weight=balanced, criterion=entropy, max_depth=5, n_estimators=300; total time=   7.0s
[CV] END class_weight=balanced, criterion=entropy, max_depth=5, n_estimators=300; total time=   6.2s
[CV] END class_weight=balanced, criterion=log_loss, max_depth=5, n_estimators=100; total time=   2.4s
[CV] END class_weight=balanced, criterion=log_loss, max_depth=5, n_estimators=100; total time=   2.4s
[CV] END class_weight=balanced, criterion=log_loss, max_depth=5, n_estimators=100; total time=   2.1s
[CV] END class_weight=balanced, criterion=log_loss, max_depth=5, n_estimators=100; total time=   2.0s
[CV] END class_weight=bala

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       889
           1       0.96      0.75      0.84       145

    accuracy                           0.96      1034
   macro avg       0.96      0.87      0.91      1034
weighted avg       0.96      0.96      0.96      1034



In [46]:
rscv_y_pred = rscv.predict(X_test)

print(classification_report(y_test, rscv_y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       889
           1       0.90      0.78      0.84       145

    accuracy                           0.96      1034
   macro avg       0.93      0.88      0.91      1034
weighted avg       0.96      0.96      0.96      1034



In [47]:
rscv.best_params_

{'n_estimators': 100,
 'max_depth': 7,
 'criterion': 'gini',
 'class_weight': 'balanced'}