In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('spam.csv')

In [3]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.shape

(5572, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
df.isnull().sum()

label    0
text     0
dtype: int64

In [7]:
df.duplicated().sum()

415

In [8]:
df = df.drop_duplicates(keep='first')

In [9]:
df.duplicated().sum()

0

In [10]:
df['label'].value_counts()

label
ham     4516
spam     641
Name: count, dtype: int64

In [11]:
label_map = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_map)

In [12]:
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
import re

In [14]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [15]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

In [16]:
def text_preprocessing(col):
    # Lower case
    col = col.str.lower()

    # Removing stopwords and words with length=1
    col = col.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words and len(word) > 1))
    
    # Removing numbers
    col = col.apply(lambda x: re.sub(r'[0-9]', '', x)) 
    
    # Removing extra spaces
    col = col.apply(lambda x: re.sub(r'\s+', ' ', x).strip()) 

    # Stemming
    col = col.apply(lambda x: ' '.join(stemmer.stem(word) for word in str(x).split()))

    return col

In [17]:
df['text'] = text_preprocessing(df['text'])

In [18]:
df.head()

Unnamed: 0,label,text
0,0,"go jurong point, crazy.. avail bugi great worl..."
1,0,ok lar... joke wif oni...
2,1,free entri wkli comp win fa cup final tkts st ...
3,0,dun say earli hor... alreadi say...
4,0,"nah think goe usf, live around though"


In [19]:
X = df['text'].values
y = df['label'].values

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4000)

In [21]:
X = vectorizer.fit_transform(X).toarray()

In [22]:
X.shape

(5157, 4000)

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=64)

In [24]:
#X_train_vectorized = vectorizer.fit_transform(X_train)
#X_test_vectorized = vectorizer.transform(X_test)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [27]:
mnb = MultinomialNB(alpha=.1)

mnb.fit(X_train, y_train)

y_pred = mnb.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9844961240310077
precision 0.9444444444444444
recall 0.9296875


In [28]:
bnb = BernoulliNB(alpha=.1)

bnb.fit(X_train, y_train)

y_pred = bnb.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9864341085271318
precision 0.9523809523809523
recall 0.9375


In [29]:
gnb = GaussianNB(var_smoothing=.01)

gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9166666666666666
precision 0.6029411764705882
recall 0.9609375


In [30]:
svc = SVC(kernel='sigmoid', C=5, random_state=360)

svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9806201550387597
precision 0.921875
recall 0.921875


In [31]:
lrc = LogisticRegression(solver='liblinear', penalty='l2', C=10, random_state=360)

lrc.fit(X_train, y_train)

y_pred = lrc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9825581395348837
precision 0.9824561403508771
recall 0.875


In [32]:
dtc = DecisionTreeClassifier(max_depth=25)

dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9631782945736435
precision 0.8409090909090909
recall 0.8671875


In [33]:
rfc = RandomForestClassifier(n_estimators=50, random_state=360)

rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9815891472868217
precision 1.0
recall 0.8515625


In [34]:
etc = ExtraTreesClassifier(n_estimators=50, random_state=64)

etc.fit(X_train, y_train)

y_pred = etc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9844961240310077
precision 0.9745762711864406
recall 0.8984375


In [35]:
abc = AdaBoostClassifier(n_estimators=50, random_state=360)

abc.fit(X_train, y_train)

y_pred = abc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))



accuracy 0.9680232558139535
precision 0.8925619834710744
recall 0.84375


In [36]:
gbc = GradientBoostingClassifier(n_estimators=50, random_state=360)

gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9651162790697675
precision 0.96
recall 0.75


In [37]:
xgb = XGBClassifier(n_estimators=60)

xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9815891472868217
precision 0.9658119658119658
recall 0.8828125


In [38]:
bc = BaggingClassifier(n_estimators=60, random_state=40)

bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9602713178294574
precision 0.8372093023255814
recall 0.84375


In [39]:
from sklearn.ensemble import VotingClassifier

svc = SVC(kernel='sigmoid', C=5, probability=True)

voting_clf = VotingClassifier(estimators=[
    ('bnb', bnb),
    ('gnb', gnb),
    ('svc', svc),
    ('lrc', lrc),
    ('etc', etc),
    ('xgb', xgb)
], voting='soft')

voting_clf.fit(X_train, y_train)

y_pred = voting_clf.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9912790697674418
precision 1.0
recall 0.9296875


In [40]:
from sklearn.ensemble import StackingClassifier

svc = SVC(kernel='sigmoid', C=5, probability=True)

stacking_clf = StackingClassifier(estimators=[
    ('bnb', bnb),
    ('gnb', gnb),
    ('svc', svc),
    ('lrc', lrc),
    ('etc', etc),
    ('xgb', xgb)
], final_estimator=RandomForestClassifier(n_estimators=50, random_state=2))

stacking_clf.fit(X_train, y_train)

y_pred = stacking_clf.predict(X_test)

print('accuracy', accuracy_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))

accuracy 0.9922480620155039
precision 0.9838709677419355
recall 0.953125
