In [1]:
import pandas as pd
import sqlite3
import regex as re
import matplotlib.pyplot as plt

from wordcloud import WordCloud

In [2]:
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
print("spam count: " +str(len(df.loc[df.spam==1])))
print("not spam count: " +str(len(df.loc[df.spam==0])))
print(df.shape)
df['spam'] = df['spam'].astype(int)

df = df.drop_duplicates()
print(df.shape)

df = df.reset_index(inplace = False)[['text','spam']]

spam count: 1368
not spam count: 4360
(5728, 2)
(5695, 2)


In [4]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
clean_desc = []
for w in range(len(df.text)):
    desc = df['text'][w].lower()
    
    #remove punctuation
    desc = re.sub('[^a-zA-Z]', ' ', desc)
    
    #remove tags
    desc=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",desc)
    
    #remove digits and special chars
    desc=re.sub("(\\d|\\W)+"," ",desc)
    
    clean_desc.append(desc)
#assign the cleaned descriptions to the data frame
df['text'] = clean_desc
df = df.reset_index()        
df.head(3)

Unnamed: 0,index,text,spam
0,0,subject naturally irresistible your corporate ...,1
1,1,subject the stock trading gunslinger fanny is ...,1
2,2,subject unbelievable new homes made easy im wa...,1


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
text_vec = CountVectorizer().fit_transform(df['text'])

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_vec, df['spam'], test_size = 0.45
                                                    , random_state = 42, shuffle = True)
#from sklearn.tree import DecisionTreeClassifier
#classifier = DecisionTreeClassifier(max_depth = 7)

#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#classifier = RandomForestClassifier(max_depth=6, n_estimators=8)
#classifier = AdaBoostClassifier()

from sklearn.svm import SVC
classifier = SVC(kernel="linear", C=0.025)

#from sklearn import ensemble 
#classifier = ensemble.GradientBoostingClassifier(
#    n_estimators = 100, #how many decision trees to build
#    learning_rate = 0.5, #controls rate at which additional decision trees influes overall prediction
#    max_depth = 6, 
#     min_samples_split = 21,
#     min_samples_leaf = 19, 
    #max_features = 0.9,
    #loss = 'huber'
#)

classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1926
           1       0.97      0.94      0.95       637

    accuracy                           0.98      2563
   macro avg       0.97      0.96      0.97      2563
weighted avg       0.98      0.98      0.98      2563



In [7]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2401
           1       1.00      0.99      1.00       731

    accuracy                           1.00      3132
   macro avg       1.00      1.00      1.00      3132
weighted avg       1.00      1.00      1.00      3132

Confusion Matrix: 
 [[2401    0]
 [   4  727]]

Accuracy:  0.9987228607918263


In [8]:
pred = classifier.predict(X_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n', confusion_matrix(y_test,pred))

print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1926
           1       0.97      0.94      0.95       637

    accuracy                           0.98      2563
   macro avg       0.97      0.96      0.97      2563
weighted avg       0.98      0.98      0.98      2563

Confusion Matrix: 
 [[1906   20]
 [  40  597]]

Accuracy:  0.9765899336714787
