In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import seaborn as sns
import matplotlib.pyplot as plt
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
data=pd.read_csv("/content/drive/MyDrive/oasis-dataset/spam.csv", encoding = "ISO-8859-1")

In [4]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
data=data.drop('Unnamed: 2',axis=1)
data=data.drop('Unnamed: 3',axis=1)
data=data.drop('Unnamed: 4',axis=1)

In [6]:
data['v1']= data['v1'].replace({0: 'ham', 1: 'spam'})

In [7]:
data.isnull().sum()

v1    0
v2    0
dtype: int64

In [8]:
data = data.rename(columns={'v2': 'message'})
data = data.rename(columns={'v1': 'Label'})

In [9]:
data

Unnamed: 0,Label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
data['message'] = data['message'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop_words))

In [11]:
data['Label'] = data['Label'].replace({'ham': 0, 'spam':1})
data
data['Label'].value_counts()

0    4825
1     747
Name: Label, dtype: int64

In [12]:
data

Unnamed: 0,Label,message
0,0,"go jurong point, crazy.. available bugis n gre..."
1,0,ok lar... joking wif u oni...
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor... u c already say...
4,0,"nah i think goes usf, lives around though"
...,...,...
5567,1,this 2nd time tried 2 contact u. u å£750 pound...
5568,0,will ì_ b going esplanade fr home?
5569,0,"pity, * mood that. so...any suggestions?"
5570,0,the guy bitching i acted like i'd interested b...


In [13]:
x_train, x_test, y_train, y_test = train_test_split(data.message, data.Label, test_size=0.2)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = CountVectorizer()
cv.fit(x_train)

tdf = TfidfVectorizer()
tdf.fit(x_train)

In [15]:
cv_trained = cv.fit_transform(x_train)
tf_trained = tdf.fit_transform(x_train)

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [17]:
cv_test = cv.transform(x_test)
tdf_test = tdf.transform(x_test)

In [18]:
model=[MultinomialNB(),LogisticRegression(),svm.SVC(),tree.DecisionTreeClassifier(),RandomForestClassifier()]
for i in model:
    curr_model=i
    i.fit(cv_trained, y_train)
    # print(i," ","Train case R2 score in %: ",r2_score(y_train,y_train_pred)*100)
    y_test_pred=i.predict(cv_test)
    print(i," ","Test case accuracy_score in %: ",metrics.accuracy_score(y_test,y_test_pred)*100)
    print(confusion_matrix(y_test,y_test_pred))
    print(classification_report(y_test,y_test_pred))
    i.fit(tf_trained, y_train)
    y_test_pred=i.predict(tdf_test)
    print(i," ","Test case accuracy_score in %: ",metrics.accuracy_score(y_test,y_test_pred)*100)
    print(confusion_matrix(y_test,y_test_pred))
    print(classification_report(y_test,y_test_pred))
    print("\n")

MultinomialNB()   Test case accuracy_score in %:  98.7443946188341
[[969   6]
 [  8 132]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       975
           1       0.96      0.94      0.95       140

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

MultinomialNB()   Test case accuracy_score in %:  97.30941704035875
[[975   0]
 [ 30 110]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       975
           1       1.00      0.79      0.88       140

    accuracy                           0.97      1115
   macro avg       0.99      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



LogisticRegression()   Test case accuracy_score in %:  98.65470852017937
[[973   2]
 [ 13 127]]
              precision    recall  f1-score   support

           0