# Spam Message Classification

# 1) Data Preprocessing

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
df = pd.read_csv('final.csv')

In [7]:
df.head()

Unnamed: 0,Command,Response,Label
0,Start the bike,The bike engine starts.,1
1,Stop the bike,The bike comes to a stop.,1
2,Accelerate,The bike accelerates smoothly.,1
3,Decelerate,The bike slows down.,1
4,Turn left indicator,The bike turns left indicator on for 10 sec.,1


In [8]:
df.isna().sum()

Command     0
Response    0
Label       0
dtype: int64

In [9]:
df.tail()

Unnamed: 0,Command,Response,Label
105,Mindful breathing exercises help alleviate str...,How do you manage stress in your daily life?,0
106,I'm reading a thought-provoking philosophy book.,Philosophical discussions can be enriching.,0
107,I'm fascinated by ancient architecture.,The history behind architectural marvels is ca...,0
108,Participating in a local charity run is on my ...,Supporting causes through events is impactful.,0
109,I've started a collection of vintage vinyl rec...,Music on vinyl has a timeless quality.,0


In [10]:
df.describe()

Unnamed: 0,Label
count,110.0
mean,0.427273
std,0.496946
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [14]:
df['Label'].value_counts()/ (len(df))

0    0.572727
1    0.427273
Name: Label, dtype: float64

In [15]:
df['Label'].value_counts()

0    63
1    47
Name: Label, dtype: int64

In [16]:
ham = df[df['Label'] == 1]
spam = df[df['Label'] == 0]

In [17]:
ham.shape, spam.shape

((47, 3), (63, 3))

In [19]:
spam = spam.sample(ham.shape[0])

In [20]:
ham.shape, spam.shape

((47, 3), (47, 3))

In [21]:
data = ham.append(spam, ignore_index=True)

  data = ham.append(spam, ignore_index=True)


In [22]:
data.shape

(94, 3)

In [23]:
data['Label'].value_counts()

1    47
0    47
Name: Label, dtype: int64

In [24]:
data.head()

Unnamed: 0,Command,Response,Label
0,Start the bike,The bike engine starts.,1
1,Stop the bike,The bike comes to a stop.,1
2,Accelerate,The bike accelerates smoothly.,1
3,Decelerate,The bike slows down.,1
4,Turn left indicator,The bike turns left indicator on for 10 sec.,1


In [29]:
# plt.hist(data[data['Label'] == 1]['length'], bins = 45, alpha = 0.7)
# plt.hist(data[data['Label'] == 0]['length'], bins = 45, alpha = 0.7)
# plt.show()

In [30]:
# plt.hist(data[data['label'] == 'ham']['punct'], bins = 100, alpha = 0.7)
# plt.hist(data[data['label'] == 'spam']['punct'], bins = 100, alpha = 0.7)
# plt.show()

In [31]:
data

Unnamed: 0,Command,Response,Label
0,Start the bike,The bike engine starts.,1
1,Stop the bike,The bike comes to a stop.,1
2,Accelerate,The bike accelerates smoothly.,1
3,Decelerate,The bike slows down.,1
4,Turn left indicator,The bike turns left indicator on for 10 sec.,1
...,...,...,...
89,I'm fascinated by ancient history.,Learning about the past helps us understand th...,0
90,I'm attending a photography workshop next week...,Photography is a wonderful hobby.,0
91,I find astronomy documentaries fascinating.,The vastness of the cosmos is awe-inspiring.,0
92,I'm participating in a community gardening pro...,Growing our own food fosters a sense of commun...,0


In [32]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test =  train_test_split(data['Command'], data['Label'], test_size = 0.3, random_state =0, shuffle = True)

In [35]:
1494 * 0.3

448.2

In [36]:
X_train.shape

(65,)

In [37]:
X_test.shape

(29,)

# 2) Building the Model (Random Forest)

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

In [39]:
classifier = Pipeline([("tfidf", TfidfVectorizer()) , ("classifier", RandomForestClassifier(n_estimators=100))])

In [40]:
classifier.fit(X_train, y_train)

# 3) Predicting the results (Random Forest)

In [41]:
y_pred = classifier.predict(X_test)

In [42]:
y_test, y_pred

(2     1
 30    1
 56    0
 16    1
 13    1
 61    0
 62    0
 79    0
 33    1
 78    0
 76    0
 7     1
 51    0
 89    0
 93    0
 60    0
 8     1
 22    1
 73    0
 45    1
 26    1
 43    1
 24    1
 6     1
 42    1
 48    0
 74    0
 82    0
 77    0
 Name: Label, dtype: int64,
 array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 1]))

In [43]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [44]:
accuracy_score(y_test, y_pred)

0.6551724137931034

In [45]:
0.9465478841870824 * 449

425.0

In [46]:
confusion_matrix(y_test, y_pred)

array([[ 5, 10],
       [ 0, 14]])

In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50        15
           1       0.58      1.00      0.74        14

    accuracy                           0.66        29
   macro avg       0.79      0.67      0.62        29
weighted avg       0.80      0.66      0.61        29



# 4) Building the Model (SVM)

In [48]:
from sklearn.svm import SVC

In [49]:
svm = Pipeline([("tfidf", TfidfVectorizer()) , ("classifier", SVC(C = 100, gamma='auto'))])

In [50]:
svm.fit(X_train, y_train)

# 5) Predicting the results (SVM)

In [51]:
y_pred = svm.predict(X_test)

In [52]:
accuracy_score(y_test, y_pred)

0.8620689655172413

In [53]:
confusion_matrix(y_test, y_pred)

array([[15,  0],
       [ 4, 10]])

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88        15
           1       1.00      0.71      0.83        14

    accuracy                           0.86        29
   macro avg       0.89      0.86      0.86        29
weighted avg       0.89      0.86      0.86        29



In [65]:
test1 = ['Start the motor bike now']
test2 = ['On the right indicator']
test3 = ['Congratulations, You won a lottery ticket worth $1 Million ! To claim call on 446677']

In [66]:
# print(classifier.predict(test1))
# print(classifier.predict(test2))
# print(classifier.predict(test3))

In [67]:
print(svm.predict(test1))
print(svm.predict(test2))
print(svm.predict(test3))

[1]
[1]
[0]
