# Spam Message Classification

# 1) Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('merged_file_final.csv')

In [3]:
df.head()

Unnamed: 0,Command,Response,Label
0,Start the bike,The bike engine starts.,1
1,Stop the bike,The bike comes to a stop.,1
2,Accelerate,The bike accelerates smoothly.,1
3,Decelerate,The bike slows down.,1
4,Turn left,The bike turns left.,1


In [4]:
df.isna().sum()

Command     0
Response    0
Label       0
dtype: int64

In [5]:
df.tail()

Unnamed: 0,Command,Response,Label
661,Experiment with DIY social research projects,Conduct social research projects to gain insig...,0
662,Practice daily gratitude for social connections,Express gratitude for the diverse social conne...,0
663,Start a virtual sociology podcast,Share and discuss sociological topics through ...,0
664,Learn about the sociology of technology,Explore the impact of technology on social str...,0
665,Experiment with community-based art projects,Engage in art projects that highlight and cele...,0


In [6]:
df.describe()

Unnamed: 0,Label
count,666.0
mean,0.5
std,0.500376
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [7]:
df['Label'].value_counts()/ (len(df))

Label
1    0.5
0    0.5
Name: count, dtype: float64

In [8]:
df['Label'].value_counts()

Label
1    333
0    333
Name: count, dtype: int64

In [9]:
ham = df[df['Label'] == 1]
spam = df[df['Label'] == 0]

In [10]:
ham.shape, spam.shape

((333, 3), (333, 3))

In [11]:
spam = spam.sample(ham.shape[0])

In [12]:
ham.shape, spam.shape

((333, 3), (333, 3))

In [13]:
#data = ham.append(spam, ignore_index=True)
data = pd.concat([ham, spam], ignore_index=True)

In [14]:
data.shape

(666, 3)

In [15]:
data['Label'].value_counts()

Label
1    333
0    333
Name: count, dtype: int64

In [16]:
data.head()

Unnamed: 0,Command,Response,Label
0,Start the bike,The bike engine starts.,1
1,Stop the bike,The bike comes to a stop.,1
2,Accelerate,The bike accelerates smoothly.,1
3,Decelerate,The bike slows down.,1
4,Turn left,The bike turns left.,1


In [17]:
# plt.hist(data[data['Label'] == 1]['length'], bins = 45, alpha = 0.7)
# plt.hist(data[data['Label'] == 0]['length'], bins = 45, alpha = 0.7)
# plt.show()

In [18]:
# plt.hist(data[data['label'] == 'ham']['punct'], bins = 100, alpha = 0.7)
# plt.hist(data[data['label'] == 'spam']['punct'], bins = 100, alpha = 0.7)
# plt.show()

In [19]:
data

Unnamed: 0,Command,Response,Label
0,Start the bike,The bike engine starts.,1
1,Stop the bike,The bike comes to a stop.,1
2,Accelerate,The bike accelerates smoothly.,1
3,Decelerate,The bike slows down.,1
4,Turn left,The bike turns left.,1
...,...,...,...
661,Learn the art of shadow puppetry,Mastering the techniques of creating and perfo...,0
662,Start a virtual fitness challenge group,Initiating a virtual fitness challenge group t...,0
663,Plan a virtual vacation to Antarctica,Embarking on a virtual journey to the icy land...,0
664,Learn the basics of behavioral economics,Exploring behavioral economics principles to u...,0


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test =  train_test_split(data['Command'], data['Label'], test_size = 0.3, random_state =0, shuffle = True)

In [22]:
1494 * 0.3

448.2

In [23]:
X_train.shape

(466,)

In [24]:
X_test.shape

(200,)

# 2) Building the Model (Random Forest)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

In [26]:
classifier = Pipeline([("tfidf", TfidfVectorizer()) , ("classifier", RandomForestClassifier(n_estimators=100))])

In [27]:
classifier.fit(X_train, y_train)

# 3) Predicting the results (Random Forest)

In [28]:
y_pred = classifier.predict(X_test)

In [29]:
y_test, y_pred

(578    0
 310    1
 14     1
 540    0
 76     1
       ..
 504    0
 354    0
 194    1
 319    1
 609    0
 Name: Label, Length: 200, dtype: int64,
 array([0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
        1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
        0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
        0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
        1, 0], dtype=int64))

In [30]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [31]:
accuracy_score(y_test, y_pred)

0.985

In [32]:
0.9465478841870824 * 449

425.0

In [33]:
confusion_matrix(y_test, y_pred)

array([[100,   0],
       [  3,  97]], dtype=int64)

In [34]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       100
           1       1.00      0.97      0.98       100

    accuracy                           0.98       200
   macro avg       0.99      0.98      0.98       200
weighted avg       0.99      0.98      0.98       200



# 4) Building the Model (SVM)

In [35]:
from sklearn.svm import SVC

In [36]:
svm = Pipeline([("tfidf", TfidfVectorizer()) , ("classifier", SVC(C = 100, gamma='auto'))])

In [37]:
svm.fit(X_train, y_train)

# 5) Predicting the results (SVM)

In [38]:
y_pred = svm.predict(X_test)

In [39]:
accuracy_score(y_test, y_pred)

0.985

In [40]:
confusion_matrix(y_test, y_pred)

array([[100,   0],
       [  3,  97]], dtype=int64)

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       100
           1       1.00      0.97      0.98       100

    accuracy                           0.98       200
   macro avg       0.99      0.98      0.98       200
weighted avg       0.99      0.98      0.98       200



In [42]:
test1 = ['Start the motor bike now']
test2 = ['On the right indicator']
test3 = ['Congratulations, You won a lottery ticket worth $1 Million ! To claim call on 446677']

In [43]:
# print(classifier.predict(test1))
# print(classifier.predict(test2))
# print(classifier.predict(test3))

In [44]:
if (svm.predict(test1) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is a bike command.


In [45]:
if (svm.predict(test2) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is not a bike command.


In [46]:
if (svm.predict(test3) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is not a bike command.


In [47]:
#########################
import joblib
joblib.dump(classifier, 'email_spam_model.pkl')

['email_spam_model.pkl']

In [48]:
###########################################################
import joblib
loaded_model = joblib.load('email_spam_model.pkl')

In [49]:
test1 = ['Start the motor bike now']

In [50]:
if (loaded_model.predict(test1) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is a bike command.


In [51]:
################################################################
import speech_recognition as sr

# Initialize the recognizer
recognizer = sr.Recognizer()

# Function to convert speech to text
def speech_to_text(audio_file_path):
    with sr.AudioFile(audio_file_path) as source:
        audio_data = recognizer.record(source)  # Record the audio file

    try:
        # Use Google Web Speech API to recognize speech
        text = recognizer.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        print("Google Web Speech API could not understand audio")
    except sr.RequestError as e:
        print(f"Could not request results from Google Web Speech API; {e}")


In [52]:
# Example usage
audio_file_path = "harvard.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)

Text result: the still smell of old bearings it takes heat to bring out the order a cold storage find with him tacos Alpha store are my favourite is just for food is the hard cross bun


In [53]:
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is not a bike command.


In [54]:
# Example usage
audio_file_path = "hw_are_you.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)



Text result: hello Anirban how are you


In [55]:
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is not a bike command.


In [56]:
# Example usage
audio_file_path = "Turn_on_headLight.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)

Text result: turn on headlight


In [57]:
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is a bike command.


In [59]:
##################################
# Example usage
audio_file_path = "bike_command_1.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)

Text result: hey TVS show today's weather forecast


In [61]:
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is a bike command.


In [62]:
# Example usage
audio_file_path = "non_bike_command_1.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)

Text result: hello Anirban tell me something about your IISC


In [63]:
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

The input is not a bike command.


In [71]:
# Example usage
audio_file_path = "bk_2.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

Text result: hello play top 5 songs in my playlist
The input is a bike command.


In [72]:
# Example usage
audio_file_path = "bk_3.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

Text result: show me less traffic path to reach IISC
The input is a bike command.


In [73]:
# Example usage
audio_file_path = "nbk_2.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

Text result: hello tell me about my account summary
The input is not a bike command.


In [74]:
# Example usage
audio_file_path = "nbk_3.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

Text result: tell me percentage of my mobile charge
The input is not a bike command.


In [75]:
# Example usage
audio_file_path = "bk_4.wav"
text_result = speech_to_text(audio_file_path)
print("Text result:", text_result)
if (loaded_model.predict([text_result]) == 1):
    print("The input is a bike command.")
else:
    print("The input is not a bike command.")

Text result: show me charging
The input is not a bike command.
