# Import Modules

In [1]:
import nltk
nltk.download("popular")
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import string
from nltk.corpus import stopwords
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/stef/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/stef/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/stef/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/stef/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/stef/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/stef/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    

# Load Dataset

In [2]:
data = pd.read_csv("dataset.csv")
data.head()

Unnamed: 0,index,source_text,smishing_text,label
0,0,Your bank statement is ready for viewing. Log ...,Urgent: Suspicious activity detected. Log in t...,1.0
1,1,Your package is on its way! Track it using our...,Delivery issue: Your package is on hold. Click...,1.0
2,2,You are receiving this update on your recent p...,Thank you for your purchase! Verify your infor...,1.0
3,3,Your account balance is available. Please log ...,Attention! Account balance alert: Immediate ac...,1.0
4,4,Update your app to the latest version in the A...,Your account requires immediate update. Click ...,1.0


In [3]:
data['label'].value_counts()
# 1 = scam and 0 = safe 

label
1.0    131
Name: count, dtype: int64

In [4]:
data.shape
# number of rows

(201, 4)

# Clean Text

In [5]:
def preprocess_text(text):
    #remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    #convert to lowercase
    text = text.lower()
    #remove stop words that are super common *most important*
    stop_words = set(stopwords.words('english'))
    text = " ".join((word for word in text.split() if word not in stop_words))
    return text
    
preprocess_text("This is a Te$T for / dum,my ^ text")

'tet dummy text'

In [6]:
data['source_text'] = data['source_text'].apply(preprocess_text)
data['smishing_text'] = data['smishing_text'].apply(preprocess_text)

In [7]:
data

Unnamed: 0,index,source_text,smishing_text,label
0,0,bank statement ready viewing log online bankin...,urgent suspicious activity detected log secure...,1.0
1,1,package way track using official app,delivery issue package hold click confirm address,1.0
2,2,receiving update recent purchase,thank purchase verify information confirm,1.0
3,3,account balance available please log view,attention account balance alert immediate acti...,1.0
4,4,update app latest version app store,account requires immediate update click install,1.0
...,...,...,...,...
196,196,subscription benefits renewed,0,
197,197,mortgage payment received thank staying current,0,
198,198,flight checkin available online great trip,0,
199,199,customer service request logged we’ll reach soon,0,


# Vectorization

In [8]:
vectorization_data = TfidfVectorizer()
X = vectorization_data.fit_transform(data['source_text'] + " " + data['smishing_text'])

In [9]:
y = data['label']

# Train Test Split

In [10]:
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Applying Logistic Regression

In [11]:
model = LogisticRegression()

model.fit(X_train, y_train)
y_pred = model.predict(x_test)

print("Accuracy", accuracy_score(y_test, y_pred))
print("Classification", classification_report(y_test, y_pred))
print("confusion", confusion_matrix(y_test, y_pred))

ValueError: Input y contains NaN.

# Random Forest Model

In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 100, random_state = 42)

model.fit(X_train, y_train)

y_pred =  model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

classification_rep = classification_report(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print("classification_rep")
print(cm)

ValueError: Input y contains NaN.

# Naiv Bays Model

In [17]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

classification_rep = classification_report(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print("Confusion Matrix:")
print(cm)

ValueError: Input y contains NaN.

# SVM

In [18]:
from sklearn.svm import SVC

model = SVC(kernel = 'linear', random_state = 42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

classification_rep = classification_report(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix")
print(cm)

ValueError: Input y contains NaN.

# Save SVM and Vectorized

In [23]:
import pickle

pickle.dump(model, open("model.pkl", 'wb'))
pickle.dump(vectorization_data, open('vectorization_data.pkl', 'wb'))

# Load Model and Vectorizer

In [24]:
model = pickle.load(open('model.pkl', 'rb'))
vectorization_data = pickle.load(open('vectorization_data.pkl', 'rb'))

# Detection System

In [25]:
def detect(input_text):
    vectorized_text = vectorization_data.transform([input_text])
    result = model.predict(vectorized_text)
    return "Smishing Detexted" if result[0] == 1 else "No Smishing Detected"

In [26]:
input_text = "Delivery issue: Your package is on hold."
detect(input_text)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [28]:
import sklearn
sklearn.__version__

'1.5.2'