In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd


df = pd.read_csv("../dataset/sms.csv")

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['SMS test'])  
y = df['Fraudolent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)


## Prediction Function

In [63]:

def predict_fraud(user_query):
    user_query_vectorized = vectorizer.transform([user_query])
    prediction = model.predict(user_query_vectorized)

    if prediction[0] == 1:
        return "The message IS fraudulent."
    else:
        return "The message is NOT fraudulent."


### Quick-Testing the model

In [64]:
non_fraud = "Squeeeeeze!! This is christmas hug.. If u lik my frndshp den hug me back.. If u get 3 u r cute:) 6 u r luvd:* 9 u r so lucky;) None? People hate u:"
print(predict_fraud(non_fraud))

The message is NOT fraudulent.


In [65]:
frau = "FREE RINGTONE text FIRST to 87131 for a poly or text GET to 87131 for a true tone! Help? 0845 2814032 16 after 1st free, tones are 3x£150pw to e£nd txt stop"
print(predict_fraud(frau))

The message IS fraudulent.


In [66]:
custom_fraud = "GET 1000$ FREE NOW BY CLICKING HERE http://www.1000dollarsfreeclickhere.com"
print(predict_fraud(custom_fraud))

The message IS fraudulent.


In [67]:
custom_non_fraud = "Hi, how are you? I'm fine, thanks for asking."
print(predict_fraud(custom_non_fraud))

The message is NOT fraudulent.


### Ambiguous/Hybrid Cases

In [68]:
hybrid  = "Hi, how are you? I'm fine, thanks for asking. GET 1000$ FREE NOW BY CLICKING HERE http://www.1000dollarsfreeclickhere.com"
print(predict_fraud(hybrid))

The message IS fraudulent.


In [69]:
ambiguos = "I love you :3, send me 1000$"
print(predict_fraud(ambiguos))

The message is NOT fraudulent.


## Accuracy test

In [70]:
# Predict on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy:.2f}")

Accuracy of the model: 0.98


In [71]:
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")

Accuracy: 0.98
Confusion Matrix:
[[949   4]
 [ 22 140]]
Precision: 0.97
Recall: 0.86
F1 Score: 0.92
ROC AUC Score: 0.93
