<a href="https://colab.research.google.com/github/Bienbaz/Bienbaz/blob/main/NLP_Model_for_Scams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install + Imports

In [1]:
# =========================
# COLAB CELL 1: Install + Imports
# =========================
!pip -q install nltk

import re
import random
import pandas as pd

import nltk
nltk.download("punkt")
nltk.download("stopwords")

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset (100 rows)

(                                                text  label     lang
 0  POS: Payment N80000 no reflect. Send OTP 43990...  Fraud    mixed
 1  Verify links before clicking; scammers often m...   Safe  english
 2  POS: Payment N150000 no reflect. Send OTP 1748...  Fraud    mixed
 3  Dear customer, confirm your PIN to stop unauth...  Fraud  english
 4  Your account will be suspended in 24 hours. Ve...  Fraud  english
 5  Use bank app or USSD you know, not the one ins...   Safe   pidgin
 6  POS issue: you pay N12,000 but e no show. Send...  Fraud   pidgin
 7  Always confirm transactions in your bank app b...   Safe    mixed
 8  Don Allah ka tuna ka canza kalmar sirri lokaci...   Safe    hausa
 9  Assalamu alaikum, an toshe katin ATM dinka. Da...  Fraud    hausa,
 label
 Fraud    50
 Safe     50
 Name: count, dtype: int64)

Preprocess Multilingual Text (NLTK-based)

In [4]:
# =========================
# COLAB CELL 3: Preprocess Multilingual Text (NLTK-based)
# - Lowercase
# - Remove URLs, emails, phone-like numbers
# - Keep letters/numbers/spaces
# - Tokenize
# - Remove English stopwords only (works fine even if mixed; Hausa/Pidgin are kept)
# =========================
nltk.download("punkt_tab")
EN_STOPWORDS = set(stopwords.words("english"))

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)                 # urls
    text = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", " ", text)      # emails
    text = re.sub(r"\b\d{10,}\b", " ", text)                     # long numbers (phones/accts)
    text = re.sub(r"[^a-z0-9\s]", " ", text)                     # punctuation (basic)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize_and_filter(text: str) -> str:
    text = clean_text(text)
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in EN_STOPWORDS and len(t) > 1]
    return " ".join(tokens)

df["text_clean"] = df["text"].apply(tokenize_and_filter)

df[["text", "text_clean", "label", "lang"]].head(10)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,text,text_clean,label,lang
0,POS: Payment N80000 no reflect. Send OTP 43990...,pos payment n80000 reflect send otp 439902 mak...,Fraud,mixed
1,Verify links before clicking; scammers often m...,verify links clicking scammers often mimic ban...,Safe,english
2,POS: Payment N150000 no reflect. Send OTP 1748...,pos payment n150000 reflect send otp 174870 ma...,Fraud,mixed
3,"Dear customer, confirm your PIN to stop unauth...",dear customer confirm pin stop unauthorized tr...,Fraud,english
4,Your account will be suspended in 24 hours. Ve...,account suspended 24 hours verify details via ...,Fraud,english
5,"Use bank app or USSD you know, not the one ins...",use bank app ussd know one inside suspicious m...,Safe,pidgin
6,"POS issue: you pay N12,000 but e no show. Send...",pos issue pay n12 000 show send otp make reverse,Fraud,pidgin
7,Always confirm transactions in your bank app b...,always confirm transactions bank app leave pos,Safe,mixed
8,Don Allah ka tuna ka canza kalmar sirri lokaci...,allah ka tuna ka canza kalmar sirri lokaci lok...,Safe,hausa
9,"Assalamu alaikum, an toshe katin ATM dinka. Da...",assalamu alaikum toshe katin atm dinka danna w...,Fraud,hausa


(TF-IDF + Logistic Regression)

In [5]:
# =========================
# COLAB CELL 4: Train/Test Split + ML Pipeline (TF-IDF + Logistic Regression)
# =========================
X = df["text_clean"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=1,
        max_df=0.95
    )),
    ("clf", LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)

pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))


Confusion Matrix:
 [[6 4]
 [1 9]]

Classification Report:
               precision    recall  f1-score   support

       Fraud       0.86      0.60      0.71        10
        Safe       0.69      0.90      0.78        10

    accuracy                           0.75        20
   macro avg       0.77      0.75      0.74        20
weighted avg       0.77      0.75      0.74        20



(input() -> prediction)

In [6]:
# =========================
# COLAB CELL 5: Interactive Testing Cell (input() -> prediction)
# =========================
def predict_message(msg: str):
    msg_clean = tokenize_and_filter(msg)
    pred = model.predict([msg_clean])[0]
    proba = model.predict_proba([msg_clean])[0]
    classes = list(model.named_steps["clf"].classes_)
    conf = dict(zip(classes, proba))
    return pred, conf, msg_clean

while True:
    user_msg = input("Type a message to test (or type 'exit' to stop): ").strip()
    if user_msg.lower() == "exit":
        print("Bye!")
        break

    label, conf, cleaned = predict_message(user_msg)

    print("\n--- Result ---")
    print("Original:", user_msg)
    print("Cleaned :", cleaned)
    print("Prediction:", "ðŸš¨ Fraud/Scam" if label == "Fraud" else "âœ… Safe")
    print("Confidence:", {k: round(v, 3) for k, v in conf.items()})
    print("-------------\n")


Type a message to test (or type 'exit' to stop): Congrats! You don win N{amt} promo. Send BVN + acct to claim.

--- Result ---
Original: Congrats! You don win N{amt} promo. Send BVN + acct to claim.
Cleaned : congrats win amt promo send bvn acct claim
Prediction: ðŸš¨ Fraud/Scam
Confidence: {'Fraud': np.float64(0.646), 'Safe': np.float64(0.354)}
-------------

Type a message to test (or type 'exit' to stop): exit
Bye!


Save dataset to CSV

In [7]:
# =========================
# (OPTIONAL) COLAB CELL 6: Save dataset to CSV
# =========================
df.to_csv("nigeria_scam_synthetic_dataset.csv", index=False)
print("Saved: nigeria_scam_synthetic_dataset.csv")


Saved: nigeria_scam_synthetic_dataset.csv
