# Topic: Natural Language Process Scam Detection
# 1- N-Gram Analysis
# 2-

In [1]:
import pandas as pd
df = pd.read_csv('/content/SMS Spam Dataset.csv')
df.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [2]:
df.shape

(5574, 2)

# Cleaning data..

In [3]:
import re
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove links
    text = re.sub(r'http\S+', '', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    # Initialize Porter Stemmer
    stemmer = PorterStemmer()

    # Perform stemming
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    # Join the stemmed words back into a single string
    cleaned_text = ' '.join(stemmed_words)

    return cleaned_text

df['clean_text'] = df['sms'].apply(lambda x: clean_text(x))

In [5]:
df.head()

Unnamed: 0,sms,label,clean_text
0,"Go until jurong point, crazy.. Available only ...",0,go jurong point crazi avail bugi n great world...
1,Ok lar... Joking wif u oni...\n,0,ok lar joke wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entri wkli comp win fa cup final tkt st m...
3,U dun say so early hor... U c already then say...,0,u dun say earli hor u c alreadi say
4,"Nah I don't think he goes to usf, he lives aro...",0,nah dont think goe usf live around though


In [6]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4827
1,747


# Balancing Data.

In [7]:
from imblearn.over_sampling import RandomOverSampler
x = df.drop('label', axis= 1)
y = df['label']

ros = RandomOverSampler()

x_resample, y_resample = ros.fit_resample(x, y)

balanced_df = pd.DataFrame(x_resample, columns= x.columns)
balanced_df['label'] = y_resample



In [8]:
balanced_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4827
1,4827


# Train Test Split & Vectorization(TF-IDF).

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Doing train_test_split.
x_train, x_test, y_train, y_test = train_test_split(balanced_df['clean_text'], balanced_df['label'], test_size= 0.2)

# Applying TF-IDF Vectorization.
tfvec = TfidfVectorizer()
x_train_tfVec = tfvec.fit_transform(x_train)
x_test_tfVec = tfvec.transform(x_test)

In [10]:
x_train_tfVec.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Traning 3 Classifiers and then build them.

In [11]:
# First RandomForestClassifiers.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
# Fit and Prediction.
rfc = RandomForestClassifier()
rfc.fit(x_train_tfVec, y_train)
rfc_prediction = rfc.predict(x_test_tfVec)

# Confusion Matrix & Classification_report.

conf = confusion_matrix(y_test, rfc_prediction)
print('Confusion Matrix')
print(conf)

clasf = classification_report(y_test, rfc_prediction)
print('Classification Report')
print(clasf)

Confusion Matrix
[[955   0]
 [  0 976]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       955
           1       1.00      1.00      1.00       976

    accuracy                           1.00      1931
   macro avg       1.00      1.00      1.00      1931
weighted avg       1.00      1.00      1.00      1931



In [12]:
# Secondly LogisticRegression.

from sklearn.linear_model import LogisticRegression

# Fit & Prediction

lr = LogisticRegression()
lr.fit(x_train_tfVec, y_train)
lr_prediction = lr.predict(x_test_tfVec)

# Confusion Matrix & Classification Report

conf = confusion_matrix(y_test, lr_prediction)
print('Confusion Matrix')
print(conf)

clasf = classification_report(y_test, lr_prediction)
print('Classification Report')
print(clasf)

Confusion Matrix
[[946   9]
 [  8 968]]
Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.99      0.99      0.99       976

    accuracy                           0.99      1931
   macro avg       0.99      0.99      0.99      1931
weighted avg       0.99      0.99      0.99      1931



In [13]:
# Third SVM Classifier.

from sklearn.svm import SVC
# SVC Classifier.

svm_classf = SVC(kernel= 'linear')

# Fit & Prediction
svm_classf.fit(x_train_tfVec, y_train)
svm_prediction = svm_classf.predict(x_test_tfVec)

# Confusion Matrix & Classification Report

conf = confusion_matrix(y_test, svm_prediction)
print('Confusion Matrix')
print(conf)

clasf = classification_report(y_test, svm_prediction)
print('Classification Report')
print(clasf)

Confusion Matrix
[[946   9]
 [  1 975]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       955
           1       0.99      1.00      0.99       976

    accuracy                           0.99      1931
   macro avg       0.99      0.99      0.99      1931
weighted avg       0.99      0.99      0.99      1931



# Ensemble Learning

In [14]:
from sklearn.ensemble import VotingClassifier
# Defining Ensemble Classf with SVM

ensemble_classf = VotingClassifier(estimators= [
    ('Random_Classf ', rfc),
    ('Logistic Reg', lr),
    ('SVM_Classf', svm_classf)
], voting= 'hard') # 'hard' is for majority voting.

# Fit & Preds.

ensemble_classf.fit(x_train_tfVec, y_train)
ensemble_pred = ensemble_classf.predict(x_test_tfVec)

# Confusion & Classification Report.

conf = confusion_matrix(y_test, ensemble_pred)
print('Confusion Matrix')
print(conf)

clasf = classification_report(y_test, ensemble_pred)
print('Classification Report')
print(clasf)

Confusion Matrix
[[950   5]
 [  1 975]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       955
           1       0.99      1.00      1.00       976

    accuracy                           1.00      1931
   macro avg       1.00      1.00      1.00      1931
weighted avg       1.00      1.00      1.00      1931



# Saving & Loading Files.

In [15]:
import pickle

pickle.dump(rfc, open('rfc_model.pkl', 'wb'))
pickle.dump(tfvec, open('tfvec.pkl', 'wb'))

# Loading rfc

with open('rfc_model.pkl','rb') as file:
    rf_classf = pickle.load(file)

# Loading tfvec.

with open('tfvec.pkl', 'rb') as file:
    tf_vec = pickle.load(file)

# Detection System.

In [16]:
def predict_real_or_fake(text):
    cleaning_text = clean_text(text)
    vec_text = tfvec.transform([cleaning_text])
    prediction = rf_classf.predict(vec_text)
    return prediction

In [17]:
# Test 1

sample_text = "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.\n"
res = predict_real_or_fake(sample_text)

if res[0] == 0:
    print('Real')
else:
    print('Fake')

Real


In [18]:
# Test 2

sample_text2 = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n"
res = predict_real_or_fake(sample_text2)

if res[0] == 0:
    print('Real')
else:
    print('Fake')

Fake
