In [1]:
setwd("C:/Users/DJ COMPUTERS/OneDrive/Desktop/R prog")
### SMS FRAUD DETECTION MODEL ###
### ADITYA KUMAR ROY ###

In [3]:
# Load required libraries
library(readr)
library(dplyr)
library(tidyr)
library(tm)         # For text mining
library(caret)      # For model evaluation
library(e1071)      # For Naive Bayes and SVM
library(text2vec)   # For TF-IDF
library(glmnet)     # For Logistic Regression

In [4]:
# loading datasets
sms_data <- read_csv("spam (1).csv",show_col_types = FALSE)


[1m[22mNew names:
[36m•[39m `` -> `...3`
[36m•[39m `` -> `...4`
[36m•[39m `` -> `...5`


In [5]:
#Data Preprocessing

# Adjust column names
colnames(sms_data) <- c("label", "message")  
sms_data$label <- as.factor(sms_data$label)

# Remove or replace any non-UTF-8 characters
sms_data$message <- iconv(sms_data$message, "UTF-8", "ASCII", sub = "")
sms_data$message <- sms_data$message %>%
  tolower() %>%removePunctuation() %>%removeNumbers() %>%stripWhitespace()

In [6]:
# Exploratory Data Analysis

head(sms_data)
summary(sms_data)
str(sms_data)

label,message,NA,NA,NA
<fct>,<chr>,<chr>,<chr>.1,<chr>.2
ham,go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat,,,
ham,ok lar joking wif u oni,,,
spam,free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry questionstd txt ratetcs apply overs,,,
ham,u dun say so early hor u c already then say,,,
ham,nah i dont think he goes to usf he lives around here though,,,
spam,freemsg hey there darling its been weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send to rcv,,,


  label        message               NA                 NA           
 ham :4825   Length:5572        Length:5572        Length:5572       
 spam: 747   Class :character   Class :character   Class :character  
             Mode  :character   Mode  :character   Mode  :character  
      NA           
 Length:5572       
 Class :character  
 Mode  :character  

spc_tbl_ [5,572 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ label  : Factor w/ 2 levels "ham","spam": 1 1 2 1 1 2 1 1 2 2 ...
 $ message: chr [1:5572] "go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat" "ok lar joking wif u oni" "free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry questionstd txt ratetcs apply overs" "u dun say so early hor u c already then say" ...
 $ NA     : chr [1:5572] NA NA NA NA ...
 $ NA     : chr [1:5572] NA NA NA NA ...
 $ NA     : chr [1:5572] NA NA NA NA ...
 - attr(*, "spec")=
  .. cols(
  ..   v1 = [31mcol_character()[39m,
  ..   v2 = [31mcol_character()[39m,
  ..   ...3 = [31mcol_character()[39m,
  ..   ...4 = [31mcol_character()[39m,
  ..   ...5 = [31mcol_character()[39m
  .. )
 - attr(*, "problems")=<externalptr> 


In [7]:
#Train-Test Split
set.seed(123)
train_index <- createDataPartition(sms_data$label, p = 0.7, list = FALSE)
train_data <- sms_data[train_index, ]
test_data <- sms_data[-train_index, ]

In [8]:
# TF-IDF Vectorization
vectorizer <- text2vec::itoken(train_data$message, progressbar = FALSE)
vocab <- text2vec::create_vocabulary(vectorizer)
dtm_train <- text2vec::create_dtm(vectorizer, text2vec::vocab_vectorizer(vocab))
tfidf <- text2vec::TfIdf$new()
dtm_train <- tfidf$fit_transform(dtm_train)

# Transform test data with the same TF-IDF model
dtm_test <- text2vec::create_dtm(itoken(test_data$message), text2vec::vocab_vectorizer(vocab))
dtm_test <- tfidf$transform(dtm_test)

In [9]:
# Naive Bayes
dtm_train_matrix <- as.matrix(dtm_train)
nb_model <- naiveBayes(dtm_train_matrix, train_data$label)
dtm_test_matrix <- as.matrix(dtm_test)
nb_pred <- predict(nb_model, dtm_test_matrix)
nb_cm <- confusionMatrix(nb_pred, test_data$label)
print("Naive Bayes Model Evaluation:")
print(nb_cm)

[1] "Naive Bayes Model Evaluation:"
Confusion Matrix and Statistics

          Reference
Prediction  ham spam
      ham     0    0
      spam 1447  224
                                          
               Accuracy : 0.1341          
                 95% CI : (0.1181, 0.1513)
    No Information Rate : 0.8659          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.0000          
            Specificity : 1.0000          
         Pos Pred Value :    NaN          
         Neg Pred Value : 0.1341          
             Prevalence : 0.8659          
         Detection Rate : 0.0000          
   Detection Prevalence : 0.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : ham  

In [10]:
#Logistic Regression
# Convert DTM to matrix for glmnet
x_train <- as.matrix(dtm_train)
x_test <- as.matrix(dtm_test)
log_model <- glmnet(x_train, train_data$label, family = "binomial", alpha = 0)
log_pred <- predict(log_model, newx = x_test, type = "class", s = 0.01)  # Adjust 's' for regularization
log_cm <- confusionMatrix(as.factor(log_pred), test_data$label)
print("Logistic Regression Model Evaluation:")
print(log_cm)

[1] "Logistic Regression Model Evaluation:"
Confusion Matrix and Statistics

          Reference
Prediction  ham spam
      ham  1446  103
      spam    1  121
                                          
               Accuracy : 0.9378          
                 95% CI : (0.9251, 0.9489)
    No Information Rate : 0.8659          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.668           
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9993          
            Specificity : 0.5402          
         Pos Pred Value : 0.9335          
         Neg Pred Value : 0.9918          
             Prevalence : 0.8659          
         Detection Rate : 0.8654          
   Detection Prevalence : 0.9270          
      Balanced Accuracy : 0.7697          
                                          
       'Positive' Class

In [10]:
### LOGIISTIC REGRESSION MODEL IS THE BEST FIT FOR PREDICTING FRAUD SMS(FOR THE GIVEN DATASETS).