In [2]:
# Mount Google Drive onto the Colab environment for accessing files and directories.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Importing necessary libraries and modules for data manipulation, computer vision, and machine learning tasks.

import os
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# To train model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [4]:
# Set the main directory path and navigate to it to access files and directories within the NLP project.

main_dir="/content/drive/MyDrive/NLP"
os.chdir(main_dir)
os.listdir()

['SMSSpamCollection']

In [5]:
# Read the SMS data from the specified file into a DataFrame, with columns labeled as "label" and "message".

messages=pd.read_csv('/content/drive/MyDrive/NLP/SMSSpamCollection', sep='\t', names=["label", "message"])

In [6]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
#Data cleaning and preprocessing
# Clean and preprocess each message by removing non-alphabetic characters, converting to lowercase,
# removing stopwords, and stemming words using the PorterStemmer.

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
  review = re.sub('[^a-zA-Z]', '', messages['message'][i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review =' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
### Creating the Bag of Words model ###

# Creating a Bag of Words model with a maximum of 2500 features and converting the text data into numerical features.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values

In [9]:
# Train Test Split
# Splitting the data into training and testing sets, with 20% of the data allocated for testing and setting a seed for reproducibility.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Training model using Naive bayes classifier
# Training a Naive Bayes classifier using the Multinomial Naive Bayes algorithm and evaluating its accuracy on the test set.

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)
y_pred=spam_detect_model.predict(X_test)
from sklearn.metrics import accuracy_score
print("accuracy:", accuracy_score(y_test, y_pred))

accuracy: 0.8699551569506726


In [11]:
### Creating the TF_IDF model ###

# Creating a TF-IDF (Term Frequency-Inverse Document Frequency) model with a maximum of 2500 features and converting the text data into numerical features.

from sklearn.feature_extraction.text import TfidfVectorizer
# fit
tfvectorizer = TfidfVectorizer(max_features=2500)
X = tfvectorizer.fit_transform(corpus).toarray()

y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values

In [12]:
# Train Test Split
# Splitting the TF-IDF features and the target variable into training and testing sets, with 20% of the data allocated for testing and setting a seed for reproducibility.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [13]:
# Training model using Naive bayes classifier
# Training a Naive Bayes classifier using the Multinomial Naive Bayes algorithm and evaluating its accuracy on the test set.

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

y_pred=spam_detect_model.predict(X_test)

from sklearn.metrics import accuracy_score
print("accuracy:", accuracy_score(y_test, y_pred))

accuracy: 0.8600896860986547
