In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score


In [5]:
# Load dataset
dataset1 = pd.read_csv(r"C:\Users\HUPSIKA CHARY\Downloads\SMS_Classify\SMS_Classify\spam.csv", encoding='latin-1')
df = dataset1[['v2', 'v1']]


In [6]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\HUPSIKA
[nltk_data]     CHARY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\HUPSIKA
[nltk_data]     CHARY\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\HUPSIKA
[nltk_data]     CHARY\AppData\Roaming\nltk_data...


True

In [7]:
# Initialize the lemmatizer and stopwords
lemma = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))

In [8]:
# Define the tokenizer function
def tokenizer(row):
    row = row.lower()
    row = re.sub(r'[^a-zA-Z\s]', ' ', row)
    row = ' '.join([word for word in row.split() if word not in stopwords_set])
    row = ' '.join([lemma.lemmatize(word) for word in row.split()])
    return row

In [9]:
# Apply the tokenizer function to the 'text' column
df.loc[:, 'v2'] = df['v2'].apply(tokenizer)

In [10]:
# Split data
x_train, x_test, y_train, y_test = train_test_split(df['v2'], df['v1'], test_size=0.2, random_state=42)

In [11]:
# Vectorization
vt = TfidfVectorizer(max_features=50000, lowercase=False, ngram_range=(1, 2))
x_train = vt.fit_transform(x_train)
x_test = vt.transform(x_test)

In [12]:
# Create DataFrames with meaningful column names
x_train_df = pd.DataFrame(x_train.toarray(), columns=vt.get_feature_names_out())
x_test_df = pd.DataFrame(x_test.toarray(), columns=vt.get_feature_names_out())

In [13]:
# MODEL
model = MultinomialNB()
model.fit(x_train, y_train)
pred = model.predict(x_train)


In [14]:
# Model evaluation on training set
cf = confusion_matrix(y_train, pred)
ac = accuracy_score(y_train, pred)
print("Confusion Matrix on Training Set:")
print(cf)
print("Accuracy on Training Set:", ac)

Confusion Matrix on Training Set:
[[3860    0]
 [  97  500]]
Accuracy on Training Set: 0.9782364819385236


In [15]:
# Predicting for a mail
text = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
text = tokenizer(text)

In [16]:
# Assuming vt is the vectorizer trained during training
text_vectorized = vt.transform([text]).toarray()


In [17]:
# Ensure the dimensions are consistent with the model's expectations
if text_vectorized.shape[1] != x_train_df.shape[1]:
    raise ValueError("Mismatch in the number of features between vectorizer and model.")

In [18]:
# Make predictions
pred1 = model.predict(text_vectorized)
print("Prediction for the mail:", pred1)

Prediction for the mail: ['spam']
