In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import stopwords
import string
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score

In [None]:
encodings = ['utf-8', 'iso-8859-1', 'latin1', 'cp1252']

for encoding in encodings:
    try:
        data = pd.read_csv('/content/drive/MyDrive/spam.csv', encoding=encoding)
        print(f'Successfully read the file using encoding: {encoding}')
        break
    except UnicodeDecodeError:
        print(f'Failed to read using encoding: {encoding}')

Failed to read using encoding: utf-8
Successfully read the file using encoding: iso-8859-1


In [None]:
nltk.download('punkt')

# Step 2: Data Preprocessing

# Tokenization: Split the text into individual words (tokens)
def tokenize_text(text):
    return nltk.word_tokenize(text)

# Lowercasing: Convert all words to lowercase
def lowercase_text(tokens):
    return [word.lower() for word in tokens]

# Stopword Removal: Remove common words (stopwords)
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

# Data Cleaning: Remove special characters, numbers, and symbols
def clean_text(text):
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Remove numbers
    text = ''.join([char for char in text if not char.isdigit()])
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

# Apply the preprocessing steps to dataset
data['v2'] = data['v2'].apply(tokenize_text)
data['v2'] = data['v2'].apply(lowercase_text)
data['v2'] = data['v2'].apply(remove_stopwords)
data['v2'] = data['v2'].apply(clean_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Step 3: Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['v2'])
y = data['v1']

In [None]:
# Step 4: Model Selection (Naive Bayes)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create and train the Naive Bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)


In [None]:
# Step 5: Model Evaluation
y_pred = naive_bayes_classifier.predict(X_test)

In [None]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')


Accuracy: 0.89
Precision: 1.00
Recall: 0.21
F1-Score: 0.35


In [None]:
# Generate a classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.89      1.00      0.94       965
        spam       1.00      0.21      0.35       150

    accuracy                           0.89      1115
   macro avg       0.95      0.61      0.65      1115
weighted avg       0.91      0.89      0.86      1115

[[965   0]
 [118  32]]


In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters and their range to search
param_grid = {
    'alpha': [0.1, 0.2, 0.5, 1.0],
}

# Used GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(naive_bayes_classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

#To Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']

# Create a new Naive Bayes classifier with the best hyperparameters
best_naive_bayes_classifier = MultinomialNB(alpha=best_alpha)

# Train the model with the best hyperparameters on the entire training set
best_naive_bayes_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_naive_bayes_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

print(classification_report(y_test, y_pred))


Accuracy: 0.94
Precision: 0.90
Recall: 0.63
F1-Score: 0.75
              precision    recall  f1-score   support

         ham       0.95      0.99      0.97       965
        spam       0.90      0.63      0.75       150

    accuracy                           0.94      1115
   macro avg       0.93      0.81      0.86      1115
weighted avg       0.94      0.94      0.94      1115



#Testing the model on sample input

In [None]:
while True:
    user_input = input("Enter an SMS text (or type 'exit' to quit): ")

    if user_input.lower() == 'exit':
        break

    user_inputs = [user_input]
    user_inputs_tfidf = tfidf_vectorizer.transform(user_inputs)

    prediction = best_naive_bayes_classifier.predict(user_inputs_tfidf)

    if prediction[0] == 'spam':
        print("Spam")
    else:
        print("Not Spam")

Enter an SMS text (or type 'exit' to quit): SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info 
Spam
Enter an SMS text (or type 'exit' to quit): Congratulations! You've won a free iPhone. Claim your prize now by clicking this link. 
Spam
Enter an SMS text (or type 'exit' to quit): Hi, are you available for a meeting tomorrow at 2 PM?
Not Spam
Enter an SMS text (or type 'exit' to quit): Your order has been confirmed and will be delivered tomorrow.
Not Spam
Enter an SMS text (or type 'exit' to quit): Free iPhone giveaway. Claim yours now. Limited stock available.
Not Spam
Enter an SMS text (or type 'exit' to quit): Get rich quick! Double your income in just one week. Don't miss this opportunity!
Not Spam
Enter an SMS text (or type 'exit' to quit): You've been selected to receive a free movie pass. Click the link to redeem your pass at xxxmobilemovieclub.com
Not Spam
Enter an SMS text (or type 'exit' to 