# SPAM SMS DETECTION
## To Do:
### Build an AI model that can classify SMS messages as spam or legitimate. Use techniques like TF-IDF or word embeddings with classifiers like Naive Bayes, Logistic Regression, or Support Vector Machines to identify spam messages.

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Location of the SMS Spam Collection CSV.
# Set the SPAM_CSV_PATH environment variable to point at your own copy of
# spam.csv; the fallback is the original author's local path, kept only so that
# existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'SPAM_CSV_PATH',
    'C:/Users/CHARAN/Downloads/SMS Spam Collection Dataset/spam.csv',
)

# Load the raw dataset; the file is decoded as latin-1.
data = pd.read_csv(DATA_PATH, encoding='latin-1')

In [3]:
# Keep only the label and message-text columns, under descriptive names.
# Renaming first (rename ignores keys that are not present by default) makes
# this cell safe to re-run: on a second run the columns are already called
# 'label'/'message' and the selection below still succeeds instead of raising
# KeyError on 'v1'/'v2'.
# The trailing .copy() yields an independent frame, so assigning to its columns
# later does not trigger SettingWithCopyWarning on pandas versions without
# copy-on-write.
data = (
    data
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .loc[:, ['label', 'message']]
    .copy()
)

In [4]:
# Convert labels to binary values (0 = ham, 1 = spam).
# The identity entries (0 -> 0, 1 -> 1) keep this cell idempotent: re-running it
# on already-encoded labels leaves them unchanged instead of mapping them to NaN.
label_ids = data['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

# Series.map yields NaN for any value missing from the mapping. Fail loudly here
# rather than handing NaN targets to the train/test split and the classifiers.
if label_ids.isna().any():
    unexpected = data.loc[label_ids.isna(), 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values: {unexpected}")

# assign() builds a new frame rather than writing into `data` in place.
data = data.assign(label=label_ids.astype('int64'))

In [5]:
# Hold out 20% of the messages for evaluation.
# The fixed random_state makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Turn the raw message text into TF-IDF features, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# The vectorizer is fitted on the training messages only; the test messages are
# then transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [7]:
# Build the three (still unfitted) models that are compared below.
nb, lr, svm = (
    MultinomialNB(),                    # multinomial Naive Bayes, default settings
    LogisticRegression(max_iter=1000),  # iteration cap set to 1000
    SVC(kernel='linear'),               # support vector classifier, linear kernel
)

In [8]:
# Fit Naive Bayes on the training features, then predict the held-out set.
# fit() returns the estimator itself, so the two calls can be chained.
nb_pred = nb.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Emit header, accuracy and per-class report, one item per line.
print(
    "Naive Bayes Classifier:",
    f"Accuracy: {accuracy_score(y_test, nb_pred)}",
    classification_report(y_test, nb_pred),
    sep="\n",
)


Naive Bayes Classifier:
Accuracy: 0.9668161434977578
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [9]:
# Fit Logistic Regression on the training features, then predict the held-out set.
# fit() returns the estimator itself, so the two calls can be chained.
lr_pred = lr.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Emit header (preceded by a blank line), accuracy and per-class report.
print(
    "\nLogistic Regression Classifier:",
    f"Accuracy: {accuracy_score(y_test, lr_pred)}",
    classification_report(y_test, lr_pred),
    sep="\n",
)


Logistic Regression Classifier:
Accuracy: 0.9524663677130045
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



In [10]:
# Fit the linear SVM on the training features, then predict the held-out set.
# fit() returns the estimator itself, so the two calls can be chained.
svm_pred = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Emit header (preceded by a blank line), accuracy and per-class report.
print(
    "\nSupport Vector Machine Classifier:",
    f"Accuracy: {accuracy_score(y_test, svm_pred)}",
    classification_report(y_test, svm_pred),
    sep="\n",
)


Support Vector Machine Classifier:
Accuracy: 0.979372197309417
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [11]:
# Save every model's evaluation (accuracy + classification report) to a text file.
# The three models share one code path, so a section layout change only has to
# be made in one place. The file contents are the same as when each section was
# written out by hand.
model_predictions = [
    ("Naive Bayes", nb_pred),
    ("Logistic Regression", lr_pred),
    ("Support Vector Machine", svm_pred),
]
section_separator = "\n" + "=" * 60 + "\n"

# An explicit encoding keeps the output independent of the platform default.
with open("spam_classification_results.txt", "w", encoding="utf-8") as f:
    for position, (title, preds) in enumerate(model_predictions):
        # Every section after the first is preceded by a blank line.
        if position:
            f.write("\n")
        f.write(f"{title} Classifier:\n")
        f.write(f"Accuracy: {accuracy_score(y_test, preds)}\n")
        f.write(classification_report(y_test, preds))
        f.write(section_separator)