<a href="https://colab.research.google.com/github/ANAGHALAKSHMIA/DataScienceTask1/blob/main/SMS_CLASSIFIER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the SMS dataset. The file is decoded as latin-1 instead of pandas'
# default UTF-8.
df = pd.read_csv('spam.csv', encoding='latin-1')

# Drop the unnamed filler columns when they are present. errors='ignore'
# keeps this cell working on a copy of the CSV that was already trimmed.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

# Fail with a descriptive message if the remaining layout is not the
# expected two columns, rather than with a generic length-mismatch error.
if df.shape[1] != 2:
    raise ValueError(
        f"Expected 2 columns (label, message) in spam.csv, "
        f"found {df.shape[1]}: {list(df.columns)}"
    )
df.columns = ['label', 'message']

# Strip leading/trailing whitespace from both text columns.
df['label'] = df['label'].str.strip()
df['message'] = df['message'].str.strip()

# Remove rows with a missing label or message, then rows whose message is
# empty after stripping. The final .copy() gives later cells an independent
# frame, so assigning to its columns does not write into a slice.
df = df.dropna()
df = df[df['message'] != ''].copy()

# Verify there are no missing values left
print(df.isnull().sum())

label      0
message    0
dtype: int64


In [None]:
# Encode labels (spam=1, ham=0)
label_codes = {'spam': 1, 'ham': 0}

# Only map when the column still holds the text labels. Re-running this cell
# on already-encoded 0/1 values would otherwise turn every label into NaN,
# because Series.map yields NaN for keys missing from the dictionary.
if not df['label'].isin(list(label_codes.values())).all():
    encoded = df['label'].map(label_codes)

    # Surface unexpected label values instead of letting them become NaN.
    unknown = df.loc[encoded.isna(), 'label'].unique().tolist()
    if unknown:
        raise ValueError(f"Unexpected label values: {unknown}")

    df['label'] = encoded.astype('int64')

# Check the distribution of labels
print(df['label'].value_counts())


label
0    4825
1     747
Name: count, dtype: int64


In [None]:
# Hold out 20% of the messages for testing. Stratifying on the label keeps
# the spam/ham proportions the same in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    stratify=df['label'],
    random_state=42,
    test_size=0.2,
)

# Print the partition sizes so features and labels can be checked for
# matching lengths.
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))
print("Training labels size:", len(y_train))
print("Test labels size:", len(y_test))

Training set size: 4457
Test set size: 1115
Training labels size: 4457
Test labels size: 1115


In [None]:
# TF-IDF features: drop English stop words and ignore terms that occur in
# more than half of the training messages.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are projected onto that same fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Show the matrix shapes: the row counts should match the label counts, and
# both matrices should have the same number of columns.
print("Transformed training set size:", X_train_tfidf.shape)
print("Transformed test set size:", X_test_tfidf.shape)


Transformed training set size: (4457, 7440)
Transformed test set size: (1115, 7440)


In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# fit() returns the estimator itself, so `model` is the trained classifier.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages
y_pred = model.predict(X_test_tfidf)

# Report overall accuracy, the per-class precision/recall/F1 table, and the
# confusion matrix on the test set.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy: 0.968609865470852
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
 [[966   0]
 [ 35 114]]
