In [26]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load the dataset (using the spam.csv file)
# latin-1 is used because it can decode any byte sequence without raising.
df = pd.read_csv('spam.csv', encoding='latin-1')

# Step 2: Check the first few rows to understand the structure of the CSV file
print(df.head())

# Step 3: Preprocess the data - Rename columns if needed
# rename() ignores keys that are not present, so this is a no-op when the
# file already uses 'Category' / 'Message' as its headers.
df = df.rename(columns={'v1': 'Category', 'v2': 'Message'})

# Step 4: Check for missing values in the 'Category' column (target variable)
print("Missing values in Category column:", df['Category'].isna().sum())

# Step 5: Remove rows where 'Category' is NaN
df = df.dropna(subset=['Category'])

# Step 6: Convert 'ham' and 'spam' labels to binary (ham = 0, spam = 1)
df['Category'] = df['Category'].map({'ham': 0, 'spam': 1})
# map() yields NaN for any label outside the mapping; stop here with a clear
# message instead of handing NaN targets to the classifier.
if df['Category'].isna().any():
    raise ValueError("Category column contains labels other than 'ham'/'spam'")

# Step 7: Handle missing messages, then make sure every message is a string
# Order matters: astype(str) would turn NaN into the literal text 'nan',
# leaving nothing for fillna('') to replace.
df['Message'] = df['Message'].fillna('').astype(str)

# Step 8: Preprocess the messages - Remove stopwords
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The stopwords corpus is not installed locally; fetch it once and retry.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Tokens are whitespace-separated; the comparison is case-insensitive, but the
# kept words retain their original casing.
df['Message'] = df['Message'].apply(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in stop_words
    )
)

# Step 9: Separate features from the target, then hold out 20% of rows for testing
X, y = df['Message'], df['Category']  # message text / ham-spam label

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed keeps the split reproducible between runs
)

# Step 10: Turn the text into TF-IDF features
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
vec = TfidfVectorizer()
X_train_tfidf = vec.fit_transform(X_train)
X_test_tfidf = vec.transform(X_test)

# Step 11: Fit a Multinomial Naive Bayes classifier on the training features
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Step 12: Predict labels for the held-out test messages
y_pred = model.predict(X_test_tfidf)

# Step 13: Overall accuracy on the test split, printed as a percentage
a = accuracy_score(y_test, y_pred)
print(f"Accuracy: {a*100:.2f}%")

# Step 14: Per-class precision, recall and F1-score
# Heading and body go through one print(); sep="\n" puts them on separate lines.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Step 15: Confusion matrix (true labels vs. predicted labels)
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
Missing values in Category column: 0
Accuracy: 97.67%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
[[966   0]
 [ 26 123]]
