In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [10]:
# load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

print(df.head())


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [11]:
print(df.dtypes)

v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object


In [12]:
# Drop unnecessary columns
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [44]:
# Rename columns for better understanding
df.columns = ['label', 'message']

In [45]:
# Mapping the 'label' column in a way thar it shows 1 for spam, 0 for legitimate
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Display the distribution of classes
print(df['label'].value_counts())

Series([], Name: count, dtype: int64)


In [46]:
# Check for missing values in features and target
print("\nMissing values in the features and target:")
print("X missing values:", X.isnull().sum())
print("y missing values:", y.isnull().sum())

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nShapes after splitting:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)



Missing values in the features and target:
X missing values: 0
y missing values: 0

Shapes after splitting:
X_train shape: (4457,)
X_test shape: (1115,)
y_train shape: (4457,)
y_test shape: (1115,)


In [47]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit training data, transform the test data
X_train_vectorizer = vectorizer.fit_transform(X_train)
X_test_vectorizer = vectorizer.transform(X_test)


In [48]:
# Check the shapes of the vectorized data
print("\nShapes after TF-IDF vectorization:")
print("X_train_vectorizer shape:", X_train_vectorizer.shape)
print("X_test_vectorizer shape:", X_test_vectorizer.shape)


Shapes after TF-IDF vectorization:
X_train_vectorizer shape: (4457, 7472)
X_test_vectorizer shape: (1115, 7472)


In [51]:
# Naive Bayes is used for its simplicity and effectiveness in handling text classification tasks, assuming feature independence.
# Initialize the Naive Bayes model
naive_bayes_model = MultinomialNB()

# Train the model with the training data
naive_bayes_model.fit(X_train_vectorizer, y_train)

# Predict on the test data
y_pred = naive_bayes_model.predict(X_test_vectorizer)

In [50]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9668

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Confusion Matrix:
[[965   0]
 [ 37 113]]
