In [12]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Step 1: Load dataset
# The CSV location is machine-specific, so it can be overridden by setting the
# EMAILS_CSV_PATH environment variable. When the variable is unset (or empty)
# the original hard-coded location is used, so existing setups keep working.
DATA_PATH = (
    os.environ.get("EMAILS_CSV_PATH")
    or "C:\\Users\\Student\\Downloads\\emails.csv\\emails.csv"
)
data = pd.read_csv(DATA_PATH)

# Step 2: Inspect dataset
# Show the first rows, then the total count of missing cells over all columns
# (isnull().sum() counts per column; the second sum() adds those counts up).
print("Dataset sample:")
print(data.head())
print("\nChecking for missing values:")
print(data.isnull().sum().sum(), "missing values found")

# Step 3: Prepare features and labels
# Every column except the row identifier ("Email No.") and the target
# ("Prediction") is kept as a feature; the target column becomes the label.
X = data.drop(columns=["Email No.", "Prediction"])
y = data["Prediction"]

# Step 4: Split dataset
# Hold out 20% of the rows for testing. stratify=y keeps the label proportions
# the same in the train and test parts, so the evaluation is not skewed by an
# uneven draw of the classes. random_state fixes the shuffle so the split is
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 5: Train Multinomial Naive Bayes model
# fit() returns the fitted estimator, so construction and training are chained.
model = MultinomialNB().fit(X_train, y_train)

# Step 6: Evaluate model
# Predict on the held-out rows, then print each metric under its own heading.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))

# Step 7: Predict new email with zero vector (example)
# A single row with every feature set to zero serves as the example input; it
# reuses the training column names so the frame lines up with the fitted model.
n_features = len(X.columns)
new_email = pd.DataFrame(np.zeros((1, n_features)), columns=X.columns)
prediction = model.predict(new_email)[0]
if prediction == 1:
    label = "Spam"
else:
    label = "Not Spam"
print("\nNew Email Prediction:", label)


Dataset sample:
  Email No.  the  to  ect  and  for  of    a  you  hou  ...  connevey  jay  \
0   Email 1    0   0    1    0    0   0    2    0    0  ...         0    0   
1   Email 2    8  13   24    6    6   2  102    1   27  ...         0    0   
2   Email 3    0   0    1    0    0   0    8    0    0  ...         0    0   
3   Email 4    0   5   22    0    5   1   51    2   10  ...         0    0   
4   Email 5    7   6   17    1    5   2   57    0    9  ...         0    0   

   valued  lay  infrastructure  military  allowing  ff  dry  Prediction  
0       0    0               0         0         0   0    0           0  
1       0    0               0         0         0   1    0           0  
2       0    0               0         0         0   0    0           0  
3       0    0               0         0         0   0    0           0  
4       0    0               0         0         0   1    0           0  

[5 rows x 3002 columns]

Checking for missing values:
0 missing values