# Implement e-mail spam filtering using text classification algorithm with appropriate dataset. 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Step 1: Load the dataset
data = pd.read_csv('spam.csv')

In [3]:
# Step 2: Data preprocessing
# 'v1' column has the labels (spam/ham) and 'v2' has the email text
data = data.rename(columns={'v1': 'label', 'v2': 'text'})
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

In [4]:
# Step 3: Vectorize the text data
# Data preprocessing (handle NaN values)
# Drop rows with NaN in the 'text' column
data = data.dropna(subset=['text'])

# Continue with the vectorization step
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])
y = data['label']

In [5]:
# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Step 5: Train the model
#model = MultinomialNB()
#model.fit(X_train, y_train)

In [7]:
#from sklearn.linear_model import LogisticRegression
#model = LogisticRegression()
#model.fit(X_train, y_train)

In [8]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)

In [9]:
# Step 6: Model evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9676840215439856
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       956
           1       1.00      0.77      0.87       158

    accuracy                           0.97      1114
   macro avg       0.98      0.89      0.93      1114
weighted avg       0.97      0.97      0.97      1114

