In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Spam classifier: bag-of-words features (CountVectorizer) fed into a
# logistic-regression model, evaluated on a held-out split.

# Tunable settings, gathered in one place so nothing is hard-coded further down.
DATA_PATH = '../datasets/spam.csv'
TEXT_COLUMN = 'Message'      # column holding the raw message text
LABEL_COLUMN = 'Category'    # column holding the class label
POS_LABEL = "spam"           # label treated as the positive class by the metrics
TEST_SIZE = 0.2              # fraction of rows held out for evaluation
RANDOM_STATE = 42            # fixes the shuffle so the split is reproducible

# Load the dataset
data = pd.read_csv(DATA_PATH)

# Preview the first rows. A bare `data.head()` in the middle of a cell is
# evaluated and then discarded (only a cell's last expression is rendered),
# so the preview has to be printed explicitly to show up in the output.
print(data.head())

# Count missing values per column
print(data.isna().sum())


# Split the dataset into raw text (features) and labels (target)
x = data[TEXT_COLUMN].values
y = data[LABEL_COLUMN].values

# NOTE(review): the split is not stratified. If the classes are imbalanced,
# passing stratify=y would keep the class ratio equal in both parts. Confirm
# before changing, since it alters the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)


# Convert email text into numerical features using CountVectorizer.
# The vocabulary is fitted on the training split only and then reused for the
# test split, so no information from the test messages reaches the model.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the Logistic Regression model (library default hyper-parameters)
model = LogisticRegression()
model.fit(X_train_vec, y_train)


# Make predictions on the test data
y_pred = model.predict(X_test_vec)

# Evaluate the model. Precision, recall and F1 are computed with POS_LABEL
# as the positive class; accuracy is label-agnostic.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=POS_LABEL)
recall = recall_score(y_test, y_pred, pos_label=POS_LABEL)
f1 = f1_score(y_test, y_pred, pos_label=POS_LABEL)

# Print the model performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Category    0
Message     0
dtype: int64
Accuracy: 0.9865470852017937
Precision: 1.0
Recall: 0.8993288590604027
F1 Score: 0.9469964664310955
