<a href="https://colab.research.google.com/github/Alhagie1/Alhagie1/blob/main/EmailSpamClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os

def encode_spam_labels(categories):
    """Convert a Series of 'spam'/'ham' labels into an int64 Series of 1/0.

    Labels are matched case-insensitively and with surrounding whitespace
    removed, so ' Spam ' is encoded the same way as 'spam'.

    Parameters
    ----------
    categories : pandas.Series
        Raw label column as read from the CSV.

    Returns
    -------
    pandas.Series
        1 for spam and 0 for ham, aligned with the input index.

    Raises
    ------
    ValueError
        If any entry is missing or is not a recognised label. A plain
        ``Series.map`` would turn such entries into NaN without warning,
        and the problem would only surface later, inside model fitting.
    """
    label_to_id = {'spam': 1, 'ham': 0}
    normalized = categories.astype(str).str.strip().str.lower()
    encoded = normalized.map(label_to_id)

    unknown = encoded.isna()
    if unknown.any():
        # repr() keeps missing values and strings sortable together and
        # makes stray whitespace visible in the message.
        examples = sorted(categories[unknown].map(repr).unique())[:5]
        raise ValueError(
            f"{int(unknown.sum())} row(s) have an unrecognised Category "
            f"(expected 'spam' or 'ham'); examples: {', '.join(examples)}"
        )
    return encoded.astype('int64')


# Check if file exists
file_path = "/content/mail_data.csv"
if os.path.exists(file_path):
    print("File exists.")

    # Load the dataset
    data = pd.read_csv(file_path)

    # Replace missing messages with empty strings before vectorizing
    data['Message'] = data['Message'].fillna('')

    # Convert labels to binary (spam=1, ham=0); fails loudly on bad labels
    data['Category'] = encode_spam_labels(data['Category'])

    # Split features and labels
    X = data['Message']
    y = data['Category']

    # Hold out 20% of the rows for testing; the fixed seed keeps the split
    # reproducible.
    # NOTE(review): the split is not stratified although the recorded label
    # counts are imbalanced (far more ham than spam) - consider stratify=y
    # and a metric beyond accuracy.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert text to TF-IDF features. The vocabulary is fitted on the
    # training messages only and then applied unchanged to the test messages.
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train logistic regression model
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)

    # Predict on test set
    y_pred = model.predict(X_test_tfidf)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.4f}")
else:
    print("File does not exist. Please upload the file using the following code:")
    print("from google.colab import files")
    print("uploaded = files.upload()")

# Count the labels for the chart.
# When the file exists, `data` has already been loaded and its Category
# column encoded as spam=1 / ham=0 earlier in this cell, so it is reused here.
# Reading the CSV a second time would only replace the cleaned frame with a
# raw copy.
if os.path.exists(file_path):
    # Tally the binary labels, then rename the index back to 'ham'/'spam'
    # so the chart gets readable category names.
    label_counts = (
        data['Category']
        .value_counts()
        .rename(index={0: 'ham', 1: 'spam'})
    )
    print(f"Label counts for chart: {label_counts.to_dict()}") # Print counts to verify

File exists.
Model Accuracy: 0.9677
Label counts for chart: {'ham': 4825, 'spam': 747}
