In [36]:
import numpy as np
import matplotlib.pyplot as plt
import chardet
import pandas as pd
import io
import re

# Read in the data

In [37]:
# Read the raw bytes once; the same buffer is used for both encoding detection
# and decoding, so the file is not read from disk twice.
with open('spam.csv', 'rb') as file:
    raw_bytes = file.read()

# Detect file encoding
result = chardet.detect(raw_bytes)
# chardet reports None when it cannot guess an encoding; fall back to an explicit
# codec instead of silently depending on the platform's locale default.
encoding = result['encoding'] or 'utf-8'

# Decode with the detected encoding and handle decoding errors by substituting a
# replacement character. TextIOWrapper applies the same universal-newline
# translation that a text-mode open() would.
with io.TextIOWrapper(io.BytesIO(raw_bytes), encoding=encoding, errors='replace') as decoded:
    content = decoded.read()

# Use io.StringIO to create a file-like object from the content
file_like = io.StringIO(content)

# Only the first two columns are loaded (referenced later as 'v1' and 'v2').
alldata = pd.read_csv(file_like, usecols=[0, 1])

## Separate into features and labels

In [38]:
# Column 'v2' holds the message text (model input); column 'v1' holds the ham/spam label.
X, y = alldata['v2'], alldata['v1']

In [39]:
# Display the label column to check its values and length.
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [40]:
# Inspect one raw message, looked up by index label 5567.
X[5567]

'This is the 2nd time we have tried 2 contact u. U have won the å£750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

# Feature Extraction

In [54]:
import re

def extract_features(email):
    """Compute simple count-based features for a single message.

    Args:
        email: Text of the message.

    Returns:
        list: Seven integers, in this order:
            length         - number of characters in the message
            exclamations   - occurrences of '!'
            numbers        - digit characters (each digit counted individually)
            capitals       - uppercase characters
            special_chars  - characters that are neither word characters nor whitespace
            url_count      - http:// or https:// URLs
            email_count    - e-mail addresses
    """
    def count_matches(pattern):
        # Number of non-overlapping matches of `pattern` in the message.
        return sum(1 for _ in re.finditer(pattern, email))

    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    address_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    return [
        len(email),
        email.count('!'),
        count_matches(r'\d'),
        sum(map(str.isupper, email)),
        count_matches(r'[^\w\s]'),
        count_matches(url_pattern),
        count_matches(address_pattern),
    ]

# CountVectorizer was used without ever being imported, so this cell raised a
# NameError on a fresh kernel; import everything the cell needs here.
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer

# Extract features for all emails: one row of handcrafted counts per message.
# The rows are wrapped in a sparse matrix because hstack is documented to take
# sparse blocks, not a plain Python list of lists.
count_features = csr_matrix(np.asarray([extract_features(email) for email in X]))

# Extract common words using CountVectorizer (English stop words removed)
vectorizer = CountVectorizer(stop_words='english')
common_words = vectorizer.fit_transform(X)

# Combine count features and common words: the handcrafted columns come first,
# followed by one column per vocabulary term.
features = hstack((count_features, common_words))

# Split into train and test

In [55]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)

## Standardize features

In [56]:
from sklearn.preprocessing import StandardScaler

# Centering is switched off (with_mean=False); the features come from a sparse
# hstack, and centering would not keep them sparse.
# The scaling statistics are learned from the training split only and then
# applied to both splits.
# NOTE: X_train / X_test are overwritten, so re-running this cell on its own
# rescales data that has already been scaled.
scaler = StandardScaler(with_mean=False).fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Define Logistic Regression model

In [57]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier left at the library's default hyperparameters.
model = LogisticRegression()

# Training

In [58]:
# Fit the classifier on the scaled training features and their labels.
model.fit(X_train, y_train)

# Evaluation

In [59]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the held-out split, then report overall accuracy and the
# per-class precision / recall / F1 table.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9775784753363229
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

