In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string

# Step 1: Load and Preprocess Data
# Load data from CSV file (decoded as ISO-8859-1)
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')

# Display the first few rows to understand the data
print("First five rows of the data:")
print(data.head())

# Keep only the two columns that are used ('v1' and 'v2') and rename them
# for clarity. The explicit .copy() makes `data` an independent frame, so the
# column assignments further down modify it directly rather than a subset of
# the originally loaded frame (which can raise pandas' SettingWithCopyWarning).
data = data[['v1', 'v2']].copy()
data.columns = ['label', 'text']

# Check for missing values
print("Checking for missing values:")
print(data.isnull().sum())

# Convert labels to binary values: 'ham' -> 0, 'spam' -> 1
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Check the distribution of labels
print("Distribution of labels:")
print(data['label'].value_counts())

# Step 2: Text Preprocessing
# Translation table that deletes every character in string.punctuation.
# It is built once here instead of on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text: str) -> str:
    """Normalize a message before vectorization.

    Removes every character listed in ``string.punctuation`` and converts
    the remaining text to lowercase.

    Args:
        text: The raw message text.

    Returns:
        The text with punctuation removed, in lowercase.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

# Run every message through preprocess_text and store the result in place
data['text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Step 3: Feature Engineering using TF-IDF
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 4: Train Models
# Each classifier is fitted on the training matrix and then asked to predict
# the held-out matrix. fit() returns the estimator itself, so the fitted
# model can be bound directly from the chained call.

# Naive Bayes
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

# Support Vector Machine
svc_model = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train)
y_pred_svc = svc_model.predict(X_test_tfidf)

# Step 5: Evaluate Models
def evaluate_model(model_name, y_test, y_pred):
    """Print accuracy, a per-class report and a confusion matrix for one model.

    Args:
        model_name: Name shown in the section header.
        y_test: True labels.
        y_pred: Predicted labels, scored against ``y_test``.
    """
    header = f'--- {model_name} ---'
    print(header)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy:', accuracy)

    # The report rows are displayed as 'ham' and 'spam' via target_names.
    report = classification_report(y_test, y_pred, target_names=['ham', 'spam'])
    print('Classification Report:\n', report)

    matrix = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix:\n', matrix)

    # Blank lines separate this model's output from whatever is printed next.
    print('\n')

# Print the metrics of each trained model, in training order
evaluate_model(model_name='Naive Bayes', y_test=y_test, y_pred=y_pred_nb)
evaluate_model(model_name='Logistic Regression', y_test=y_test, y_pred=y_pred_lr)
evaluate_model(model_name='Support Vector Machine', y_test=y_test, y_pred=y_pred_svc)

# Step 6: Save Predictions
# Assuming Logistic Regression performed the best, its predictions are the
# ones stored next to the test text and the true labels.
output = pd.DataFrame(
    dict(
        text=X_test,
        predicted_label=y_pred_lr,
        actual_label=y_test,
    )
)

# index=False keeps the DataFrame index out of the written file
output.to_csv('spam_predictions.csv', index=False)
print('Predictions saved to spam_predictions.csv')


First five rows of the data:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Checking for missing values:
label    0
text     0
dtype: int64
Distribution of labels:
label
0    4825
1     747
Name: count, dtype: int64
--- Naive Bayes ---
Accuracy: 0.9739910313901345
Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.81      0.89       150
