Write Python code to implement Logistic Regression Model for Spam detection.

**CODE**

In [1]:
# Importing Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Load the data
raw_mail_data = pd.read_csv('/content/mail_data.csv', encoding='latin-1')

# Replace null values with an empty string
mail_data = raw_mail_data.where(pd.notnull(raw_mail_data), '')

# Label encoding: spam -> 0, ham -> 1
mail_data['Category'] = mail_data['Category'].map({'spam': 0, 'ham': 1})

# Drop rows where 'Category' is NaN
mail_data = mail_data.dropna(subset=['Category'])

# Separate features and labels
X = mail_data['Message']
Y = mail_data['Category']
print(X)
print(Y)
# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3, stratify=Y)

# Feature extraction using TF-IDF with enhanced preprocessing
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,  # Increase feature set
    lowercase=True,
    ngram_range=(1, 3),  # Use unigrams, bigrams, and trigrams
    sublinear_tf=True  # Apply sublinear term frequency scaling
)

X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Convert Y_train and Y_test to integers
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)

# Handle class imbalance
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(Y_train), y=Y_train)
class_weight_dict = dict(enumerate(class_weights))

# Train logistic regression with regularization and class weights
model = LogisticRegression(
    solver='liblinear',
    penalty='l2',  # L2 regularization
    class_weight=class_weight_dict,
    C=1.0  # Adjust regularization strength
)
model.fit(X_train_features, Y_train)

# Evaluate the model
# Accuracy on training data
train_predictions = model.predict(X_train_features)
training_accuracy = accuracy_score(Y_train, train_predictions)
print('Accuracy on training data:', training_accuracy)

# Accuracy on test data
test_predictions = model.predict(X_test_features)
testing_accuracy = accuracy_score(Y_test, test_predictions)
print('Accuracy on test data:', testing_accuracy)

# Predictive system
input_mail = ["Hi, can we reschedule our meeting?"]
input_data_features = vectorizer.transform(input_mail)
prediction = model.predict(input_data_features)

if prediction[0] == 1:
    print('Ham mail')
else:
    print('Spam mail')


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5583    This is the 2nd time we have tried 2 contact u...
5584                Will Ã¼ b going to esplanade fr home?
5585    Pity, * was in mood for that. So...any other s...
5586    The guy did some bitching but I acted like i'd...
5587                           Rofl. Its true to its name
Name: Message, Length: 5587, dtype: object
0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
5583    0.0
5584    1.0
5585    1.0
5586    1.0
5587    1.0
Name: Category, Length: 5587, dtype: float64
Accuracy on training data: 0.9923920340120832
Accuracy on test data: 0.9874776386404294
Ham mail
