In [2]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import csr_matrix


In [3]:
# Load dataset
file_path = r'C:\Users\anand\Downloads\minorprjct\Data_set\mail_data.csv'
mail_data = pd.read_csv(file_path)

# Check for missing values
print(mail_data.isnull().sum())

# Convert 'Category' labels to 0 and 1 (ham: 0, spam: 1)
mail_data['Category'] = mail_data['Category'].map({'ham': 0, 'spam': 1})

# Split data into features and target
X = mail_data['Message']
y = mail_data['Category']

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split completed!")

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


Category    0
Message     0
dtype: int64
Data split completed!


In [4]:
print(f"Training data shape after TF-IDF: {X_train_tfidf.shape}")
print(f"Testing data shape after TF-IDF: {X_test_tfidf.shape}")

Training data shape after TF-IDF: (4457, 3000)
Testing data shape after TF-IDF: (1115, 3000)


In [5]:
# Function to preprocess sparse data using StandardScaler
def preprocess_data(data):
    scaler = StandardScaler(with_mean=False)  # Avoid dense conversion for sparse data
    scaled_data = scaler.fit_transform(data)
    return scaled_data

# Example: Scaling the TF-IDF data
X_train_scaled = preprocess_data(X_train_tfidf)
X_test_scaled = preprocess_data(X_test_tfidf)

print(f"Scaled training data shape: {X_train_scaled.shape}")


Scaled training data shape: (4457, 3000)


In [6]:
class LogisticRegressionModel:
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def initialize_parameters(self, n_features):
        self.weights = np.zeros(n_features)
        self.bias = 0

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def update_weights(self, X, y, predictions):
        dw = np.dot(X.T, (predictions - y)) / len(y)
        db = np.sum(predictions - y) / len(y)

        self.weights -= self.learning_rate * dw
        self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        predictions = self.sigmoid(linear_model)
        return predictions

    def train(self, X, y):
        n_samples, n_features = X.shape
        self.initialize_parameters(n_features)

        for _ in range(self.n_iters):
            predictions = self.predict(X)
            self.update_weights(X, y, predictions)

        return self.weights, self.bias


In [7]:
# Convert sparse matrix to dense for custom implementation
X_train_dense = X_train_scaled.toarray()
X_test_dense = X_test_scaled.toarray()

# Instantiate the model and train
model = LogisticRegressionModel(learning_rate=0.01, n_iters=1000)
model.train(X_train_dense, y_train)

# Predict on the test set
y_pred_prob = model.predict(X_test_dense)
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_prob]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Custom Logistic Regression Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))


Custom Logistic Regression Accuracy: 98.83%
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [8]:
# Train a logistic regression model using Scikit-learn
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression()
sklearn_model.fit(X_train_tfidf, y_train)

# Predict using Scikit-learn's model
y_pred_sklearn = sklearn_model.predict(X_test_tfidf)

# Evaluate the model
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Sklearn Logistic Regression Accuracy: {accuracy_sklearn * 100:.2f}%")
print(classification_report(y_test, y_pred_sklearn))


Sklearn Logistic Regression Accuracy: 97.49%
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.81      0.90       149

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

