In [None]:
import pandas as pd
import numpy as np
import warnings
# Ignore every warning raised from this point on.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
raw_mail_data = pd.read_csv("mail_data.csv")

In [None]:
raw_mail_data.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [None]:
raw_mail_data.isnull().sum()

In [None]:
df = raw_mail_data.where((pd.notnull(raw_mail_data)), '')

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
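# Supervised learning   -> the data comes with a target class (label) to predict
# Unsupervised learning -> no target labels, e.g. a clustering problem

# Label encoding: convert the text labels in 'Category' into integers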

In [None]:
df['Category'] = df['Category'].map({'spam': 0, 'ham': 1})

In [None]:
df.head()

In [None]:
# Alternative: label encoding with boolean masks and DataFrame.loc

In [None]:
df.loc[df['Category'] == 'spam', 'Category',] = 0
df.loc[df['Category'] == 'ham', 'Category',] = 1

In [None]:
df.head()

In [None]:
X = df['Message']
Y = df['Category']

In [None]:
X

In [None]:
Y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, random_state = 42)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
# TF-IDF vectorizer settings:
#   stop_words="english" - drop common English stop words
#   min_df=2             - ignore terms found in fewer than 2 documents
#   max_df=0.95          - ignore terms found in more than 95% of documents
#   binary=True          - count a term at most once per document
feature_extraction = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
    binary=True,
)

# Fit on the training messages only, then reuse the fitted vectorizer on the
# test messages so both matrices share the same columns.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [None]:
df.info()

In [None]:
y_train = y_train.astype("int")
y_test = y_test.astype("int")

In [None]:
X_train

In [None]:
print(X_train_features)

In [None]:
print(X_test_features)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
import matplotlib.pyplot as plt 
import seaborn as sns 
# Logistic-regression classifier with the library's default settings,
# fitted on the TF-IDF training features and the integer labels.
model = LogisticRegression()
model.fit(X=X_train_features, y=y_train)

In [None]:
prediction_train_data = model.predict(X_train_features)

# Calculate accuracy, F1 score, recall, precision, and confusion matrix
accuracy_train_data = accuracy_score(y_train, prediction_train_data)
f1 = f1_score(y_train, prediction_train_data, average='binary')  # Use 'macro' or 'weighted' if multiclass
recall = recall_score(y_train, prediction_train_data, average='binary')
precision = precision_score(y_train, prediction_train_data, average='binary')


In [None]:
print(f"Accuracy: {accuracy_train_data}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")



In [None]:
# Confusion matrix on the training data. labels=[0, 1] pins the row/column
# order to (spam, ham) and keeps the matrix 2x2 even if one class is absent.
conf_matrix = confusion_matrix(y_train, prediction_train_data, labels=[0, 1])
print(f"Confusion Matrix:\n{conf_matrix}")

# Plot the confusion matrix as a heatmap.
# The ticks name the real classes (0 = spam, 1 = ham). Generic
# "Negative"/"Positive" ticks would file spam under "Negative", which reads
# backwards for a spam filter.
class_names = ['spam', 'ham']
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False,
            xticklabels=[f'Predicted {name}' for name in class_names],
            yticklabels=[f'Actual {name}' for name in class_names])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (training data)')
plt.show()

In [None]:
prediction_test_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(y_test, prediction_test_data)

In [None]:
print("Accuarcy on test data: ", accuracy_test_data)

In [None]:
# Building predictive system

In [None]:
# Sample messages to classify; keep exactly one assignment uncommented.
# input_user_mail = ["Hello, sir my name is Sanja. I am from Nepal. I am a student of computer science. I am in 3rd year of my degree. I am looking for a job. If you are interested, please contact me. My email is 6ZQ4D@example.com"]
input_user_mail = ["You have won I phone 15 pro"]
# input_user_mail = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# Vectorize with the already-fitted TF-IDF vectorizer, then classify.
input_data_features = feature_extraction.transform(input_user_mail)
prediction = model.predict(input_data_features)

# Label 1 is reported as ham; any other label is reported as spam.
print("This is a ham mail" if prediction[0] == 1 else "This is a spam mail")

In [None]:
# import pickle
# pickle.dump(model, open("logistic_regression.pkl", "wb"))
# pickle.dump(feature_extraction, open("feature_extraction.pkl", "wb"))

In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, roc_curve, auc

# Suppress warnings: every warning raised from this point on is ignored.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

# Load and preview the dataset.
# A bare expression such as `raw_mail_data.head()` is only rendered when it is
# the last statement of a cell, so each intermediate check in this cell is
# printed explicitly; otherwise it would be evaluated and silently discarded.
raw_mail_data = pd.read_csv("mail_data.csv")
print(raw_mail_data.head())

# Check for missing values
print(raw_mail_data.isnull().sum())

# Handle missing data by replacing with empty strings
df = raw_mail_data.where(pd.notnull(raw_mail_data), '')
print(df.isnull().sum())

# Encode target variable (spam: 0, ham: 1)
df['Category'] = df['Category'].map({'spam': 0, 'ham': 1})

# Any label other than 'spam'/'ham' (including the '' used for missing values)
# maps to NaN. Stop here with a clear message instead of passing NaN labels
# on to the model.
unmapped = df['Category'].isna()
if unmapped.any():
    bad_labels = raw_mail_data.loc[unmapped, 'Category'].unique().tolist()
    raise ValueError(f"Unexpected or missing 'Category' labels: {bad_labels}")
print(df.head())

# Model input (message text) and target (encoded label)
X, Y = df['Message'], df['Category']

# 80/20 train/test split; the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorizer: English stop words removed, terms kept only if they occur
# in at least 2 documents and in at most 95% of them, binary term counts
feature_extraction = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
    binary=True,
)
# Fit on the training messages only; the test messages are transformed with
# the fitted vectorizer
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Create the logistic-regression classifier and fit it in one step
# (fit returns the estimator itself)
model = LogisticRegression().fit(X_train_features, y_train)

# Predictions on the data the model was trained on
prediction_train_data = model.predict(X_train_features)

# Training-set scores. pos_label=1 is scikit-learn's default for
# average='binary' and is spelled out here: label 1 is 'ham' in this
# notebook's encoding, so F1/recall/precision describe the ham class.
accuracy_train_data = accuracy_score(y_true=y_train, y_pred=prediction_train_data)
f1 = f1_score(y_train, prediction_train_data, pos_label=1, average='binary')
recall = recall_score(y_train, prediction_train_data, pos_label=1, average='binary')
precision = precision_score(y_train, prediction_train_data, pos_label=1, average='binary')
conf_matrix = confusion_matrix(y_true=y_train, y_pred=prediction_train_data)

# Print evaluation metrics, one per line
print(
    f"Accuracy: {accuracy_train_data}",
    f"F1 Score: {f1}",
    f"Recall: {recall}",
    f"Precision: {precision}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

# The same three scores as a two-column table
metrics_df = pd.DataFrame(
    [('F1 Score', f1), ('Recall', recall), ('Precision', precision)],
    columns=['Metric', 'Value'],
)
print("\nPerformance Metrics:\n")
print(metrics_df)

# Plot the confusion matrix as a heatmap.
# confusion_matrix orders rows and columns by sorted label, i.e. 0 (spam) then
# 1 (ham), so the ticks name the real classes. Generic "Negative"/"Positive"
# ticks would file spam under "Negative", which reads backwards for a spam filter.
class_names = ['spam', 'ham']
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', cbar=False,
            xticklabels=[f'Predicted {name}' for name in class_names],
            yticklabels=[f'Actual {name}' for name in class_names])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (training data)')
plt.show()

# ROC curve and AUC computed on the training data.
# Column 1 of predict_proba is used as the score; with labels 0/1 that is the
# probability of label 1 (ham) — column order follows model.classes_.
train_scores = model.predict_proba(X_train_features)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, train_scores)
roc_auc = auc(fpr, tpr)

# Draw the curve on an explicit Axes object
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Diagonal line (no skill)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
plt.show()

# Score the fitted model on the held-out test split
prediction_test_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(y_true=y_test, y_pred=prediction_test_data)
print(f"Accuracy on test data: {accuracy_test_data}")

# Classify one example message: vectorize it with the fitted TF-IDF
# vectorizer, then ask the model for a label
input_user_mail = ["You have won I phone 15 pro"]
input_data_features = feature_extraction.transform(input_user_mail)
prediction = model.predict(input_data_features)

# Label 1 is reported as ham; any other label is reported as spam
print("This is a ham mail" if prediction[0] == 1 else "This is a spam mail")

