In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud

In [None]:

# Read the raw SMS spam CSV from the Kaggle input directory.
# NOTE(review): the explicit ISO-8859-1 encoding presumably works around bytes that are not valid UTF-8 — confirm.
DATA_PATH = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

In [None]:

# Preview the first rows of the freshly loaded frame
df.head()

In [None]:
# Discard the columns at positions 2-4; the following cells work with two columns only
unused_columns = df.columns[2:5]
df = df.drop(unused_columns, axis=1)

In [None]:
# Preview the frame again to check which columns are left after the drop above
df.head()

In [None]:
# Give the two remaining columns descriptive names: the class label and the message text
df = df.set_axis(['labels', 'data'], axis=1)

In [None]:
# Visualization of spam and not spam cases

# Count the number of observations for each category
label_counts = df['labels'].value_counts()

# Define pastel colors (first one is used for spam, second one for ham)
pastel_colors = ['#66B2FF', '#FF9999']

# Draw one bar per category with a gap between the bars.
# Each count is written from the geometry of the bar it belongs to, so the number
# always matches its bar regardless of the order value_counts() returns.
fig, ax = plt.subplots()
for category, color in zip(['spam', 'ham'], pastel_colors):
    count = int(label_counts.get(category, 0))  # 0 if the category is absent
    bar = ax.bar(category, count, width=0.6, color=color, label=category.capitalize())[0]
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
            ha='center', va='bottom')

# Add labels and title
ax.set_xlabel('Labels')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Spam and Ham')

# Move the legend below the plot area
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15))

# Show the plot
plt.show()

In [None]:
# Encode the text labels as integers (ham -> 0, spam -> 1). The metric cells below
# treat class 1 as spam, and the error-analysis cells read df['binary_labels'].
binary_labels = df['labels'].map({'ham': 0, 'spam': 1})
assert binary_labels.notna().all(), "unexpected value in the 'labels' column"
df['binary_labels'] = binary_labels.astype(int)
y = df['binary_labels'].to_numpy()

# Split data into training and testing sets
df_train, df_test, y_train, y_test = train_test_split(df['data'], y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of every piece of the split, in the order the split returned them
tuple(part.shape for part in (df_train, df_test, y_train, y_test))

In [None]:
# Turn the messages into token-count features.
# The vocabulary is learned from the training messages only; the test messages
# are transformed with that same fitted vocabulary.
featurizer = CountVectorizer(decode_error='ignore')
x_train, x_test = featurizer.fit_transform(df_train), featurizer.transform(df_test)

In [None]:
# Display x_train, the count matrix returned by CountVectorizer.fit_transform above
x_train

In [None]:
# Train a Multinomial Naive Bayes classifier on the training count features
model = MultinomialNB()
model.fit(X=x_train, y=y_train)

In [None]:
# Accuracy of the fitted model on the training split and on the held-out test split
train_accuracy, test_accuracy = model.score(x_train, y_train), model.score(x_test, y_test)

In [None]:
# Hard class predictions for both splits (reused by the F1 and confusion-matrix cells)
Ptrain, Ptest = model.predict(x_train), model.predict(x_test)

In [None]:
# F1 score of the hard predictions on each split
train_f1, test_f1 = f1_score(y_train, Ptrain), f1_score(y_test, Ptest)

# Predicted probability of class 1 (spam): column 1 of predict_proba, for each split
positive_col = 1
Prob_train = model.predict_proba(x_train)[:, positive_col]
Prob_test = model.predict_proba(x_test)[:, positive_col]

# Area under the ROC curve, computed from those probabilities for each split
train_auc, test_auc = roc_auc_score(y_train, Prob_train), roc_auc_score(y_test, Prob_test)

In [None]:
# Collect the metrics into a table: one row per split, one column per metric
metrics_df = pd.DataFrame(
    [
        ['Train', train_accuracy, train_f1, train_auc],
        ['Test', test_accuracy, test_f1, test_auc],
    ],
    columns=['Train/Test', 'Accuracy', 'F1', 'AUC'],
)

In [None]:
# Display the metrics table; a bare last expression uses the notebook's rich
# DataFrame rendering instead of the plain-text output of print()
metrics_df

In [None]:
# Confusion matrix of the training predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_train, Ptrain)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
class_names = ['Not Spam', 'Spam']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", linewidths=.5, square=True, cbar=False,
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix (Training Data)')
ax.xaxis.tick_top()  # x-axis tick labels go on top
ax.xaxis.set_label_position('top')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Confusion matrix of the test predictions (rows: actual, columns: predicted)
cm_test = confusion_matrix(y_test, Ptest)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
class_names = ['Not Spam', 'Spam']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_test, annot=True, fmt="d", cmap="Blues", linewidths=.5, square=True, cbar=False,
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix (Test Data)')
ax.xaxis.tick_top()  # x-axis tick labels go on top
ax.xaxis.set_label_position('top')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Make predictions on the entire dataset and store them next to the labels;
# the error-analysis cells below read df['predictions']
x_all = featurizer.transform(df['data'])
df['predictions'] = model.predict(x_all)

# Spot-check a single message: print its text, then show the predicted class
res = [df['data'].iloc[2]]  # transform() expects an iterable of documents, hence the list
print(res)
x = featurizer.transform(res)
result = model.predict(x)
result

In [None]:
# False negatives: rows with predictions == 0 whose binary_labels value is 1,
# i.e. spam messages that were treated as not spam

missed_spam_mask = (df['predictions'] == 0) & (df['binary_labels'] == 1)
unident_spam = df.loc[missed_spam_mask, 'data']
for msg in unident_spam:
    print(msg)

In [None]:
# False positives: rows with predictions == 1 whose binary_labels value is 0,
# i.e. legitimate messages that were treated as spam

false_alarm_mask = (df['predictions'] == 1) & (df['binary_labels'] == 0)
not_spam = df.loc[false_alarm_mask, 'data']
for msg in not_spam:
    print(msg)

In [None]:
# Creating a word cloud

def visualize(label):
    """Render a word cloud built from every message of one category.

    Reads the notebook-level ``df`` frame: rows whose ``labels`` value equals
    ``label`` are selected, their ``data`` text is lower-cased and joined into
    one space-separated string, and that string is passed to
    ``WordCloud.generate``. The resulting image is shown with matplotlib.

    Args:
        label: Value of the ``labels`` column to select (the notebook calls
            this with ``'spam'`` and ``'ham'``).

    Raises:
        ValueError: If no message carries ``label``, because there is no text
            to build a cloud from.
    """
    # Missing message texts are skipped so lower-casing cannot fail on them
    messages = df.loc[df['labels'] == label, 'data'].dropna()
    if messages.empty:
        raise ValueError(f"no messages found for label {label!r}")
    # A single join avoids rebuilding an ever-growing string inside a loop
    words = ' '.join(messages.str.lower())
    wordcloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

In [None]:
# Word cloud of the messages labelled 'spam'
visualize(label='spam')

In [None]:
# Word cloud of the messages labelled 'ham' (not spam)
visualize(label='ham')