In [1]:
#importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the labelled messages (columns: Category, Message) from the CSV file.
df = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [3]:
# Print the raw frame; pandas elides the middle rows of a long frame.
print(df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
# Keep every non-null cell as is and replace missing cells with an empty string.
data = df.where(df.notnull(), other='')

In [5]:
# Preview the first rows of the null-filled frame.
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Summarise the columns: dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# (number of rows, number of columns) of the frame.
data.shape

(5572, 2)

In [8]:
# Encode the text labels in place: 'spam' becomes 0 and 'ham' becomes 1.
data.loc[data['Category'].eq('spam'), 'Category'] = 0
data.loc[data['Category'].eq('ham'), 'Category'] = 1

In [9]:
# X holds the message text, Y the encoded label (0 = spam, 1 = ham).
X, Y = data['Message'], data['Category']

In [10]:
# Show the message texts that will be used as model input.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [11]:
# Show the encoded labels (0 = spam, 1 = ham); the column is still object dtype here.
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [12]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [13]:
# Sizes of the full, training and test message sets, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [14]:
# Sizes of the full, training and test label sets, one per line.
print(Y.shape, Y_train.shape, Y_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


In [15]:
# TF-IDF vectoriser: lower-cases the text and drops English stop words, i.e.
# words that can safely be ignored without changing the meaning of a sentence
# (e.g. "the", "have", "an").
feature_extraction = TfidfVectorizer(lowercase=True, stop_words='english', min_df=1)

# Fit the vocabulary on the training messages only, then reuse that same
# vocabulary to encode the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels are stored with object dtype; cast both splits to integers.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [16]:
# Show the training messages (row order reflects the shuffled split).
print(X_train)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object


In [17]:
# Show the sparse TF-IDF matrix: (row, vocabulary index) coordinates and their weights.
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34775 stored elements and shape (4457, 7431)>
  Coords	Values
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24

In [18]:
# Logistic-regression classifier created with its default hyperparameters.
model = LogisticRegression()

In [19]:
# Fit the logistic-regression model on the TF-IDF training features and labels.
model.fit(X_train_features, Y_train)

In [20]:
# Predict on both splits first ...
prediction_on_training_data = model.predict(X_train_features)
prediction_on_test_data = model.predict(X_test_features)

# ... then score each set of predictions against its true labels.
accuracy_on_training_data = accuracy_score(y_true=Y_train, y_pred=prediction_on_training_data)
accuracy_on_test_data = accuracy_score(y_true=Y_test, y_pred=prediction_on_test_data)

In [21]:
# Report both accuracies as percentages with two decimals.
print(f"Accuracy on Training Data: {accuracy_on_training_data * 100:.2f}%")
print(f"Accuracy on Test Data: {accuracy_on_test_data * 100:.2f}%")

Accuracy on Training Data: 96.77%
Accuracy on Test Data: 96.68%


In [22]:
# Try the trained model on a hand-written message.
input_your_mail = [
    "This is the 2nd time we have tried to contact you. You have won the Rs 200000 prize. To claim is easy, just call"
]

# Encode the message with the vocabulary learned from the training data.
input_data_features = feature_extraction.transform(input_your_mail)

# Predict the encoded label for the message.
prediction = model.predict(input_data_features)

# Label 1 is ham; anything else is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Spam mail


In [None]:
#Exploring the Dataset

In [None]:
# Class balance: how many messages carry each encoded label (0 = spam, 1 = ham).
category_counts = data['Category'].value_counts()
print(category_counts)

# Character length of every message, then the mean length per label.
data['Message_Length'] = data['Message'].map(len)
avg_length = data.groupby('Category')['Message_Length'].mean()
print(avg_length)

In [None]:
#Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Bar plot for category distribution.
# value_counts() orders its rows by frequency, so tick labels and colours are
# looked up from each encoded label (0 = spam, 1 = ham) instead of being
# assumed from the bar position.
label_names = {0: 'Spam', 1: 'Ham'}
label_colors = {0: 'red', 1: 'green'}
category_counts.plot(
    kind='bar',
    color=[label_colors.get(label, 'gray') for label in category_counts.index],
)
plt.title('Spam vs Ham Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(
    ticks=list(range(len(category_counts))),
    labels=[label_names.get(label, str(label)) for label in category_counts.index],
    rotation=0,
)
plt.show()

# Boxplot of message length for each encoded category.
sns.boxplot(x='Category', y='Message_Length', data=data)
plt.title('Message Length by Category')
plt.show()

In [None]:
#Keyword Analysis 
#Identifying the most frequently used words in spam vs. ham messages using collections.Counter

In [None]:
from collections import Counter

def common_words(messages, top_n=10):
    """Return the ``top_n`` most frequent whitespace-separated tokens.

    All messages are joined with single spaces and split on whitespace, so
    tokens are counted exactly as written (case and punctuation are kept).

    Args:
        messages: iterable of strings.
        top_n: number of (token, count) pairs to return.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first.
    """
    tokens = ' '.join(messages).split()
    tally = Counter()
    tally.update(tokens)
    return tally.most_common(top_n)

# Most frequent tokens per class (0 = spam, 1 = ham).
spam_words = common_words(data.loc[data['Category'] == 0, 'Message'])
ham_words = common_words(data.loc[data['Category'] == 1, 'Message'])

print("Top words in Spam:", spam_words)
print("Top words in Ham:", ham_words)

In [None]:
from wordcloud import WordCloud

# One long string per class (0 = spam, 1 = ham) to feed the word clouds.
spam_text = ' '.join(data.loc[data['Category'] == 0, 'Message'])
ham_text = ' '.join(data.loc[data['Category'] == 1, 'Message'])

spam_wordcloud = WordCloud(background_color='red', width=800, height=400).generate(spam_text)
ham_wordcloud = WordCloud(background_color='green', width=800, height=400).generate(ham_text)

# Render the spam cloud first, then the ham cloud, each as its own figure.
for cloud_image, cloud_title in (
    (spam_wordcloud, "Spam WordCloud"),
    (ham_wordcloud, "Ham WordCloud"),
):
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.title(cloud_title)
    plt.show()

In [None]:
#Analyzing Message Length Distribution
#Examine how message length varies across spam and ham. Longer messages could have a pattern worth analyzing.

In [None]:
# Plot histogram of message lengths by category
import matplotlib.pyplot as plt
import seaborn as sns

# Overlaid message-length histograms (30 bins, with KDE curves), one per category.
sns.histplot(
    data=data,
    x='Message_Length',
    hue='Category',
    palette='coolwarm',
    kde=True,
    bins=30,
)
plt.xlabel('Message Length')
plt.ylabel('Frequency')
plt.title('Message Length Distribution by Category')
plt.show()

In [None]:
#Character-Based Features
#Calculate and compare features like:
#Number of uppercase letters (often used in spam for emphasis).
#Special character count (e.g., $, !, #).

In [None]:
# Per-message character features: upper-case letters and a fixed set of symbols.
data['Uppercase_Count'] = data['Message'].apply(
    lambda text: sum(ch.isupper() for ch in text)
)
data['Special_Char_Count'] = data['Message'].apply(
    lambda text: sum(ch in '!@#$%^&*' for ch in text)
)

# Mean of each feature per category (0 = spam, 1 = ham).
print(data.groupby('Category')[['Uppercase_Count', 'Special_Char_Count']].mean())


In [None]:
# One boxplot per character-based feature, split by category.
for feature_column, plot_title in (
    ('Uppercase_Count', 'Uppercase Letter Count by Category'),
    ('Special_Char_Count', 'Special Character Count by Category'),
):
    sns.boxplot(x='Category', y=feature_column, data=data)
    plt.title(plot_title)
    plt.show()


In [None]:
#Model Evaluation Enhancements

In [None]:
#a. Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix on the test split; rows/columns follow label order 0 (spam), 1 (ham).
cm = confusion_matrix(y_true=Y_test, y_pred=prediction_on_test_data)
disp = ConfusionMatrixDisplay(display_labels=['Spam', 'Ham'], confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()


In [None]:
#b. Precision, Recall, F1-Score

In [None]:
from sklearn.metrics import classification_report

# Precision, recall and F1 per class on the test split (label 0 = Spam, 1 = Ham).
report = classification_report(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
    target_names=['Spam', 'Ham'],
)
print(report)


In [None]:
#Feature Importance Analysis
#Identifying which features (words) contribute the most to spam classification using the coefficients from the Logistic Regression model.

In [None]:
# Get feature importance from the model.
# For a binary problem coef_[0] refers to the positive class, which here is
# label 1 (ham) because labels were encoded spam = 0, ham = 1. A large positive
# coefficient therefore pushes a message towards ham, and a large negative
# coefficient pushes it towards spam.
feature_names = feature_extraction.get_feature_names_out()
coefficients = model.coef_[0]

# Combine feature names and their signed coefficients, highest first.
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': coefficients})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Spam indicators are the most negative coefficients (end of the sorted frame,
# reversed so the strongest comes first); ham indicators are the most positive.
print("Top 10 Spam Indicators:\n", feature_importance.tail(10).iloc[::-1])
print("Top 10 Ham Indicators:\n", feature_importance.head(10))


In [None]:
# Bar plot for top spam indicators.
# Labels are encoded spam = 0 / ham = 1, so the model's coefficients point
# towards ham when positive. The words that indicate spam are therefore the ones
# with the most negative coefficients, so sort ascending before taking ten.
top_spam_features = feature_importance.sort_values(by='Importance', ascending=True).head(10)
top_spam_features.plot(kind='bar', x='Feature', y='Importance', legend=False, color='red')
plt.title('Top Words Indicating Spam')
plt.ylabel('Coefficient (more negative = stronger spam signal)')
plt.show()


In [None]:
#Experimenting with Other Models

In [None]:
#a. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest.
# The forest is built with randomness (bootstrap samples, feature sub-sampling),
# so a fixed random_state is needed for the reported accuracy to be repeatable.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_features, Y_train)

# Evaluate on the held-out test split.
rf_prediction = rf_model.predict(X_test_features)
rf_accuracy = accuracy_score(Y_test, rf_prediction)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")


In [None]:
#b. Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, fitted on the TF-IDF features.
svm_model = SVC()
svm_model.fit(X_train_features, Y_train)

# Score its predictions on the held-out test split.
svm_prediction = svm_model.predict(X_test_features)
svm_accuracy = accuracy_score(y_true=Y_test, y_pred=svm_prediction)
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")


In [None]:
#Comparing Model Performances

In [None]:
# Collect the test accuracy of each model under a display name.
model_accuracies = {
    "Logistic Regression": accuracy_on_test_data,
    "Random Forest": rf_accuracy,
    "SVM": svm_accuracy,
}

# Print model performances.
# The loop variable is deliberately NOT called `model`: that name holds the
# fitted LogisticRegression, and rebinding it to a string would break any cell
# that is (re-)run afterwards and calls model.predict or model.coef_.
print("Model Performance Comparison:")
for model_name, accuracy in model_accuracies.items():
    print(f"{model_name}: {accuracy * 100:.2f}%")

# Bar plot for model comparison
import matplotlib.pyplot as plt

# Pass concrete lists rather than dict views to plt.bar.
plt.bar(
    list(model_accuracies.keys()),
    [v * 100 for v in model_accuracies.values()],
    color=['blue', 'green', 'orange'],
)
plt.ylabel('Accuracy (%)')
plt.title('Model Performance Comparison')
plt.show()


In [None]:
#Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid.
# gamma is only a kernel coefficient for the rbf kernel here; the linear kernel
# ignores it. Two sub-grids avoid refitting identical linear models once per
# gamma value and keep an unused gamma out of best_params_.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10]},
]

# Perform grid search with 3-fold cross-validation on the training split.
grid_search = GridSearchCV(SVC(), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_features, Y_train)

# Best parameters and their mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Score the refitted best estimator on the held-out test split.
best_svm = grid_search.best_estimator_
best_svm_accuracy = accuracy_score(Y_test, best_svm.predict(X_test_features))
print(f"Best SVM Test Accuracy: {best_svm_accuracy * 100:.2f}%")


In [None]:
#Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the tuned SVM on the training split.
scores = cross_val_score(
    estimator=best_svm,
    X=X_train_features,
    y=Y_train,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-Validation Scores: {scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%")


In [None]:
#Cluster Analysis

In [None]:
# Use only the spam messages from the dataset and transform their text into numerical vectors with TfidfVectorizer.
#Applying K-Means Clustering
#Using the KMeans algorithm to group similar spam messages into clusters.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd

# Keep only the messages labelled as spam (encoded as 0).
spam_messages = data.loc[data['Category'] == 0, 'Message']

# TF-IDF over the spam subset; the vocabulary is capped at 500 terms.
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
spam_vectors = tfidf.fit_transform(spam_messages)

# Group the spam vectors into a fixed number of clusters (tunable).
num_clusters = 5
kmeans = KMeans(random_state=42, n_clusters=num_clusters)
kmeans.fit(spam_vectors)

# Pair every spam message with the cluster it was assigned to.
spam_messages_clustered = pd.DataFrame({
    'Message': spam_messages.values,
    'Cluster': kmeans.labels_,
})

# Print up to five example messages from each cluster.
for cluster in range(num_clusters):
    print(f"\nCluster {cluster} Messages:")
    in_cluster = spam_messages_clustered['Cluster'] == cluster
    examples = spam_messages_clustered.loc[in_cluster, 'Message'].head(5)
    print(examples.to_string(index=False))


In [None]:
#Visualizing the Clusters

In [None]:
#a. Elbow Method for Optimal Clusters

In [None]:
from sklearn.metrics import silhouette_score

# Determine the optimal number of clusters by fitting K-Means for k = 2..10
# and recording the inertia and the silhouette score of each fit.
inertia = []
silhouette_scores = []
cluster_range = range(2, 11)

for k in cluster_range:
    # Use a separate name for the trial models: rebinding `kmeans` here would
    # leave the notebook with the k=10 model under that name, which no longer
    # matches num_clusters or the Cluster labels in spam_messages_clustered.
    candidate_kmeans = KMeans(n_clusters=k, random_state=42)
    candidate_kmeans.fit(spam_vectors)
    inertia.append(candidate_kmeans.inertia_)
    silhouette_scores.append(silhouette_score(spam_vectors, candidate_kmeans.labels_))

# Plot inertia and silhouette scores side by side.
plt.figure(figsize=(12, 5))

# Elbow plot
plt.subplot(1, 2, 1)
plt.plot(cluster_range, inertia, marker='o')
plt.title('Elbow Method: Inertia')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')

# Silhouette score plot
plt.subplot(1, 2, 2)
plt.plot(cluster_range, silhouette_scores, marker='o', color='red')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Score')

plt.tight_layout()
plt.show()


In [None]:
#b. Word Cloud for Clusters

In [None]:
from wordcloud import WordCloud

# Draw one word cloud per spam cluster, built from all messages in that cluster.
for cluster in range(num_clusters):
    in_cluster = spam_messages_clustered['Cluster'] == cluster
    cluster_messages = spam_messages_clustered.loc[in_cluster, 'Message']
    cluster_text = ' '.join(cluster_messages)
    wordcloud = WordCloud(background_color='white', width=800, height=400).generate(cluster_text)

    # Each cloud gets its own figure.
    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Cluster {cluster} Word Cloud")
    plt.show()
