In [None]:
#importing Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#Loading the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')

In [None]:
#Explore the dataset
print(data.head())
print(data['v1'].value_counts())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
ham     4825
spam     747
Name: v1, dtype: int64


In [None]:
#Preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
# Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    
# Expand abbreviations
    abbreviations = {
        "u": "you",
        "r": "are",
        "ur": "your",
        "n": "and",
        "plz": "please",
# Only a few abbreviations added
    }
    words = text.split()
    expanded_words = [abbreviations.get(word, word) for word in words]
    text = ' '.join(expanded_words)
    
    return text

data['v2'] = data['v2'].apply(preprocess_text)
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  go until jurong point crazy available only in ...        NaN   
1   ham                          ok lar joking wif you oni        NaN   
2  spam  free entry in a wkly comp to win fa cup final ...        NaN   
3   ham    you dun say so early hor you c already then say        NaN   
4   ham  nah i don t think he goes to usf he lives arou...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [None]:
#Split the data into training and test sets
X = data['v2']
y = data['v1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Vectorize the text data (with stop words removal)
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print(X_train.head())

1978    no i m in the same boat still here at my moms ...
3989    bank of granite issues strong buy explosive pi...
3935      they are giving a second chance to rahul dengra
4078              o i played smash bros lt gt religiously
4086    private your account statement for shows un re...
Name: v2, dtype: object


In [None]:
#Train a prediction model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [None]:
#Evaluate the model
y_pred_train = model.predict(X_train_vec)
y_pred_test = model.predict(X_test_vec)

In [None]:
#accuracy
accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy:", accuracy)

Accuracy: 0.9829596412556054


In [None]:
#classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred_test))

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.95      0.93      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [46]:
#Decison Tree

from sklearn.tree import DecisionTreeClassifier
categorical_cols = ['v1', 'v2', ...]
preprocessor = ColumnTransformer([('encoder', OneHotEncoder(), categorical_cols)],
                                 remainder='passthrough')

# Preprocess the training data
# X_train_encoded = preprocessor.fit_transform(X_train.values.reshape(-1, 1))

model_dt = DecisionTreeClassifier()

model_dt.fit(X_train,y_train)

y_pred_dt = model_dt.predict(X_test)
y_pred_dt

print("accuracy score :- ",accuracy_score(y_test,y_pred_dt))
print("precision score :- ",precision_score(y_test,y_pred_dt))

# Calculate RMSE
rmse_dt = np.sqrt(mean_squared_error(y_test, y_pred_dt))

# Calculate MSE
mse_dt = mean_squared_error(y_test, y_pred_dt)

# Calculate MAE
mae_dt = mean_absolute_error(y_test, y_pred_dt)

# Print the evaluation metrics
print("Root Mean Square Error (RMSE):", rmse_dt)
print("Mean Squared Error (MSE):", mse_dt)
print("Mean Absolute Error (MAE):", mae_dt)

ValueError: ignored

In [42]:
# Applyin Logistic Regression
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
# train the model
model_lr.fit(X_train,y_train)

y_pred_lr=model_lr.predict(X_test)
y_pred_lr

print("accuracy score :- ",accuracy_score(y_test,y_pred_lr))
print("precision score :- ",precision_score(y_test,y_pred_lr))

# Calculate RMSE
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

# Calculate MSE
mse_lr = mean_squared_error(y_test, y_pred_lr)

# Calculate MAE
mae_lr = mean_absolute_error(y_test, y_pred_lr)

# Print the evaluation metrics
print("Root Mean Square Error (RMSE):", rmse_lr)
print("Mean Squared Error (MSE):", mse_lr)
print("Mean Absolute Error (MAE):", mae_lr)

ValueError: ignored

In [47]:
#Applying Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score

from sklearn.metrics import mean_squared_error, mean_absolute_error



model_mnb=MultinomialNB()

model_mnb.fit(X_train,y_train)
# Predicting values
y_pred_mnb = model_mnb.predict(X_test)
y_pred_mnb

# Accessing Accuracy and Precision of model
print("Accuracy score :- ",accuracy_score(y_test,y_pred_mnb))
print("Precision score :- ",precision_score(y_test,y_pred_mnb))

# Calculate RMSE
rmse_mnb = np.sqrt(mean_squared_error(y_test, y_pred_mnb))

# Calculate MSE
mse_mnb = mean_squared_error(y_test, y_pred_mnb)

# Calculate MAE
mae_mnb = mean_absolute_error(y_test, y_pred_mnb)

# Print the evaluation metrics
print("Root Mean Square Error (RMSE):", rmse_mnb)
print("Mean Squared Error (MSE):", mse_mnb)
print("Mean Absolute Error (MAE):", mae_mnb)

ValueError: ignored

In [43]:
import matplotlib.pyplot as plt

# Define model names and error metrics
models = ["Monomial NB", "Logistic Regression","Decision tree"]
rmse = [rmse_mnb, rmse_lr, rmse_dt]
mae = [mae_mnb, mae_lr, mae_dt]
mse = [mse_mnb, mse_lr, mse_dt]

# Set the width of the bars
width = 0.2

# Set the positions of the bars
x1 = np.arange(len(models))
x2 = [x + width for x in x1]
x3 = [x + width for x in x2]

#plt.figure(figsize=(10, 15))

# Plot the bars
plt.bar(x1, rmse, width=width, color="#0081a7", label="RMSE")
plt.bar(x2, mae, width=width, color="#e09f3e", label="MAE")
plt.bar(x3, mse, width=width, color="#e63946", label="MSE")

# Add labels and title
plt.xlabel("Machine Learning Models")
plt.ylabel("Evalution Measures")
plt.title("Comparative Analysis Graphs")
plt.xticks(x2, models, rotation=45)
#plt.xticks(x2, models)
plt.legend(loc="upper right", bbox_to_anchor=(1.25, 1))

# Show the plot
plt.show()

NameError: ignored