In [None]:

import kagglehub
# Fetch the SMS Spam Collection dataset through kagglehub and keep whatever
# dataset_download returns (presumably the local download location, as the
# variable name suggests — TODO confirm).
dataset_handle = 'organizations/uciml/sms-spam-collection-dataset'
organizations_uciml_sms_spam_collection_dataset_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


In [None]:

import numpy as np
import pandas as pd



import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in files):
        print(full_path)



In [None]:
# Load the raw SMS dataset from the Kaggle input mount.
# latin1 is requested explicitly (presumably the file is not valid UTF-8 — TODO confirm).
spam_csv_path = '/kaggle/input/sms-spam-collection-dataset/spam.csv'
df = pd.read_csv(spam_csv_path, encoding='latin1')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

## Data Cleaning

In [None]:
# Print the column / dtype / non-null summary before any cleaning is applied.
df.info()

In [None]:
# Remove the three unnamed columns that came in with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Spot-check two random rows after dropping the unnamed columns.
df.sample(2)

In [None]:
# Give the two remaining columns descriptive names.
column_names = {'v1': 'result', 'v2': 'message'}
df = df.rename(columns=column_names)

In [None]:
# Spot-check two random rows to confirm the new column names.
df.sample(2)

In [None]:
# Prepare a LabelEncoder to turn the 'result' column values into 0 and 1
# (the encoding itself is applied in the next cell).
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [None]:
# Replace the labels in 'result' with the integer codes assigned by the encoder.
# NOTE(review): LabelEncoder numbers classes in sorted order, so presumably
# ham -> 0 and spam -> 1 — confirm against the data.
df['result']=encoder.fit_transform(df['result'])

In [None]:
# Preview the frame with the encoded 'result' column.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Drop duplicated rows, keeping the first occurrence of each.
# The original index labels are retained, so the index may have gaps afterwards.
df=df.drop_duplicates(keep='first')

In [None]:
# Re-count duplicated rows after the drop; this should now be 0.
df.duplicated().sum()

In [None]:
# Report (rows, columns) remaining after de-duplication.
df.shape

## EDA

In [None]:
# Bar chart of how many rows carry each 'result' label.
df['result'].value_counts().plot(kind='bar')

We have more ham messages than spam messages.

In [None]:
# Spot-check two random rows before adding the length features.
df.sample(2)

In [None]:
import re

In [None]:
# Per-message length features used by the EDA plots below.
df['num_characters'] = df['message'].str.len()
# split() with no argument splits on any run of whitespace, so repeated spaces,
# tabs and newlines do not create empty "words" the way split(' ') does, and an
# empty message counts as 0 words rather than 1.
df['num_words'] = df['message'].apply(lambda text: len(text.split()))
# Intermediate step: store the raw list of fragments obtained by splitting on
# '.' and '?'. A later cell replaces this column with the number of non-empty
# fragments.
df['num_sentences'] = df['message'].apply(lambda text: re.split(r'[.?]', text))

In [None]:
# Spot-check two random rows with the new feature columns.
df.sample(2)

In [None]:
# Look at the message stored under index 1 as a worked example for the sentence split.
df['message'][1]

In [None]:
# Matching split result: at this point 'num_sentences' still holds the list
# produced by re.split, not a count.
df['num_sentences'][1]

In [None]:
# Count only the non-empty fragments: filter(None, ...) discards the empty
# strings that re.split can leave around delimiters.
len(list(filter(None, df['num_sentences'][1])))

In [None]:
# Number of non-empty fragments after splitting each message on '.' and '?'.
# The count is recomputed from the raw text rather than from the intermediate
# list column, so the cell gives the same result when re-run. Deriving it from
# 'num_sentences' itself fails on a second execution because the column then
# holds ints instead of lists.
df['num_sentences'] = df['message'].apply(
    lambda text: sum(1 for fragment in re.split(r'[.?]', text) if fragment)
)

In [None]:
# Preview the frame with the finished feature columns.
df.head()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of the two classes:
# result == 0 in green, result == 1 in red.
class0_characters = df.loc[df['result'] == 0, 'num_characters']
class1_characters = df.loc[df['result'] == 1, 'num_characters']
sns.histplot(class0_characters, color='green')
sns.histplot(class1_characters, color='red')

In [None]:
# Overlay the word-count distributions of the two classes:
# result == 0 in green, result == 1 in red.
class0_words = df.loc[df['result'] == 0, 'num_words']
class1_words = df.loc[df['result'] == 1, 'num_words']
sns.histplot(class0_words, color='green')
sns.histplot(class1_words, color='red')

In [None]:
# Overlay the sentence-count distributions of the two classes:
# result == 0 in green, result == 1 in red.
class0_sentences = df.loc[df['result'] == 0, 'num_sentences']
class1_sentences = df.loc[df['result'] == 1, 'num_sentences']
sns.histplot(class0_sentences, color='green')
sns.histplot(class1_sentences, color='red')

In [None]:
# Copy of the frame without the free-text column, for the correlation matrix.
df_excluded = df.drop('message', axis='columns')

In [None]:
# Annotated heatmap of the pairwise correlations between the label and the
# length features.
correlations = df_excluded.corr()
sns.heatmap(correlations, annot=True)

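An increase in the number of characters is taken here to indicate ham. (Note: `LabelEncoder` assigns codes in sorted order, so `result` is presumably ham = 0 and spam = 1; a positive correlation between `num_characters` and `result` in the heatmap would therefore point towards spam — check the sign before relying on this.)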

## Model Building

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with the vocabulary capped at a fixed number of features.
MAX_TFIDF_FEATURES = 2500
tfid = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

In [None]:
# Fit the vectorizer on every message and materialise the result as a dense array.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split further down, so vocabulary and IDF statistics from the
# test rows leak into the training features.
X=tfid.fit_transform(df['message']).toarray()

In [None]:
# Target vector: the encoded 'result' labels as a NumPy array.
y = df['result'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for evaluation.
# random_state pins the shuffle so the metrics reported below are reproducible
# across kernel restarts (2 matches the seed used for the models in this notebook).
# stratify=y keeps the class proportions equal in both partitions, which matters
# because the two classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2, stratify=y
)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from mlxtend.plotting import plot_confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [None]:
# Candidate classifiers, each paired with the name printed in the report.
# Every estimator that accepts a random_state uses the same seed (2) so repeated
# runs are comparable. Logistic regression (liblinear) and the decision tree are
# seeded here as well, since both use randomness during fitting.
models = [[GaussianNB(), "Gaussian Naive Bayes"],
          [MultinomialNB(), "Multinomial Naive Bayes"],
          [BernoulliNB(), "Bernoulli Naive Bayes"],
          [SVC(kernel='sigmoid', gamma=1.0), "Support Vector"],
          [LogisticRegression(solver='liblinear', penalty='l1', random_state=2), "Logistic regression"],
          [RandomForestClassifier(n_estimators=50, random_state=2), "Random Forest"],
          [DecisionTreeClassifier(max_depth=5, random_state=2), "Decision Trees"],
          [XGBClassifier(n_estimators=50, random_state=2), "XGBClassifier"],
          [GradientBoostingClassifier(n_estimators=50, random_state=2), "GradientBoostingClassifier"]]

# Fit each candidate on the training split, then report accuracy, precision and
# the confusion matrix on the held-out split.
for model, name in models:
    model.fit(X_train, y_train)
    y_pred1 = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred1)
    precision = precision_score(y_test, y_pred1)

    print(f'Model ==> {name}')
    print(f'Accuracy ==> {accuracy:.2f}')
    print(f'Precision ==> {precision:.2f}')
    cnf = confusion_matrix(y_test, y_pred1)
    fig, ax = plot_confusion_matrix(conf_mat=cnf)
    plt.show()
    # Release the figure so a loop over many models does not accumulate open figures.
    plt.close(fig)
    print("\n")

## Using Multinomial Naive Bayes because its precision score is 1
### It is a specialized version of Naive Bayes that is designed more for text documents.

In [None]:
# Train a fresh Multinomial Naive Bayes model on the training split and print
# its accuracy followed by its precision on the held-out split.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred = mnb.predict(X_test)
for metric in (accuracy_score, precision_score):
    print(metric(y_test, y_pred))

In [None]:
import pickle
# Persist the fitted vectorizer and the trained model.
# The context managers guarantee each file is flushed and closed; passing a bare
# open(...) to pickle.dump leaves the handle open until garbage collection.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(tfid, vector_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)