In [None]:
# importing the data 

import pandas as pd

df = pd.read_csv(r"C:\Users\dhruv\Downloads\Datasets\NLP DATA\spam.csv", encoding='ISO-8859-1')
df.head()


df=df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
df.rename(columns={'v1':'target','v2':'text'},inplace=True)

# EDA
df.duplicated().sum()
df.drop_duplicates(inplace=True)

df['target'].value_counts()
# class is imbalanced

# Preprocessing the data 
df['text']=df['text'].str.lower()

# removing htlm tags
import bs4
from bs4 import BeautifulSoup

def remove_html(text):
    soup=BeautifulSoup(text,'html.parser')
    return soup.get_text(separator='',strip=True)

# remove punctuations
import string
exclude=string.punctuation

def remove_punctions(text):
    for word in exclude:
        text=text.replace(word,"")
    return text 


# lemitization tokenization urls and numbers
import spacy       

nlp=spacy.load('en_core_web_sm')

def Tokenization(text):
    cols=nlp(text)
    Tokens=[]
    for token in cols:
        if token.like_url:
            Tokens.append('url')
        elif token.like_num:
            Tokens.append('num')
        elif token.is_alpha and not  token.is_stop:
            Tokens.append(token.lemma_)
    return " ".join(Tokens)
        

def preprocess(text):
    if  not isinstance(text,str):
        return ""
     
    text=text.lower()
    text=remove_html(text)
    text=remove_punctions(text)
    return Tokenization(text)

    
df['text']=df['text'].apply(preprocess)

# vectorizing the data  

x=df['text']
y=df['target']

# convertintg first target to numeric
import sklearn
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y=encoder.fit_transform(y)

#spliting the data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=2)

# Now convertintg text to numeric
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

x_train_vec=tfidf.fit_transform(x_train)
x_test_vec=tfidf.transform(x_test)

x_train_dense=  pd.DataFrame(x_train_vec.toarray(), columns=tfidf.get_feature_names_out())
x_test_dense=  pd.DataFrame(x_test_vec.toarray(), columns=tfidf.get_feature_names_out())


# handling the imbalance in the data 
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)
xtrain,ytrain=smote.fit_resample(x_train_dense,y_train)
print("After SMOTE:")
print(pd.Series(ytrain).value_counts())
#===============================================================================================
print("="*100)
print(" ")
print('MultinomialNB:\n')
# Create MultinomialNB Model
from sklearn.naive_bayes import MultinomialNB


Model1=MultinomialNB()
Model1.fit(xtrain,ytrain)
ypred=Model1.predict(x_test_dense)

# Checking Accuracy 
from sklearn.metrics import accuracy_score,classification_report
acc=accuracy_score(ypred,y_test)
cr=classification_report(ypred,y_test)
print(f'Accuracy:{acc*100:.2f}')
print(" ")
print(f'Classification Reort:\n{cr}')

print("="*100)
print(" ")
print('RandomForestClassifier:\n')

from sklearn.ensemble import RandomForestClassifier

Model2=RandomForestClassifier()
Model2.fit(xtrain,ytrain)
ypred2=Model2.predict(x_test_dense)

# Checking Accuracy 
from sklearn.metrics import accuracy_score,classification_report
acc=accuracy_score(y_test,ypred2)
cr=classification_report(y_test,ypred2)
print(f'Accuracy:{acc*100:.2f}')
print(" ")
print(f'Classification Reort:\n{cr}')

print("="*100)
print(" ")
print('LGBMClassifier:\n')

from lightgbm import LGBMClassifier

Model3=LGBMClassifier()
Model3.fit(xtrain,ytrain)
ypred3=Model3.predict(x_test_dense)

# Checking Accuracy 
from sklearn.metrics import accuracy_score,classification_report
acc=accuracy_score(y_test,ypred3)
cr=classification_report(y_test,ypred3)
print(f'Accuracy:{acc*100:.2f}')
print(" ")
print('Classification Reort:\n')
print(cr)
print("="*100)

# saving 
import joblib
joblib.dump(Model3,'spam_classfier.joblib')
joblib.dump(tfidf,'tfidf_vec.joblib')
print('Model and vectorizer saved')

After SMOTE:
0    3620
1    3620
Name: count, dtype: int64
 
MultinomialNB:

Accuracy:94.68
 
Classification Reort:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       857
           1       0.94      0.73      0.83       177

    accuracy                           0.95      1034
   macro avg       0.94      0.86      0.90      1034
weighted avg       0.95      0.95      0.94      1034

 
RandomForestClassifier:

Accuracy:97.58
 
Classification Reort:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       896
           1       0.99      0.83      0.90       138

    accuracy                           0.98      1034
   macro avg       0.98      0.91      0.94      1034
weighted avg       0.98      0.98      0.97      1034

 
LGBMClassifier:

[LightGBM] [Info] Number of positive: 3620, number of negative: 3620
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of te

Model and vectorizer saved
