In [17]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score
import tensorflow as tf
import spacy
from tensorflow import keras

In [6]:
data = pd.read_csv('spam.csv',encoding = 'latin1').drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4']).rename(columns={'v1':'label','v2':'text'})

In [7]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
data.label.value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [13]:
data['label'].replace({'ham': 0, 'spam': 1}, inplace=True)

In [15]:
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
data.text[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [18]:
nlp = spacy.load('en_core_web_lg')

In [57]:
stemmer = WordNetLemmatizer()

In [60]:
def preprocess(x):
    
    # x = '' if pd.isna(x) else x # apply this only you have missing values in data
    # Replacing Spacial Charactors
    #text = re.sub(r'\W',' ',x)
        
    # Replacing Other words like .,__ etc...
    text = re.sub(r'[^a-zA-Z0-9]',' ',x)
    # Replacing all sibngle Charactors
    text = re.sub(r'\s+[a-zA-Z]\s+',' ',text)
    
    # Replacing sibngle Charactors from start
    text = re.sub(r'\^[a-zA-Z]\s+',' ',text)
    
    # Replacing Multiple space into single space
    text = re.sub(r'\s+',' ',text,flags=re.I) 
    
    text = nlp(text)
    
    text = [i.lemma_ for i in text if not i.is_punct]
         
    #text = [stemmer.lemmatize(i) for i in text]
    
    text = ' '.join(text)
    
    return text

In [61]:
data.text = data.text.apply(preprocess)

In [62]:
data.head()

Unnamed: 0,label,text
0,0,go until jurong point crazy Available only in ...
1,0,ok lar joke wif oni
2,1,free entry in 2 wkly comp to win FA Cup final ...
3,0,U dun say so early hor c already then say
4,0,nah don think he go to usf he live around here...


In [63]:
x_train,x_test,y_train,y_test = train_test_split(data.text,data.label,random_state=42,test_size=0.2)

In [69]:
cv = CountVectorizer(max_features=1000,min_df=5,max_df=0.7,stop_words=stopwords.words('english'))

In [70]:
cv_x_train = cv.fit_transform(x_train)
cv_x_test = cv.transform(x_test)

In [71]:
cv_x_train = cv_x_train.toarray()
cv_x_test = cv_x_test.toarray()

In [72]:
cv_x_train.shape

(4457, 1000)

In [73]:
pd.DataFrame(cv_x_train,columns=cv.get_feature_names_out())

Unnamed: 0,00,000,02,03,04,06,0800,08000839402,08000930705,0870,...,yeah,year,yep,yes,yesterday,yet,yo,yoga,yr,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
tf = TfidfVectorizer(max_features=1000,min_df=5,max_df=0.7,stop_words=stopwords.words('english'))

In [75]:
tf_x_train = tf.fit_transform(x_train)
tf_x_test = tf.transform(x_test)

In [76]:
tf_x_train = tf_x_train.toarray()
tf_x_test = tf_x_test.toarray()

In [77]:
tf_x_train.shape

(4457, 1000)

In [78]:
pd.DataFrame(tf_x_train,columns=tf.get_feature_names_out())

Unnamed: 0,00,000,02,03,04,06,0800,08000839402,08000930705,0870,...,yeah,year,yep,yes,yesterday,yet,yo,yoga,yr,yup
0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.462654,0.0,0.0,0.0
1,0.389485,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.286809,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4453,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4454,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4455,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [79]:
nv_model = MultinomialNB()

In [80]:
nv_model.fit(cv_x_train,y_train)

In [81]:
nv_pred = nv_model.predict(cv_x_test)
nv_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [82]:
accuracy_score(y_test,nv_pred)

0.9811659192825112

In [83]:
confusion_matrix(y_test,nv_pred)

array([[956,   9],
       [ 12, 138]], dtype=int64)

In [84]:
model_1 = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(1000,)),  # Adjust the number of units as needed
    keras.layers.Dropout(0.5),  # Optional dropout layer for regularization
    keras.layers.Dense(64, activation='relu'),  # Additional layers can be added as needed
    keras.layers.Dropout(0.5),  # Optional dropout layer for regularization
    keras.layers.Dense(1, activation='sigmoid')  # Binary classification output (0 or 1)
])

In [85]:
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [86]:
model_1.fit(cv_x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x14f4452ded0>

In [95]:
model_2 = keras.Sequential([
    keras.layers.Embedding(input_dim=1000, output_dim=128, input_length=1000),
    keras.layers.GlobalAveragePooling1D(),  # Use GlobalAveragePooling1D instead of Flatten for variable-length input
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid')
])

In [96]:
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [97]:
model_2.fit(tf_x_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x14f4a208b10>