In [1]:
import pandas as pd  

In [2]:
df=pd.read_csv("smsspamcollection.csv",sep="\t",names=["label","message"])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#data cleaning and preprocessing 

In [5]:
df["message"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [6]:
# converting all to lower case 
df["message"]=df["message"].str.lower()

In [7]:
df["message"][0]

'go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...'

In [8]:
# This regex matches any character that is not a letter, number, or space
#cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

In [9]:
#Remove all special characters using re
import re 
df["message"]=df["message"].apply(lambda x:re.sub('[^a-zA-Z0-9\s]'," ",x))

In [10]:
df["message"][0]

'go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   '

In [11]:
#Removing trailing extra spaces 
df["message"]=df["message"].str.strip()

In [12]:
df["message"][0]

'go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat'

In [13]:
import nltk
from nltk.corpus import stopwords

In [14]:
# Remove URLs, emails, and mentions (@username)
df["message"]=df["message"].apply(lambda x:re.sub(r'http\S+|www\S+|@\S+', ' ',x))

In [15]:
#Removing stopwords from the sentences
stopWords=stopwords.words("english")
df["message"]=df["message"].apply(lambda x:" ".join([y for y in x.split() if y not in set(stopWords)]))

In [16]:
df["message"][0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [17]:
#applying preprocess from gensim
from gensim.utils import simple_preprocess

In [18]:
df["message"]=df["message"].apply(lambda x:" ".join(simple_preprocess(x)))

In [19]:
df["message"][0]

'go jurong point crazy available bugis great world la buffet cine got amore wat'

In [20]:
#finally we cleaned the data lets apply lemmatization 

In [21]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [22]:
#lemmatizer.lemmatize("happiest",pos="a")

In [23]:
def lemmatization(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [24]:
df["message"]=df["message"].apply(lambda x:lemmatization(x))

In [25]:
df["message"][0]

'go jurong point crazy available bugis great world la buffet cine got amore wat'

In [26]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis great wo...
1,ham,ok lar joking wif oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,dun say early hor already say
4,ham,nah think go usf life around though


In [27]:
#pd.get_dummies(df["label"])

ham is 0 and spam is 1

In [28]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
y = df['label'].values

In [29]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [30]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [31]:
#y=y.iloc[:,0].values

In [32]:

#y

In [33]:
#df["message"]

In [34]:
#Successfully cleaned the data and also applied lemmatization to the data 
#Lets do train test split

Train Test Split 

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
X_train,X_test,y_train,y_test=train_test_split(df["message"],y,test_size=0.3,random_state=42)

In [37]:
X_train.shape,X_test.shape

((3899,), (1672,))

In [38]:
y_train.shape,y_test.shape 

((3899,), (1672,))

Creating pretrained word2vec model which is trained by google

In [39]:
import gensim.downloader as api 

In [40]:
word2vec=api.load("word2vec-google-news-300")

In [41]:
word2vec.similar_by_word("boy")

[('girl', 0.8543272018432617),
 ('teenager', 0.7606689929962158),
 ('toddler', 0.7043969035148621),
 ('teenage_girl', 0.6851482391357422),
 ('man', 0.6824870109558105),
 ('teen_ager', 0.6499968767166138),
 ('son', 0.6337764263153076),
 ('kid', 0.63228440284729),
 ('youngster', 0.618381679058075),
 ('stepfather', 0.5989423394203186)]

In [42]:
word2vec["boy"].shape 

(300,)

In [43]:
df["message"].head()

0    go jurong point crazy available bugis great wo...
1                                ok lar joking wif oni
2    free entry wkly comp win fa cup final tkts st ...
3                        dun say early hor already say
4                  nah think go usf life around though
Name: message, dtype: object

In [44]:
import numpy as np

In [45]:
type(word2vec.vector_size)

int

In [46]:
#k=np.zeros(300)
#k.ndim

In [47]:
#Function for converting each sentence into a vector 
def sentence_to_vector(sentences,model):
    avg_vectors=[]
    for sentence in sentences:
        words=sentence.split()
        word_vectors=[model[word] for word in words if word in model]
        if word_vectors:
            avg_vector=np.mean(word_vectors,axis=0)
        else:
            avg_vector=np.zeros(model.vector_size)
        avg_vectors.append(avg_vector)
    return avg_vectors


In [48]:
#converting all training sentences into vectors for training model
X_train_word2vec=sentence_to_vector(X_train,model=word2vec)

In [49]:
# for each sentence we are generating a vector 
len(X_train_word2vec)

3899

In [50]:
#Converting all test data sentences into Testing vecctors for model
X_test_word2vec=sentence_to_vector(X_test,model=word2vec)

Applying machine learning algorithm

In [51]:
# Applying logistics regression 
from sklearn.linear_model import LogisticRegression
ml_model=LogisticRegression()

In [52]:
ml_model.fit(X_train_word2vec,y_train)

Making predictions

In [53]:
y_pred=ml_model.predict(X_test_word2vec)

In [54]:
#y_pred

Evaluating the model

In [55]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [56]:
print(f"Accuracy score :{accuracy_score(y_test,y_pred)*100:.2f}%")
print()
print(f"confusion matrix :\n{confusion_matrix(y_test,y_pred)}")
print()
print(f"classification report :{classification_report(y_test,y_pred)}")

Accuracy score :95.04%

confusion matrix :
[[1424   27]
 [  56  165]]

classification report :              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1451
           1       0.86      0.75      0.80       221

    accuracy                           0.95      1672
   macro avg       0.91      0.86      0.89      1672
weighted avg       0.95      0.95      0.95      1672



In [57]:
''' 
By this we can conclude that using deeplearning trained word2vec 
can inceases a lot of accuracy which is very high compared BOW,TF-IDF
'''

' \nBy this we can conclude that using deeplearning trained word2vec \ncan inceases a lot of accuracy which is very high compared BOW,TF-IDF\n'

In [79]:
spam_mail = "you won 100000 million rupees.please open the link "
notspam_mail = "In today's news scientists have discovered a new species of frog in the Amazon rainforest"
spam_vector = sentence_to_vector([spam_mail], model=word2vec)
notspam_vector = sentence_to_vector([notspam_mail], model=word2vec)

# Ensure input shapes match expected dimensions
print("Spam prediction:", ml_model.predict(spam_vector))
print("Not-spam prediction:", ml_model.predict(notspam_vector))

Spam prediction: [1]
Not-spam prediction: [0]


checking weather mail is spam or not 

In [86]:
def check_mail(model=word2vec,mlmodel=ml_model):
    text=input("Write mail to check spam or not ")
    vector=sentence_to_vector([text],model)
    result=mlmodel.predict(vector)
    
    if 1 in result:
        print("Spam Mail.....")
    else:
        print("Not Spam Mail.....")

In [89]:
check_mail()

Spam Mail.....
