<a href="https://colab.research.google.com/github/ArthAgrawal/NLP_Concepts/blob/main/SMS_Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
messages = pd.read_csv('/content/SMSSpamCollection.txt', sep='\t',
                           names=["label", "message"])

In [4]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [7]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
corpus

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [9]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [11]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [12]:
y_pred=spam_detect_model.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score,classification_report

In [14]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9856502242152466


In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       953
        True       0.96      0.94      0.95       162

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [39]:
def predict_spam(message):
    # Preprocess the new message
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)

    # Convert the preprocessed message into a format that the model can understand
    new_message = cv.transform([review]).toarray()

    # Make the prediction
    prediction = spam_detect_model.predict(new_message)

    # Return the prediction (0 for not spam, 1 for spam)
    return prediction[0]

# Example usage
message = "Congratulations! You've won a free vacation. Click here to claim your prize."
message2 = "Hello, youve just won the lottery"
prediction = predict_spam(message)
if prediction == 1:
    print("The message is classified as spam.")
else:
    print("The message is not classified as spam.")


The message is classified as spam.


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [19]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [20]:
y_pred=spam_detect_model.predict(X_test)

In [21]:
score=accuracy_score(y_test,y_pred)
print(score)

0.979372197309417


In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       978
        True       0.86      1.00      0.92       137

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [46]:
def predict_spam(message):
    # Preprocess the new message
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)

    # Convert the preprocessed message into a format that the model can understand
    new_message = tv.transform([review]).toarray()

    # Make the prediction
    prediction = spam_detect_model.predict(new_message)

    # Return the prediction (0 for not spam, 1 for spam)
    return prediction[0]

# Example usage
message1 = "Congratulations! You've won a free vacation. Click here to claim your prize."
message2 = "Hello, you've just won the lottery for a million dollars free vacation prize"

prediction1 = predict_spam(message1)
prediction2 = predict_spam(message2)

if prediction1 == 1:
    print("The message is classified as spam.")
else:
    print("The message is not classified as spam.")

if prediction2 == 1:
    print("The message is classified as spam.")
else:
    print("The message is not classified as spam.")

The message is classified as spam.
The message is not classified as spam.


In [23]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [24]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [25]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [26]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [44]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [27]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [28]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [None]:
words

In [29]:
import gensim

In [30]:
model=gensim.models.Word2Vec(words,window=5,min_count=2)

In [None]:
model.wv.index_to_key

In [52]:
model.corpus_count

5564

In [32]:
model.train(words, total_examples=model.corpus_count, epochs=5)



(202846, 237810)

In [35]:
model.wv.similar_by_word('dollar')

[('decimal', 0.9775878190994263),
 ('transfered', 0.9736889600753784),
 ('murdered', 0.9733108282089233),
 ('gt', 0.9725741147994995),
 ('division', 0.9712346196174622),
 ('lt', 0.9690624475479126),
 ('beneficiary', 0.9688500165939331),
 ('disconnect', 0.9546566605567932),
 ('wifi', 0.9521756768226624),
 ('credited', 0.9512760043144226)]

In [49]:
import numpy as np
from sklearn.linear_model import LogisticRegression
def message_to_vec(message):
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]

    # Aggregate word vectors
    vec = np.zeros(model.vector_size)
    count = 0
    for word in review:
        if word in model.wv.index_to_key:
            vec += model.wv[word]
            count += 1
    if count > 0:
        vec /= count
    return vec

# Transform the entire corpus to vector representations
X = np.array([message_to_vec(message) for message in corpus])
y = pd.get_dummies(messages['label'])
y = y.iloc[:, 1].values

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Train Naive Bayes model (or any other suitable classifier)
spam_detect_model = LogisticRegression().fit(X_train, y_train)
y_pred = spam_detect_model.predict(X_test)

# Function to predict if a message is spam
def predict_spam(message):
    vec = message_to_vec(message)
    vec = vec.reshape(1, -1)
    prediction = spam_detect_model.predict(vec)
    return prediction[0]

# Example usage
message1 = "Congratulations! You've won a free vacation. Click here to claim your prize."
message2 = "Hello, you've just won the lottery"

prediction1 = predict_spam(message1)
prediction2 = predict_spam(message2)

if prediction1 == 1:
    print("The message is classified as spam.")
else:
    print("The message is not classified as spam.")

if prediction2 == 1:
    print("The message is classified as spam.")
else:
    print("The message is not classified as spam.")

The message is classified as spam.
The message is not classified as spam.
