<a href="https://colab.research.google.com/github/AdityaKumbhar21/Natural_Language_Processing/blob/main/SpamClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS corpus; the file has no header row, so the two
# column names are supplied explicitly.
# NOTE(review): default CSV quote handling can merge lines that contain stray
# double quotes — confirm the row count against the dataset's documentation.
messages = pd.read_csv(
    '/content/SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
)
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Dataset dimensions as (rows, columns).
messages.shape

(5572, 2)

In [4]:
# Data cleaning and preprocessing: bring in the regex module and NLTK, then
# download the NLTK resources the later cells rely on (stop-word list,
# Punkt sentence-tokenizer data, WordNet for lemmatization).
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance reused for every token of the stemmed corpus.
ps = PorterStemmer()

In [6]:
# Build the stemmed corpus used by the bag-of-words and TF-IDF models.
# For each message: replace every non-alphanumeric character with a space,
# lower-case, split on whitespace, drop English stop words, Porter-stem the
# remaining tokens and re-join them into one string.
#
# The stop-word set is built once up front; rebuilding it for every token
# (as before) re-reads the word list thousands of times for no benefit.
stop_words = set(stopwords.words('english'))

corpus = []
for text in messages['message']:
  tokens = re.sub('[^a-zA-Z0-9]', ' ', text).lower().split()
  stemmed = [ps.stem(token) for token in tokens if token not in stop_words]
  corpus.append(' '.join(stemmed))

In [58]:
# Preview the first five cleaned messages.
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

In [8]:
# Bag-of-words model
# binary=True records only presence/absence of a term; max_features caps the
# vocabulary at 2500 terms. The sparse result is densified with .toarray().
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split below, so test messages influence the vocabulary —
# consider fitting on the training portion only.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True)
X = cv.fit_transform(corpus).toarray()

In [9]:
# Inspect the dense feature matrix (NumPy abbreviates the display).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# Encode the target: one-hot the label column and keep only the second
# indicator column as a NumPy array (True where that label applies).
y = pd.get_dummies(messages['label']).iloc[:, 1].values

In [11]:
# Inspect the encoded target vector.
y

array([False, False,  True, ..., False, False, False])

In [12]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Train a multinomial Naive Bayes classifier on the bag-of-words features.
from sklearn.naive_bayes import MultinomialNB
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

In [14]:
# Predict labels for the held-out test split.
y_pred = model_nb.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score, classification_report

# scikit-learn metrics take (y_true, y_pred). Pass the ground truth first so
# both calls use the same convention; accuracy is symmetric in its arguments,
# so the printed value does not change.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9865470852017937
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       955
        True       0.97      0.94      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [16]:
# TF-IDF model
# Vectorize the same corpus with TF-IDF weights, vocabulary capped at 2500
# terms. This rebinds X, replacing the bag-of-words matrix built earlier.
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [17]:
# Split the TF-IDF features with the same 80/20 ratio and random_state as
# the bag-of-words experiment.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Fit a fresh multinomial Naive Bayes model on the TF-IDF features; this
# rebinds model_nb, discarding the bag-of-words model.
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

In [19]:
# Predict labels for the held-out TF-IDF test split.
y_pred = model_nb.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, classification_report

# scikit-learn metrics take (y_true, y_pred). Pass the ground truth first so
# both calls use the same convention; accuracy is symmetric in its arguments,
# so the printed value does not change.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9847533632286996
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       955
        True       1.00      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.98      0.98      1115



#### Word2Vec & AvgWord2Vec

In [21]:
!pip install gensim

In [22]:
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance reused for every token of the Word2Vec corpus.
lemmatizer = WordNetLemmatizer()

In [23]:
# Rebuild `corpus` for the Word2Vec experiment (this replaces the stemmed
# corpus). For each message: replace every non-letter with a space,
# lower-case, split on whitespace, drop English stop words, lemmatize the
# remaining tokens and re-join them into one string.
#
# The stop words are loaded once into a set; the previous version re-read the
# list and scanned it linearly for every single token.
stop_words = set(stopwords.words('english'))

corpus = []
for text in messages['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(lemmas))

In [57]:
# Preview the first five lemmatized messages.
corpus[:5]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though']

In [26]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

# Tokenize every sentence of every cleaned message into a list of tokens.
# One entry is appended per sentence returned by sent_tokenize, so a message
# that yields no sentence contributes nothing and `words` can end up shorter
# than `corpus` — keep that in mind when pairing these rows with labels.
words = [
    simple_preprocess(sentence)
    for document in corpus
    for sentence in sent_tokenize(document)
]

In [56]:
# Preview the token lists of the first ten entries.
words[:10]

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [29]:
from gensim.models import Word2Vec
# Train Word2Vec on the token lists with a context window of 5; tokens that
# occur fewer than 2 times are excluded from the vocabulary.
# NOTE(review): no `seed` (or `workers`) is set, so the learned vectors will
# likely differ between runs — confirm whether reproducibility matters here.
model = Word2Vec(words, window=5, min_count=2)

In [38]:
# Vocabulary learned by the model, as a list of tokens.
model.wv.index_to_key

TypeError: list indices must be integers or slices, not tuple

In [32]:
# Number of token lists (sentences) the model was trained on.
model.corpus_count

5564

In [33]:
# Number of training epochs configured on the model.
model.epochs

5

In [36]:
# Vocabulary terms closest to 'happy', with their similarity scores.
model.wv.similar_by_word('happy')

[('day', 0.9994567632675171),
 ('special', 0.999413013458252),
 ('year', 0.9994052052497864),
 ('make', 0.9993621110916138),
 ('new', 0.9993387460708618),
 ('money', 0.9993364214897156),
 ('amp', 0.9993364214897156),
 ('life', 0.9993342161178589),
 ('like', 0.9993338584899902),
 ('keep', 0.9993292689323425)]

In [39]:
import numpy as np
def avg_word2vec(doc, wv=None):
    """Return the mean Word2Vec embedding of the in-vocabulary tokens of `doc`.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.
    wv : gensim KeyedVectors-like, optional
        Object providing `key_to_index`, `vector_size` and `wv[token]`.
        Defaults to the notebook-level `model.wv`.

    Returns
    -------
    numpy.ndarray
        1-D vector of length `wv.vector_size`. If no token of `doc` is in the
        vocabulary, a zero vector is returned, so every document yields a
        vector of the same shape (averaging an empty list would instead give
        a NaN scalar and a RuntimeWarning).
    """
    if wv is None:
        wv = model.wv
    # key_to_index is a dict, so membership is O(1); scanning the
    # index_to_key list costs O(vocabulary) for every token.
    vocab = wv.key_to_index
    vectors = [wv[word] for word in doc if word in vocab]
    if not vectors:
        return np.zeros(wv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)


In [53]:
# One averaged Word2Vec vector per tokenized entry, in the same order as
# `words`.
X = [avg_word2vec(tokens) for tokens in words]

In [60]:
# Peek at the first ten averaged document vectors.
X[:10]

[array([-0.09965029,  0.2405314 ,  0.01609807,  0.00749533,  0.05976058,
        -0.35290587,  0.09866989,  0.5344816 , -0.22521585, -0.16907847,
        -0.13517113, -0.31890607, -0.00599211,  0.07567509,  0.08025682,
        -0.24798854, -0.00938385, -0.3131826 ,  0.04140504, -0.51804656,
         0.15333107,  0.14894451,  0.11159205, -0.16477653, -0.1605067 ,
         0.00217226, -0.22292799, -0.14236598, -0.27193198,  0.05304903,
         0.3151811 , -0.01259398,  0.05170102, -0.13275789, -0.10945567,
         0.2633752 ,  0.05067233, -0.23324962, -0.16227466, -0.524902  ,
        -0.02445137, -0.23070551, -0.08280462,  0.01974526,  0.23087996,
        -0.07612992, -0.17764801, -0.02832966,  0.06426805,  0.14215116,
         0.14131065, -0.18566114, -0.04806152, -0.0087379 , -0.15222423,
         0.11606707,  0.18596889, -0.03257706, -0.31787124,  0.01838999,
         0.08800445,  0.17192304, -0.10727925, -0.02655538, -0.38929522,
         0.19063367,  0.14373247,  0.15379758, -0.3

In [54]:
# Check the container type of X (it was built as a plain Python list).
type(X)

list

In [55]:
# Number of document vectors; one per entry in `words`.
len(X)

5564