In [0]:
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
df = pd.read_csv("SMSSpamCollection",sep='\t',header=None)

In [0]:
df.columns = ['Category','Message']
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
#PRE PROCESSING
#TO ELIMINATE PUNCTUATIONS
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
def ClearPunc(text):
  NewMessage = "".join([c for c in text if c not in string.punctuation])
  return NewMessage

In [0]:
df['NoPuncMsg'] = df['Message'].apply(lambda x : ClearPunc(x))

In [0]:
df.head()

Unnamed: 0,Category,Message,NoPuncMsg
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


In [0]:
#TOKENZATION 
import re
def token(text):
  newtoken = re.split('\W',text)
  return newtoken

In [0]:
df['TokenData'] = df['NoPuncMsg'].apply(lambda x : token(x))

In [0]:
df.head()

Unnamed: 0,Category,Message,NoPuncMsg,TokenData
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l..."


In [0]:
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all
    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package brown to /root/nltk_data...
       |   Unzipping corpora/brown.zip.
       | Downloading package brown_tei to /root/nltk_data...
       |   Unzipping corpora/brown_tei.zip.
       | Downloading package cess_cat to /root/nltk_data...
       |   Unzipping corpora/cess_cat.zip.
       | Downloading package

True

In [0]:
#REMOVE STOP WORDS
stopwords = nltk.corpus.stopwords.words('english')

In [0]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [0]:
def stopwor(text):
  new = [word for word in text if word not in stopwords]
  return new

In [0]:
df['NoStopMsg'] = df['TokenData'].apply(lambda x : stopwor(x))

In [0]:
df.head()

Unnamed: 0,Category,Message,NoPuncMsg,TokenData,NoStopMsg
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o...","[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, goes, usf, lives, around..."


In [0]:
#STEMMING 
from nltk.stem import PorterStemmer

In [0]:
ps = PorterStemmer()

In [0]:
def port(text):
  new=[ps.stem(word) for word in text]
  return new

In [0]:
df['StemMsg'] = df['NoStopMsg'].apply(lambda x : port(x))

In [0]:
df.head()

Unnamed: 0,Category,Message,NoPuncMsg,TokenData,NoStopMsg,StemMsg
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o...","[Go, jurong, point, crazy, Available, bugis, n...","[Go, jurong, point, crazi, avail, bugi, n, gre..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]","[Ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, 2, wkly, comp, win, FA, Cup, fin...","[free, entri, 2, wkli, comp, win, FA, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, early, hor, U, c, already, say]","[U, dun, say, earli, hor, U, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, goes, usf, lives, around...","[nah, I, dont, think, goe, usf, live, around, ..."


In [0]:
#LEMMATIZATION
wd = nltk.WordNetLemmatizer()

In [0]:
def lemm(text):
  new = [ wd.lemmatize(i) for i in text]
  return new

In [0]:
df.drop('NoStopMsg',inplace=True,axis=1)

In [0]:
#df['LemmData'] = df['NoStopMsg'].apply(lambda x : lemm(x))
df['LemmData'] = df['TokenData'].apply(lambda x : lemm(x))

In [0]:
df.head()

Unnamed: 0,Category,Message,NoPuncMsg,TokenData,LemmData
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o...","[Go, until, jurong, point, crazy, Available, o..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, so, early, hor, U, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, he, go, to, usf, he, lif..."


In [0]:
#WORD2VEC ALGORITHM
import gensim.models
from gensim.models import Word2Vec

In [0]:
word2vec = Word2Vec(df['LemmData'],min_count=2,size=1)

In [0]:
def wordlem(text):
  naveen = [ word2vec.wv[word] for word in text if word in word2vec ]
  return naveen

In [0]:
def addlabel(text):
  for word in text:
    if word == 'ham':
      return 1
    else:
      return 0

In [0]:
df['Label'] = pd.get_dummies(df['Category'],drop_first=True)

In [0]:
df.head()

Unnamed: 0,Category,Message,NoPuncMsg,TokenData,LemmData,VecData,Label
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[Go, until, jurong, point, crazy, Available, o...","[Go, until, jurong, point, crazy, Available, o...","[[-1.0569474], [-2.2356384], [-1.8089591], [-0...",0
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]","[[-3.1942227], [-2.0337067], [-2.065663], [-5....",0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...","[Free, entry, in, 2, a, wkly, comp, to, win, F...","[[-2.8035989], [-1.3487732], [-5.8547773], [-6...",1
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[U, dun, say, so, early, hor, U, c, already, t...","[U, dun, say, so, early, hor, U, c, already, t...","[[-6.2960367], [-2.8825257], [-3.8846712], [-5...",0
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[Nah, I, dont, think, he, goes, to, usf, he, l...","[Nah, I, dont, think, he, go, to, usf, he, lif...","[[-0.26281384], [-5.615696], [-4.686973], [-3....",0


In [0]:
df['VecData'] = df['LemmData'].apply(lambda x : wordlem(x))

  


In [0]:
print(word2vec)

Word2Vec(vocab=5057, size=1, alpha=0.025)


In [0]:
voc = word2vec.wv.vocab

In [0]:
print(voc)



In [0]:
word2vec.wv['Go']

array([-1.0569474], dtype=float32)

In [0]:
word2vec.wv.most_similar('reward')

  if np.issubdtype(vec.dtype, np.int):


[('Hi', 0.9990946054458618),
 ('NEW', 0.9990354776382446),
 ('welcome', 0.9990329742431641),
 ('Its', 0.9990317225456238),
 ('hi', 0.9990272521972656),
 ('game', 0.9990159273147583),
 ('of', 0.9989968538284302),
 ('That', 0.9989962577819824),
 ('is', 0.9989895820617676),
 ('after', 0.9989854097366333)]

In [137]:
#POS TAGGING
def pos_tagging(text):
  for i in text:
    newtoken = re.split('\W',i)
    newtag = nltk.pos_tag(newtoken)
    #newner = nltk.ne_chunk(newtag)
    return newtoken

var ="THIS IS NAVEEN SPEAKING"
n = nltk.word_tokenize(var)
m = nltk.pos_tag(n)
r = nltk.ne_chunk(m)
print(r)

(S THIS/NNP IS/VBZ (ORGANIZATION NAVEEN/NNP) SPEAKING/NN)


In [138]:
#VECTORIZATION
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer=CleanFunc).fit(df['Message'])
tran = cv.transform(df['Message'])

#SHAPE OF SPARSE MATRIX
print(tran.shape)

#NON ZERO ACCURANCE
tran.nnz

NameError: ignored

In [0]:
#print(len(cv.vocabulary_))

In [0]:
#mess4 = df['Message'][3]

In [0]:
"""print(mess4)
bow = cv.transform([mess4])
print(bow)
print(bow.shape)"""

'print(mess4)\nbow = cv.transform([mess4])\nprint(bow)\nprint(bow.shape)'

In [0]:
#cv.get_feature_names()[6410]

In [0]:
"""X = cv.fit(df['Message'])
X = cv.transform(df['Message'])"""

"X = cv.fit(df['Message'])\nX = cv.transform(df['Message'])"

In [0]:
#print(X.shape)

In [0]:
#print(X.toarray())

In [0]:
#print(cv.get_feature_names())

In [0]:
#print(X)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import train_test_split

In [0]:
lr = LogisticRegression()

In [0]:
X = df['VecData']
y = df['Label']

In [0]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.3)

In [0]:
lr.fit(X_train,y_train)



ValueError: ignored