In [1]:
pip install nltk



In [2]:
import nltk

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
import pandas as pd

In [7]:
# Load the raw message table. The encoding is given explicitly; it is the same
# value that the chardet check in the following cell reports for this file.
dt = pd.read_csv(
    "spam.csv",
    encoding="Windows-1252",
)

In [9]:
import chardet

# Guess the file's encoding from its first 100000 bytes. The resulting dict is
# left as the cell's last expression so the notebook displays it.
with open("spam.csv", mode='rb') as rawdata:
    sample = rawdata.read(100000)
result = chardet.detect(sample)
result

{'confidence': 0.73, 'encoding': 'Windows-1252', 'language': ''}

In [10]:
dt.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Encode the string label in `type` as an integer target: spam -> 1, ham -> 0.
label_codes = {'spam': 1, 'ham': 0}
dt['spam'] = dt['type'].map(label_codes).astype(int)

In [12]:
dt.head(4)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0


In [15]:
# Number of rows in the label column.
# NOTE(review): the printed label says "REVIEW COLUMN" although the column
# being counted is `type` -- confirm the intended wording.
t = dt['type'].shape[0]
print("NO OF ROWS IN REVIEW COLUMN:", t)

NO OF ROWS IN REVIEW COLUMN: 116


In [16]:
dt['text'][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [17]:
def tokenizer(text):
  """Split a message into whitespace-delimited tokens.

  Args:
    text: the message string to split.

  Returns:
    A list of the substrings of ``text`` separated by runs of whitespace.
    Punctuation is not stripped and stays attached to its neighbouring word.
  """
  tokens = text.split()
  return tokens

In [18]:
dt['text'] = dt['text'].apply(tokenizer)

In [19]:
dt['text'][2]

['Free',
 'entry',
 'in',
 '2',
 'a',
 'wkly',
 'comp',
 'to',
 'win',
 'FA',
 'Cup',
 'final',
 'tkts',
 '21st',
 'May',
 '2005.',
 'Text',
 'FA',
 'to',
 '87121',
 'to',
 'receive',
 'entry',
 'question(std',
 'txt',
 "rate)T&C's",
 'apply',
 "08452810075over18's"]

In [21]:
from nltk.stem.snowball import SnowballStemmer

# Shared English stemmer used by stem_it. Despite the variable name, this is a
# SnowballStemmer instance, not NLTK's PorterStemmer.
porter = SnowballStemmer(language="english", ignore_stopwords=False)

In [22]:
def stem_it(text):
  """Stem every token with the notebook-level ``porter`` stemmer.

  Args:
    text: iterable of token strings.

  Returns:
    A new list holding ``porter.stem(token)`` for each token, in input order.
  """
  stemmed = []
  for token in text:
    stemmed.append(porter.stem(token))
  return stemmed

In [39]:
# Replace each row's token list with its stemmed version (see stem_it).
# NOTE(review): this overwrites dt['text'] in place, so the cell is not
# idempotent -- running it again stems tokens that were already stemmed. The
# out-of-order execution count on this cell and the token 'ear' (for "early")
# in a later dt.head() output suggest that may have happened; confirm by
# re-running the notebook on a fresh kernel.
dt['text'] = dt['text'].apply(stem_it)

In [24]:
dt['text'][2]

['free',
 'entri',
 'in',
 '2',
 'a',
 'wkli',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 'to',
 '87121',
 'to',
 'receiv',
 'entri',
 'question(std',
 'txt',
 'rate)t&c',
 'appli',
 '08452810075over18']

In [25]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [26]:
def lemmit_it(text, pos='a'):
  """Lemmatize every token with the notebook-level ``lemmatizer``.

  Args:
    text: iterable of token strings.
    pos: WordNet part-of-speech tag forwarded to ``lemmatizer.lemmatize``.
      Defaults to ``'a'`` (adjective), the tag this function previously
      hard-coded, so existing one-argument calls behave exactly as before.

  Returns:
    A new list holding the lemmatized form of each token, in input order.
  """
  return [lemmatizer.lemmatize(word, pos=pos) for word in text]

In [32]:
dt['text'] = dt['text'].apply(lemmit_it)

In [28]:
dt['text'][2]

['free',
 'entri',
 'in',
 '2',
 'a',
 'wkli',
 'comp',
 'to',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 'to',
 '87121',
 'to',
 'receiv',
 'entri',
 'question(std',
 'txt',
 'rate)t&c',
 'appli',
 '08452810075over18']

In [41]:
from nltk.corpus import stopwords

# Make sure the corpus is present before reading it: on a fresh kernel,
# stopwords.words() raises LookupError if the 'stopwords' package has not been
# downloaded yet. The call is a no-op when the data is already up to date.
nltk.download('stopwords', quiet=True)

# English stop-word list consumed by stop_it.
stop_words = stopwords.words("english")

In [40]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
def stop_it(text):
  """Drop the tokens that appear in the notebook-level ``stop_words``.

  Args:
    text: iterable of token strings.

  Returns:
    A new list with the tokens that are not in ``stop_words``, keeping their
    original order.
  """
  kept = []
  for token in text:
    if token in stop_words:
      continue
    kept.append(token)
  return kept

In [42]:
dt['text'] = dt['text'].apply(stop_it)

In [43]:
dt['text'][2]

['free',
 'entri',
 '2',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005.',
 'text',
 'fa',
 '87121',
 'receiv',
 'entri',
 'question(std',
 'txt',
 'rate)t&c',
 'appli',
 '08452810075over18']

In [44]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, bugi, n, ...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, ear, hor..., u, c, alreadi, say...]",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0


In [47]:
# Collapse each row's token list back into one space-separated string for the
# vectorizer. Rows that are already strings are left untouched, so re-running
# this cell does not break them into single characters
# (' '.join("abc") == "a b c").
dt['text'] = dt['text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target vector: the integer `spam` column as a NumPy array.
y = dt['spam'].values

# Feature matrix: TF-IDF weights computed from the preprocessed message text.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split, so its vocabulary and IDF weights also reflect the rows that are
# later used for evaluation.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(dt['text'])

In [49]:
x

<116x711 sparse matrix of type '<class 'numpy.float64'>'
	with 1077 stored elements in Compressed Sparse Row format>

In [50]:
from sklearn.model_selection import train_test_split

# Hold out the last 20% of the rows for evaluation. With shuffle=False the
# split is purely positional, so random_state has no effect here.
# The names x_text / y_text hold the *test* partition.
x_train, x_text, y_train, y_text = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=1,
)

In [52]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the training
# partition, then predict labels for the held-out partition.
clf = LogisticRegression().fit(x_train, y_train)
y_pred = clf.predict(x_text)

In [53]:
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the value is
# unchanged, but the conventional order keeps this call correct if it is later
# swapped for an asymmetric metric such as precision or recall.
acc_log = accuracy_score(y_text, y_pred) * 100
print("accuracy:", acc_log)

accuracy: 87.5
