## Prelims

In [1]:
import nltk

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import pandas as pd
df = pd.read_csv('spam1.csv')

In [4]:
df.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df["spam"] = df["type"].map({"spam":1, "ham":0}).astype(int)

In [6]:
df.head()

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Tokenization

In [7]:
def tokenizer(text):
    return text.split()

In [8]:
df["text"]= df["text"].apply(tokenizer)

## Stemming

In [9]:
from nltk.stem import SnowballStemmer
porter = SnowballStemmer("english",ignore_stopwords = False)

In [10]:
def stem(text):
    return [porter.stem(word) for word in text]

In [11]:
df["text"]= df["text"].apply(stem)

## Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [13]:
def lemmit(text):
    return [lemmatizer.lemmatize(word, pos='a') for word in text]

## Remove stopwords

In [14]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

In [16]:
def stop(text):
    review = [word for word in text if word not in stop_words]
    return review

In [17]:
df["text"]= df["text"].apply(stop)

In [18]:
df.head()

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0


In [19]:
df["text"] = df["text"].apply(" ".join)

In [20]:
df.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


## Vectorizer

In [21]:
import sklearn
sklearn.__version__

'0.24.1'

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfobj = TfidfVectorizer()
x = tfidfobj.fit_transform(df["text"])

In [23]:
x

<5572x8117 sparse matrix of type '<class 'numpy.float64'>'
	with 50088 stored elements in Compressed Sparse Row format>

In [24]:
y = df.spam.values

In [25]:
y

array([0, 0, 1, ..., 0, 0, 0])

## Training

In [26]:
from sklearn.model_selection import train_test_split as split

In [27]:
x_train, x_test, y_train, y_test = split(x, y, test_size = 0.33, random_state = 42)

In [28]:
x_train

<3733x8117 sparse matrix of type '<class 'numpy.float64'>'
	with 33376 stored elements in Compressed Sparse Row format>

In [29]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
from sklearn.linear_model import LogisticRegression as lgr

In [31]:
lrobj = lgr()
lrobj.fit(x_train, y_train)

LogisticRegression()

In [32]:
y_pred = lrobj.predict(x_test)

In [33]:
res = pd.DataFrame({"Actual":y_test, "Predicted":y_pred})
res

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,1,0
3,0,0
4,1,1
...,...,...
1834,0,0
1835,0,0
1836,0,0
1837,0,0


In [34]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)#,normalize=False)

In [35]:
"Accuracy Score : {}".format(score)

'Accuracy Score : 0.945078847199565'