# Spam Ham Classifier 

In [1]:
# Importing necessary libraries

import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
import pickle 

In [2]:
# Loading data: read the CSV and encode the label column as 1 for 'spam', 0 otherwise.

spam_data = pd.read_csv('spam.csv')
is_spam = spam_data['target'] == 'spam'  # boolean mask over the raw string labels
spam_data['target'] = np.where(is_spam, 1, 0)
spam_data.head(5)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [18]:
# Message text is the model input; the 0/1 `target` column is the label.
X, y = spam_data['text'], spam_data['target']

In [22]:
# Split the frame by label (1 = spam, 0 = ham) and report the size of each subset.
spam = spam_data.loc[spam_data['target'].eq(1)]
ham = spam_data.loc[spam_data['target'].eq(0)]
print(spam.shape, ham.shape)

(747, 2) (4825, 2)


In [19]:
# Extract bag-of-words count features with CountVectorizer.
cv = CountVectorizer()
# Vectorize straight from the source column rather than rebinding `X = f(X)`:
# with the self-referential form, re-running this cell would feed the already
# transformed matrix back into the vectorizer and fail.
X = cv.fit_transform(spam_data['text'])  # learn the vocabulary and build the document-term matrix

In [14]:
# Check class balance: percentage of rows labelled as spam (target == 1).
spam_count = len(spam_data[spam_data['target'] == 1])
total_count = len(spam_data['text'])
spam_count / total_count * 100

13.406317300789663

In [20]:
# Spam makes up only about 13.41% of the messages, so the dataset is imbalanced
# and the classes need rebalancing before training.
from imblearn.combine import SMOTETomek

# Resample with SMOTETomek (fixed seed for repeatability).
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias was deprecated and later removed, so it fails on recent releases.
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X, y)

In [25]:
# Sanity check: shapes of the resampled feature matrix and label vector.
(X_res.shape, y_res.shape)

((9650, 8672), (9650,))

In [35]:
from collections import Counter

# Compare per-class counts before and after resampling.
original_counts = Counter(y)
resampled_counts = Counter(y_res)
print('Original dataset shape {}'.format(original_counts))
print('Resampled dataset shape {}'.format(resampled_counts))

Original dataset shape Counter({0: 4825, 1: 747})
Resampled dataset shape Counter({0: 4825, 1: 4825})


In [26]:
# Splitting data into train and test sets: hold out 20%, fixed seed for a repeatable split.
# NOTE(review): the split is applied to X_res/y_res, i.e. after resampling, so
# resampled points can end up in the test set and inflate the score.
# Consider splitting first and resampling only the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=0,
)

In [28]:
# Train a logistic-regression classifier with 5-fold cross-validation (fixed seed).
CLF = LogisticRegressionCV(
    cv=5,
    random_state=0,
)
# `clf` is bound to whatever `fit` returns, exactly as in a one-line assignment.
clf = CLF.fit(X_train, y_train)

In [29]:
# Predict labels for the held-out test features.
pred = clf.predict(X=X_test)

In [36]:
# Persist the fitted classifier and the fitted CountVectorizer to disk.
# `with` blocks make sure each file is flushed and closed as soon as the dump
# finishes; a bare `open(...)` passed to `pickle.dump` leaves the handle open.
filename = 'nlp_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)
with open('transform.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [34]:
# Evaluate the fitted classifier on the held-out split and display the result.
score = clf.score(X=X_test, y=y_test)
score

0.961139896373057