In [169]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

In [138]:
df = pd.read_csv('spamhamdata.csv', sep='\t', header=None, names=['label', 'text'])

In [139]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [140]:
df.duplicated().sum()

403

In [141]:
df.isnull().sum()

label    0
text     0
dtype: int64

In [142]:
df = df.drop_duplicates()

In [143]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5169 non-null   object
 1   text    5169 non-null   object
dtypes: object(2)
memory usage: 121.1+ KB


In [145]:
# Feature Engineering

In [146]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [147]:
X,y = df.text, df.label
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [148]:
### Vectorizer

In [149]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Logistic Regression

In [150]:
log_reg = LogisticRegression()

In [151]:
log_reg.fit(X_train_tfidf,y_train)

In [152]:
y_pred = log_reg.predict(X_test_tfidf)

In [153]:
accuracy_score(y_test,y_pred)

0.9466357308584686

In [154]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### Decision Tree

In [155]:
dt = DecisionTreeClassifier()

In [156]:
dt.fit(X_train_tfidf,y_train)

In [157]:
y_pred = dt.predict(X_test_tfidf)

In [158]:
accuracy_score(y_test,y_pred)

0.9590100541376644

### Random Forest

In [159]:
rf = RandomForestClassifier()

In [160]:
rf.fit(X_train_tfidf,y_train)

In [161]:
y_pred = rf.predict(X_test_tfidf)

In [162]:
accuracy_score(y_test,y_pred)

0.9737045630317092

### Support Vector Machine

In [163]:
svm = SVC()

In [164]:
svm.fit(X_train_tfidf,y_train)

In [165]:
y_pred = svm.predict(X_test_tfidf)

In [166]:
accuracy_score(y_test,y_pred)

0.9698375870069605

## Predict Function

In [167]:
def predict_spam(phrase):
    phrase_vec = vectorizer.transform([phrase])
    phrase_pred = svm.predict(phrase_vec)
    if phrase_pred[0] == 0:
        return 'ham'
    else:
        return 'spam'

In [168]:
predict_spam("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's")

'spam'

In [170]:
pickle.dump(svm,open('svm.pkl','wb'))
pickle.dump(vectorizer,open('vectorizer.pkl','wb'))