In [1]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the raw tab-separated dataset; later cells read its 'label' and
# 'message' columns.
df = pd.read_csv(
    'spam.tsv',
    delimiter="\t",
)

In [3]:
# Balance the two classes by undersampling: keep every spam row and draw an
# equally sized random subset of the ham rows.
spamDf = df[df['label'] == 'spam']
hamDf = df[df['label'] == 'ham']

# Seed the draw so the balanced dataset -- and every metric computed from it
# downstream -- is the same after Restart & Run All. Without a seed a
# different ham subset is selected on each run, even though the later
# train/test split is itself seeded.
# NOTE: sampling without replacement requires at least as many ham rows as
# spam rows; otherwise pandas raises a ValueError.
hamDf = hamDf.sample(n=len(spamDf), random_state=0)

# Stack both classes into one frame with a fresh 0..n-1 index.
finalDf = pd.concat([hamDf, spamDf], ignore_index=True)

print(finalDf.shape)


(1494, 4)


In [4]:
# Hold out 20% of the balanced data for evaluation. The split is seeded and
# stratified on the label column.
X_train, X_test, Y_train, Y_test = train_test_split(
    finalDf['message'],
    finalDf['label'],
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=finalDf['label'],
)

In [23]:
# Alternative pipeline (currently unused): TF-IDF features + Random Forest classifier.
# model = Pipeline([('tfidf', TfidfVectorizer()), ('model', RandomForestClassifier(n_estimators=100, n_jobs=-1))])


In [5]:
# Text-classification pipeline: TF-IDF vectorisation followed by an SVC.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('model', SVC(C=1000, gamma='auto')),
]
model = Pipeline(pipeline_steps)


In [6]:
# Fit the pipeline on the training split, then predict labels for the
# held-out messages. Pipeline.fit returns the fitted pipeline itself, so the
# two steps can be chained.
Y_pred = model.fit(X_train, Y_train).predict(X_test)


In [7]:
# Optional, more detailed diagnostics (left disabled):
# print(confusion_matrix(Y_test, Y_pred))
# print(classification_report(Y_test, Y_pred))

# Fraction of held-out messages whose predicted label matches the true label.
test_accuracy = accuracy_score(Y_test, Y_pred)
print(test_accuracy)


0.939799331103679


In [8]:
# Sanity-check the fitted pipeline on a hand-written message.
sample_message = "click the link below to claim your prize NOW! Hurry, only a few left!"
print(model.predict([sample_message]))


['spam']


In [38]:
# Persist the fitted pipeline (vectoriser + classifier) to disk so it can be
# reloaded later with joblib.load.
joblib.dump(value=model, filename="mySVCModel.pkl")

['mySVCModel.pkl']