In [133]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
import numpy as np

col_names = ['v1', 'v2']
detect = pd.read_csv("spam.csv", usecols=col_names,encoding='latin-1')

In [134]:
detect.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [135]:
detect.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [136]:
detect_v2_data = detect['v2'].copy()

In [137]:
# Predefined list of stopwords
stopwords_list = set([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was",
    "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and",
    "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
])

In [138]:
def process_v2(texts):
    texts = texts.translate(str.maketrans('', '', string.punctuation))
    texts = ' '.join([word for word in texts.split() if word.lower() not in stopwords_list])
    return texts

In [139]:
detect_v2_processed_data = detect_v2_data.apply(process_v2)

In [140]:
detect_v2_processed_data

0       Go jurong point crazy Available bugis n great ...
1                                 Ok lar Joking wif u oni
2       Free entry 2 wkly comp win FA Cup final tkts 2...
3                     U dun say early hor U c already say
4             Nah dont think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u U won å£750 Pound p...
5568                          Ì b going esplanade fr home
5569                          Pity mood Soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       Rofl true name
Name: v2, Length: 5572, dtype: object

In [141]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')


In [142]:
data_matrix = vectorizer.fit_transform(detect_v2_processed_data)
data_matrix

<5572x9379 sparse matrix of type '<class 'numpy.float64'>'
	with 47361 stored elements in Compressed Sparse Row format>

In [143]:
v2_train, v2_test, v1_train, v1_test = train_test_split(data_matrix, 
                                                        detect['v1'], test_size=0.5, random_state=30)

In [144]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

spam_logreg = LogisticRegression(solver='liblinear', penalty='l1')
spam_logreg.fit(v2_train, v1_train)
pred = spam_logreg.predict(v2_test)
accuracy_score(v1_test,pred)

0.9418521177315147

In [145]:
def stemmer(texts):
    texts = texts.split()
    words = ""
    for i in texts:
        stemmer = PorterStemmer()
        words += (stemmer.stem(i)) + " "
    return words

In [146]:
detect_v2_processed_data = detect_v2_processed_data.apply(stemmer)
vectorizer = TfidfVectorizer(input='english')
data_matrix = vectorizer.fit_transform(detect_v2_processed_data)

In [147]:
v2_train, v2_test, v1_train, v1_test = train_test_split(data_matrix, 
                                                        detect['v1'], test_size=0.5, random_state=30)

In [148]:
detect['length'] = detect['v2'].apply(len)
detect.head()

Unnamed: 0,v1,v2,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [149]:
length = detect['length'].values
new_data_matrix = np.hstack((data_matrix.todense(),length[:, None]))

In [150]:
v2_train, v2_test, v1_train, v1_test = train_test_split(data_matrix, 
                                                        detect['v1'], test_size=0.5, random_state=30)

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

spam_logreg = LogisticRegression(solver='liblinear', penalty='l1')
spam_logreg.fit(v2_train, v1_train)
pred = spam_logreg.predict(v2_test)
accuracy_score(v1_test,pred)

0.9461593682699211

In [152]:
import joblib

# Save the trained model to a file
joblib.dump(spam_logreg, 'spam_detection_model.pkl')

['spam_detection_model.pkl']

In [154]:
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

**The End**