In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('SMSSpamCollection.csv')

In [3]:
data.head()

Unnamed: 0,label,text,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,0,Go until jurong point,,,,,,,,,,,,
1,0,Ok lar... Joking wif u oni...,,,,,,,,,,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,,,,,,,,,,
3,0,U dun say so early hor... U c already then say...,,,,,,,,,,,,
4,0,Nah I dont think he goes to usf,,,,,,,,,,,,


In [4]:
data = data[['label','text']]

In [5]:
#suming all null values
data.isnull().sum()

label    0
text     4
dtype: int64

In [6]:
# dropping all null values
data.dropna(subset = ['text'],axis = 0,inplace = True) 

In [7]:
data

Unnamed: 0,label,text
0,0,Go until jurong point
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,Nah I dont think he goes to usf
...,...,...
5569,1,This is the 2nd time we have tried 2 contact u...
5570,0,Will b going to esplanade fr home
5571,0,Pity
5572,0,The guy did some bitching but I acted like id ...


In [8]:
#NLP Pipeline
#1. change all uppercase to lowercase

In [9]:
data['text'] = data['text'].str.lower()

In [10]:
#2. removing punctuations from the text

In [11]:
import string

In [12]:
punctuation = string.punctuation

In [13]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [14]:
def remove_punctuations(text):
    no_punctuation = ''.join([char for char in text if char not in punctuation])
    return no_punctuation

In [15]:
data['text'] = data['text'].apply(lambda x:remove_punctuations(x))

In [16]:
#3 tokenization: splitting sentences into words

In [17]:
import nltk
from nltk.tokenize import word_tokenize

In [18]:
data['text'] = data['text'].apply(lambda x: word_tokenize(x))

In [19]:
data['text']

0                              [go, until, jurong, point]
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, u, c, already, t...
4                [nah, i, dont, think, he, goes, to, usf]
                              ...                        
5569    [this, is, the, 2nd, time, we, have, tried, 2,...
5570            [will, b, going, to, esplanade, fr, home]
5571                                               [pity]
5572    [the, guy, did, some, bitching, but, i, acted,...
5573                     [rofl, its, true, to, its, name]
Name: text, Length: 5570, dtype: object

In [20]:
#4. removing stopwords like; a ,the, in, is etc

In [21]:
#asigning all stop words in english
import nltk
stopwords = nltk.corpus.stopwords.words('english')

In [22]:
#function to remove stopwords
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

In [23]:
#removing stopwords from the data
data['text'] = data['text'].apply(lambda x: remove_stopwords(x))

In [26]:
#5. lemmatization; coverting words into their root word

In [27]:
from nltk.stem import WordNetLemmatizer

In [28]:
wordnet=WordNetLemmatizer()

In [29]:
#funtion for lemmatizing

In [30]:
def lemmatizing(tokenized_list):
    text = [wordnet.lemmatize(word) for word in tokenized_list]
    return text

In [31]:
data['text'] = data['text'].apply(lambda x:str(lemmatizing(x)))

In [32]:
#6. vectorization; represent each word as a vector

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
vectorizer = TfidfVectorizer()

In [35]:
vectorizer.fit(data['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [36]:
#ML
#1. split into training and testing

In [37]:
from sklearn.model_selection import train_test_split #splitting data into train model and test model

In [38]:
features_train, features_test, label_train, label_test = train_test_split(data['text'], data['label'])

In [39]:
#transforming features_train and features_test into vectorize form
features_train_vect = vectorizer.transform(features_train)
features_test_vect = vectorizer.transform(features_test)

In [40]:
#classification
from sklearn.ensemble import RandomForestClassifier

In [41]:
model = RandomForestClassifier()

In [42]:
model.fit(features_train_vect,label_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [43]:
model.score(features_test_vect, label_test)

0.9655419956927495

In [58]:
#function to test a sentence with our model
def prepare_text(sentence):
    sentence = sentence.lower()
    sentence =''.join([char for char in sentence if char not in punctuation])
    sentence = [word for word in sentence if word not in stopwords]
    sentence = [str(lemmatizing(sentence))]
    return vectorizer.transform(sentence)

In [59]:
new_sentence ="hello dear customer, you have won 2000000000000"

In [60]:
prediction = model.predict(prepare_text(new_sentence))

In [61]:
if prediction[0] == 1:
    print('spam')
else:
    print('not spam')

not spam


In [62]:
#save model

In [63]:
from joblib import dump

In [64]:
dump(model, 'model.joblib')

['model.joblib']

In [65]:
#load model

In [66]:
from joblib import load

In [67]:
model = load('model.joblib')