In [13]:
import pandas as pd
import numpy as np
import re

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [17]:
from sklearn.metrics import accuracy_score, classification_report

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
data = pd.read_csv("data/spam.csv", encoding='latin-1')

In [24]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [25]:
data = data[['v1', 'v2']]
data.columns = ['label', 'text']

In [26]:
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [30]:
le = LabelEncoder()
data['class'] = le.fit_transform(data['label'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['class'] = le.fit_transform(data['label'])


In [31]:
data

Unnamed: 0,label,text,class
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [32]:
df = data[['text', 'class']]

In [33]:
df

Unnamed: 0,text,class
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will Ì_ b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [34]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.2, random_state=43)

In [35]:
# Text Preprocessing with Stemming

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [36]:
# Custom Tokenizer Function

def stem_tokenizer(text):

    text = text.lower()
    text = re.sub(r'[^a-z]', ' ', text)
    tokens = text.split()

    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    return tokens

In [42]:
tfidf = TfidfVectorizer(tokenizer=stem_tokenizer, lowercase=False, token_pattern=None)

In [43]:
pipeline_nb = Pipeline([('tfidf', tfidf), ('clf', MultinomialNB())])
param_grid_nb = {'tfidf__max_df':[0.7, 0.9], 'tfidf__ngram_range':[(1,1), (1,2)], 'clf__alpha':[0.1, 0.5, 1.0]}
grid_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv = 5, scoring='accuracy', n_jobs=1)

In [44]:
pipeline_lr = Pipeline([('tfidf', tfidf), ('clf', LogisticRegression(max_iter=1000))])
param_grid_lr = {'tfidf__max_df': [0.7, 0.9], 'tfidf__ngram_range': [(1,1), (1,2)], 'clf__C': [0.1, 1, 10]}
grid_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv = 5, scoring='accuracy', n_jobs=1)

In [45]:
pipeline_rf = Pipeline([('tfidf', tfidf), ('clf', RandomForestClassifier(random_state=42))])
param_grid_rf = {'tfidf__max_df': [0.7, 0.9], 'clf__n_estimators': [100, 200], 'clf__max_depth': [None, 20]}
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv = 5, scoring='accuracy', n_jobs=1)

In [46]:
grid_nb.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...inomialNB())])
,param_grid,"{'clf__alpha': [0.1, 0.5, ...], 'tfidf__max_df': [0.7, 0.9], 'tfidf__ngram_range': [(1, ...), (1, ...)]}"
,scoring,'accuracy'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function ste...001ABF27452D0>
,analyzer,'word'
,stop_words,
,token_pattern,

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,True
,class_prior,


In [47]:
grid_lr.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step..._iter=1000))])
,param_grid,"{'clf__C': [0.1, 1, ...], 'tfidf__max_df': [0.7, 0.9], 'tfidf__ngram_range': [(1, ...), (1, ...)]}"
,scoring,'accuracy'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function ste...001ABF27452D0>
,analyzer,'word'
,stop_words,
,token_pattern,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [48]:
grid_rf.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'clf__max_depth': [None, 20], 'clf__n_estimators': [100, 200], 'tfidf__max_df': [0.7, 0.9]}"
,scoring,'accuracy'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function ste...001ABF27452D0>
,analyzer,'word'
,stop_words,
,token_pattern,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [49]:
models = {
    "Naive Bayes": grid_nb,
    "Logistic Regression": grid_lr,
    "Random Forest": grid_rf
}

for name, model in models.items():
    print(f"\n{name}")
    print("Best CV Accuracy:", model.best_score_)
    print("Best Parameters:", model.best_params_)



Naive Bayes
Best CV Accuracy: 0.984744555671312
Best Parameters: {'clf__alpha': 0.1, 'tfidf__max_df': 0.7, 'tfidf__ngram_range': (1, 2)}

Logistic Regression
Best CV Accuracy: 0.9825011449824604
Best Parameters: {'clf__C': 10, 'tfidf__max_df': 0.7, 'tfidf__ngram_range': (1, 1)}

Random Forest
Best CV Accuracy: 0.9768919891490893
Best Parameters: {'clf__max_depth': None, 'clf__n_estimators': 100, 'tfidf__max_df': 0.7}


In [50]:
best_model = max(models.values(), key=lambda m: m.best_score_)
print("\nBest Overall Model:", type(best_model.best_estimator_['clf']).__name__)


Best Overall Model: MultinomialNB


In [51]:
final_model = best_model.best_estimator_

In [52]:
final_model.predict(["Congratulations! You have won a free ticket"])

array([0])

In [55]:
final_model.predict(["how are you"])

array([0])

In [56]:
X_test

2347                       But i dint slept in afternoon.
676     Maybe?! Say hi to  and find out if  got his ca...
143            I know you are. Can you pls open the back?
1077                         Yep, by the pretty sculpture
4908                             Will do. Have a good day
                              ...                        
3714    I am late,so call you tomorrow morning.take ca...
3435    If india win or level series means this is rec...
741     Do well :)all will for little time. Thing of g...
2753    Derp. Which is worse, a dude who always wants ...
1175    Horrible u eat macs eat until u forgot abt me ...
Name: text, Length: 1115, dtype: object

In [57]:
y_test

2347    0
676     0
143     0
1077    0
4908    0
       ..
3714    0
3435    0
741     0
2753    0
1175    0
Name: class, Length: 1115, dtype: int32

In [59]:
y_test.value_counts()

class
0    960
1    155
Name: count, dtype: int64

In [60]:
X_test[y_test==1]

708     To review and KEEP the fantastic Nokia N-Gage ...
5466    http//tms. widelive.com/index. wml?id=820554ad...
1059    EASTENDERS TV Quiz. What FLOWER does DOT compa...
1216    You have 1 new voicemail. Please call 08719181...
4100    GSOH? Good with SPAM the ladies?U could b a ma...
                              ...                        
1779    Loan for any purpose å£500 - å£75,000. Homeown...
659     88800 and 89034 are premium phone services cal...
2641    You are guaranteed the latest Nokia Phone, a 4...
356     Congratulations ur awarded 500 of CD vouchers ...
1457    CLAIRE here am havin borin time & am now alone...
Name: text, Length: 155, dtype: object

In [61]:
final_model.predict(['Loan for any purpose å£500 - å£75,000'])

array([1])