In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import re
stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()

In [130]:
#Building Machine Learning Classifiers:Building a basic random forest model

In [131]:
data=pd.read_csv("SMSSpamCollection.tsv",sep='\t',header=None)

In [132]:
data.columns=["label","body"]

In [133]:
data.head()
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [134]:
# Remvoing Punctuations
#Removing stopwords
#Tokenization
#Stemming

In [135]:
def clean_data(text):
    words="".join([char for char in text if char not in string.punctuation])
    tokenize=re.split('W+',words)
    clean=" ".join([ps.stem(word) for word in tokenize if word not in stopwords])
    return clean

In [136]:
data["clean"]=data["body"].apply(lambda x:clean_data(x))

In [137]:
data.head()

Unnamed: 0,label,body,clean
0,ham,I've been searching for the right words to tha...,ive been searching for the right words to than...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...
3,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday ith ill


In [138]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [139]:
tfidf_vect=TfidfVectorizer(analyzer=clean_data)
X_tfidf=tfidf_vect.fit_transform(data["body"])

In [140]:
# Length of the feature text

In [141]:
def len_feature(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")),3)*100

In [142]:
data["body_len"]=data["body"].apply(lambda x:len(x) - x.count(""))
data["pucn_%"]=data["body"].apply(lambda x:len_feature(x))

In [143]:
data.head()

Unnamed: 0,label,body,clean,body_len,pucn_%
0,ham,I've been searching for the right words to tha...,ive been searching for the right words to than...,-1,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,-1,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,-1,4.1
3,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...,-1,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday ith ill,-1,7.1


In [144]:
X_features.head()
X_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 62 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pucn_%    5568 non-null   float64
 1   review0   5568 non-null   float64
 2   review1   5568 non-null   float64
 3   review2   5568 non-null   float64
 4   review3   5568 non-null   float64
 5   review4   5568 non-null   float64
 6   review5   5568 non-null   float64
 7   review6   5568 non-null   float64
 8   review7   5568 non-null   float64
 9   review8   5568 non-null   float64
 10  review9   5568 non-null   float64
 11  review10  5568 non-null   float64
 12  review11  5568 non-null   float64
 13  review12  5568 non-null   float64
 14  review13  5568 non-null   float64
 15  review14  5568 non-null   float64
 16  review15  5568 non-null   float64
 17  review16  5568 non-null   float64
 18  review17  5568 non-null   float64
 19  review18  5568 non-null   float64
 20  review19  5568 non-null   floa

In [145]:
#Explore RandomForest Classifier Attributes and Hyperparameters

In [146]:
from sklearn.ensemble import RandomForestClassifier

In [147]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_build_request_for_signature', '_check_feature_names', '_check_n_features', '_compute_oob_predictions', '_estimator_type', '_get_default_requests', '_get_metadata_request', '_get_oob_predictions', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_parameter_constraints', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score_and_attributes', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_params', '_validate_y_class_wei

In [148]:
print(RandomForestClassifier())

RandomForestClassifier()


In [149]:
print(dir(RandomForestClassifier))

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_build_request_for_signature', '_check_feature_names', '_check_n_features', '_compute_oob_predictions', '_estimator_type', '_get_default_requests', '_get_metadata_request', '_get_oob_predictions', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_parameter_constraints', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score_and_attributes', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_params', '_validate_y_class_wei

In [150]:
#Explore RandomForestClassifier through Cross Validation

In [151]:
from sklearn.model_selection import KFold,cross_val_score

In [152]:
rf=RandomForestClassifier(n_jobs=-1)
k_fold=KFold(n_splits=5)
cross_val_score(rf,X_features,data["label"],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.97666068, 0.98114901, 0.97755835, 0.97843666, 0.97933513])

In [153]:
X_features.head()

Unnamed: 0,pucn_%,review0,review1,review2,review3,review4,review5,review6,review7,review8,...,review51,review52,review53,review54,review55,review56,review57,review58,review59,review60
0,2.5,0.0,0.0,0.636458,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.7,0.0,0.0,0.527478,0.297254,0.294815,0.275358,0.0,0.060552,0.187905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.1,0.0,0.0,0.613328,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.2,0.0,0.0,0.624581,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.1,0.0,0.0,0.735239,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
X_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 62 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pucn_%    5568 non-null   float64
 1   review0   5568 non-null   float64
 2   review1   5568 non-null   float64
 3   review2   5568 non-null   float64
 4   review3   5568 non-null   float64
 5   review4   5568 non-null   float64
 6   review5   5568 non-null   float64
 7   review6   5568 non-null   float64
 8   review7   5568 non-null   float64
 9   review8   5568 non-null   float64
 10  review9   5568 non-null   float64
 11  review10  5568 non-null   float64
 12  review11  5568 non-null   float64
 13  review12  5568 non-null   float64
 14  review13  5568 non-null   float64
 15  review14  5568 non-null   float64
 16  review15  5568 non-null   float64
 17  review16  5568 non-null   float64
 18  review17  5568 non-null   float64
 19  review18  5568 non-null   float64
 20  review19  5568 non-null   floa

In [155]:
data["body_len"].astype(float)
X_features=pd.concat([data["pucn_%"],pd.DataFrame(X_tfidf.toarray()).add_prefix('review')],axis=1)

In [156]:
X_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 62 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pucn_%    5568 non-null   float64
 1   review0   5568 non-null   float64
 2   review1   5568 non-null   float64
 3   review2   5568 non-null   float64
 4   review3   5568 non-null   float64
 5   review4   5568 non-null   float64
 6   review5   5568 non-null   float64
 7   review6   5568 non-null   float64
 8   review7   5568 non-null   float64
 9   review8   5568 non-null   float64
 10  review9   5568 non-null   float64
 11  review10  5568 non-null   float64
 12  review11  5568 non-null   float64
 13  review12  5568 non-null   float64
 14  review13  5568 non-null   float64
 15  review14  5568 non-null   float64
 16  review15  5568 non-null   float64
 17  review16  5568 non-null   float64
 18  review17  5568 non-null   float64
 19  review18  5568 non-null   float64
 20  review19  5568 non-null   floa

In [157]:
rf=RandomForestClassifier(n_jobs=-1)
k_fold=KFold(n_splits=5)
cross_val_score(rf,X_features,data["label"],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.97666068, 0.98204668, 0.97935368, 0.98113208, 0.98113208])

In [158]:
#Explore RandomForestClassifier through Holdout set

In [159]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [160]:
X_train.shape

(3897, 62)

In [161]:
X_test.shape

(1671, 62)

In [162]:
from sklearn.ensemble import RandomForestClassifier

In [163]:
rf=RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)

In [164]:
rf_model=rf.fit(X_train,y_train)

In [165]:
sorted(zip(rf_model.feature_importances_,X_train.columns),reverse=True)[0:10]

[(0.2054760878823416, 'review3'),
 (0.09951128192126875, 'review4'),
 (0.09603409926196635, 'review11'),
 (0.08848838078296337, 'review8'),
 (0.052008806549917776, 'review6'),
 (0.05024842244911062, 'review5'),
 (0.03400322271566667, 'review10'),
 (0.03078091021173719, 'review45'),
 (0.030560770823419726, 'review7'),
 (0.02831498520396673, 'review9')]

In [166]:
y_pred=rf_model.predict(X_test)

In [167]:
precision,recall,fscore,support=score(y_test,y_pred,pos_label='spam',average='binary')

In [168]:
print('precision:{} / recall:{} / fscore:{}'.format(round(precision,3),round(recall,3),round(y_pred==y_test).sum()/len(y_pred)))

precision:0.977 / recall:0.885 / fscore:0.9802513464991023


In [169]:
# Random forest with grid search

In [170]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [171]:
X_train,X_test,y_train,y_test=train_test_split(X_features,data["label"],test_size=0.3,random_state=0)

In [177]:
def train_rf(n_est,depth):
    rf=RandomForestClassifier(n_estimators=n_est,max_depth=depth,n_jobs=-1)
    rf_model=rf.fit(X_train,y_train)
    y_pred=rf_model.predict(X_test)
    precision,recall,fscore,support=score(y_test,y_pred,pos_label='spam',average='binary')
    print('estimator: {} / depth :{} / ------precision:{} / recall :{} / Accuracy :{}'
          .format(n_est,depth,round(precision,3),round(recall,3),round((y_pred==y_test).sum()/len(y_pred),3)))

In [178]:
for n_est in [10,50,100]:
    for depth in [10,20,30,None]:
        train_rf(n_est,depth)

estimator: 10 / depth :10 / ------precision:0.968 / recall :0.86 / Accuracy :0.975
estimator: 10 / depth :20 / ------precision:0.981 / recall :0.864 / Accuracy :0.978
estimator: 10 / depth :30 / ------precision:0.981 / recall :0.872 / Accuracy :0.979
estimator: 10 / depth :None / ------precision:0.985 / recall :0.831 / Accuracy :0.974
estimator: 50 / depth :10 / ------precision:0.981 / recall :0.86 / Accuracy :0.977
estimator: 50 / depth :20 / ------precision:0.977 / recall :0.881 / Accuracy :0.98
estimator: 50 / depth :30 / ------precision:0.981 / recall :0.868 / Accuracy :0.978
estimator: 50 / depth :None / ------precision:0.982 / recall :0.881 / Accuracy :0.98
estimator: 100 / depth :10 / ------precision:0.973 / recall :0.877 / Accuracy :0.978
estimator: 100 / depth :20 / ------precision:0.977 / recall :0.881 / Accuracy :0.98
estimator: 100 / depth :30 / ------precision:0.981 / recall :0.872 / Accuracy :0.979
estimator: 100 / depth :None / ------precision:0.977 / recall :0.877 / Acc

In [182]:
# RandomForest  with GridSearchCV
from sklearn.model_selection import GridSearchCV

In [184]:
rf=RandomForestClassifier()
param={'n_estimators':[10,150,300],'max_depth':[30,60,90,None]}
gs=GridSearchCV(rf,param,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_features,data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,5.268912,0.150875,0.046386,0.000706,30.0,150,"{'max_depth': 30, 'n_estimators': 150}",0.980251,0.982944,0.979354,0.980234,0.979335,0.980424,0.001323,1
10,5.666783,0.188748,0.048689,0.002341,,150,"{'max_depth': None, 'n_estimators': 150}",0.977558,0.983842,0.979354,0.980234,0.978437,0.979885,0.002171,2
11,8.847581,0.801046,0.050864,0.001993,,300,"{'max_depth': None, 'n_estimators': 300}",0.979354,0.982944,0.979354,0.980234,0.977538,0.979885,0.001764,3
5,10.867625,0.551505,0.092189,0.005604,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.979354,0.982047,0.979354,0.980234,0.977538,0.979705,0.001463,4
8,10.657117,0.376676,0.083619,0.007102,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977558,0.982944,0.979354,0.979335,0.978437,0.979526,0.001834,5
