In [32]:
import numpy as np
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [19]:
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

In [30]:
X_train['clean_text']

0       ['ok', 'u', 'can', 'take', 'me', 'shop', 'when...
1       ['tomorrow', 'i', 'am', 'not', 'go', 'to', 'th...
2       ['jason', 'say', 'it', 'cool', 'if', 'we', 'pi...
3       ['i', 'wud', 'never', 'mind', 'if', 'u', 'dont...
4       ['smile', 'in', 'pleasur', 'smile', 'in', 'pai...
                              ...                        
4452    ['it', 'a', 'site', 'to', 'simul', 'the', 'tes...
4453    ['em', 'it', 'olowoyey', 'uscedu', 'have', 'a'...
4454    ['hi', 'thi', 'is', 'roger', 'from', 'cl', 'ho...
4455    ['as', 'usualiam', 'fine', 'happi', 'amp', 'do...
4456    ['yeah', 'i', 'think', 'my', 'usual', 'guy', '...
Name: clean_text, Length: 4457, dtype: object

In [20]:
tfdif = TfidfVectorizer()
tfdif.fit(X_train['clean_text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
X_train_tfidf = tfdif.transform(X_train['clean_text'])
X_test_tfidf = tfdif.transform(X_test['clean_text'])

In [22]:
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [23]:
tfdif.vocabulary_

{'ok': 4655,
 'can': 1604,
 'take': 6174,
 'me': 4134,
 'shop': 5673,
 'when': 6903,
 'get': 2898,
 'paid': 4789,
 'tomorrow': 6428,
 'am': 978,
 'not': 4550,
 'go': 2936,
 'to': 6400,
 'theatr': 6289,
 'so': 5825,
 'come': 1843,
 'wherev': 6911,
 'call': 1571,
 'tell': 6228,
 'where': 6908,
 'and': 1001,
 'jason': 3576,
 'say': 5517,
 'it': 3520,
 'cool': 1920,
 'if': 3383,
 'we': 6841,
 'pick': 4911,
 'some': 5840,
 'up': 6633,
 'from': 2805,
 'hi': 3202,
 'place': 4935,
 'in': 3414,
 'like': 3884,
 'an': 997,
 'hour': 3294,
 'wud': 7056,
 'never': 4461,
 'mind': 4217,
 'dont': 2275,
 'miss': 4236,
 'or': 4719,
 'need': 4439,
 'but': 1540,
 'wil': 6942,
 'realli': 5259,
 'hurt': 3345,
 'wen': 6877,
 'amp': 990,
 'care': 1627,
 'smile': 5804,
 'pleasur': 4955,
 'pain': 4790,
 'troubl': 6506,
 'pour': 5035,
 'rain': 5216,
 'sum1': 6084,
 'becoz': 1275,
 'someon': 5844,
 'still': 5996,
 'love': 3973,
 'see': 5565,
 'you': 7176,
 'have': 3126,
 'won': 6995,
 'nokia': 4524,
 '7250i': 658,

In [24]:
X_test_tfidf[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rfc = RandomForestClassifier()

In [27]:
model = rfc.fit(X_train_tfidf,y_train)



In [36]:
model.base_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [31]:
ypred = model.predict(X_test_tfidf)

In [33]:
precision = precision_score(y_test,ypred)
recall = recall_score(y_test,ypred)
accuracy = accuracy_score(y_test,ypred)

In [34]:
print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")

Precision: 1.0, Recall: 0.7464788732394366, Accuracy: 0.967713004484305


### Tuning Parameters

In [35]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [52]:
params = {
    'n_estimators':range(50,500,50),
    'max_depth':range(8,21,2),
    'oob_score':[True,False],
    'min_samples_leaf':range(4,10),
    'min_samples_split':range(4,10)
}

In [53]:
randfor = RandomForestClassifier()
rand = RandomizedSearchCV(randfor,params,
    n_iter=10,
    n_jobs=-1,
    iid='warn',
    refit=True,
    cv=5,
    verbose=2)

In [54]:
randModel = rand.fit(X_train_tfidf,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   34.4s finished


In [55]:
randModel.best_params_

{'oob_score': False,
 'n_estimators': 200,
 'min_samples_split': 9,
 'min_samples_leaf': 4,
 'max_depth': 20}

In [56]:
randfor1 = RandomForestClassifier(**{'oob_score': True,
 'n_estimators': 150,
 'min_samples_split': 9,
 'min_samples_leaf': 4,
 'max_depth': 20})

In [57]:
model_1 = randfor1.fit(X_train_tfidf,y_train)
ypred = model_1.predict(X_test_tfidf)
precision = precision_score(y_test,ypred)
recall = recall_score(y_test,ypred)
accuracy = accuracy_score(y_test,ypred)
print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")

Precision: 1.0, Recall: 0.5070422535211268, Accuracy: 0.9372197309417041


In [69]:
params = {
    'n_estimators':range(150,250,25),
    #'max_depth':range(18,22),
    'oob_score':[True],
    'min_samples_leaf':range(2,6),
    'min_samples_split':range(8,11)
}

In [70]:
randfor = RandomForestClassifier()
grid = GridSearchCV(randfor,params,
    n_jobs=-1,
    cv=5,
    verbose=2)
gridModel = grid.fit(X_train_tfidf,y_train)
gridModel.best_params_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  6.5min finished


{'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 150,
 'oob_score': True}

In [75]:
randfor2 = RandomForestClassifier(**{'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 150,
 'oob_score': True})
#(**{'max_depth': 21,
#'min_samples_leaf': 2,
# 'min_samples_split': 10,
# 'n_estimators': 150,
# 'oob_score': True})

### Final Model

In [76]:
model_2 = randfor2.fit(X_train_tfidf,y_train)
ypred = model_2.predict(X_test_tfidf)
precision = precision_score(y_test,ypred)
recall = recall_score(y_test,ypred)
accuracy = accuracy_score(y_test,ypred)
print(f"Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}")

Precision: 1.0, Recall: 0.7605633802816901, Accuracy: 0.9695067264573991
