In [1]:
# import basic packages
import pandas as pd
import numpy as np
import swifter
import re
import string
from pprint import pprint
from time import time
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# import classifiers
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm
from sklearn import decomposition, ensemble
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, classification_report, roc_curve, auc, accuracy_score  

In [3]:
# Load the pre-processed review frame.
# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
df = pd.read_pickle("../data/processed_df.pkl")

In [4]:
# Count how many rows carry each label in the `flagged` column.
df["flagged"].value_counts()

YR    223414
NR    187966
Name: flagged, dtype: int64

In [4]:
# Encode the string labels as integers: 'YR' -> 1, 'NR' -> 0.
df.loc[df['flagged'] == 'YR', 'flagged'] = 1
df.loc[df['flagged'] == 'NR', 'flagged'] = 0
# Masked assignment into a string column can leave it with object dtype, which
# scikit-learn may reject as an unknown label type. Cast explicitly so the
# target is a true integer column; this also fails loudly if any label other
# than 'YR'/'NR' is still present. Re-running the cell is a no-op.
df['flagged'] = df['flagged'].astype('int64')

In [5]:
# take the processed_text column as training data
X = df.processed_text
y = df.flagged

train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=42)
# examine the object shapes
print(train_x.shape)
print(valid_x.shape)

(329104,)
(82276,)


In [7]:
# Class balance of the training labels after the split.
train_y.value_counts()

1    178833
0    150271
Name: flagged, dtype: int64

## Feature Engineering

### Count vector as feature
Vectorize the processed text data using CountVectorizer and TfidfVectorizer.

In [17]:
# Word-level bag-of-words counts.
count_vect = CountVectorizer(analyzer="word")

# Learn the vocabulary on the training split only, then apply the same
# vocabulary to the validation split.
xtrain_count = count_vect.fit_transform(train_x)
xvalid_count = count_vect.transform(valid_x)

In [18]:
# Each vectorizer is fitted on the training split and then applied unchanged
# to the validation split. All three keep at most 30000 features.

# TF-IDF over single words.
tfidf_vect = TfidfVectorizer(
    analyzer="word",
    max_features=30000,
)
xtrain_tfidf = tfidf_vect.fit_transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# TF-IDF over word bigrams and trigrams.
tfidf_vect_ngram = TfidfVectorizer(
    analyzer="word",
    ngram_range=(2, 3),
    max_features=30000,
)
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# TF-IDF over character bigrams and trigrams.
tfidf_vect_ngram_chars = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 3),
    max_features=30000,
)
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

## Initial model experiment
This part compares the accuracy of the different vectorizers using two classifiers, namely a naive Bayes classifier and logistic regression. We want to get a general sense of how well each vector representation fits the processed text data.

In [19]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit ``classifier`` and return its accuracy on the validation features.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train
        Training feature matrix passed to ``classifier.fit``.
    label
        Training labels passed to ``classifier.fit``.
    feature_vector_valid
        Validation feature matrix passed to ``classifier.predict``.
    is_neural_net : bool, default False
        When True, the output of ``predict`` is reduced to class indices
        with ``argmax(axis=-1)`` before scoring.
    label_valid : optional
        True labels for the rows of ``feature_vector_valid``. When omitted,
        the notebook-level ``valid_y`` is used, which keeps existing calls
        working unchanged.

    Returns
    -------
    The value returned by ``metrics.accuracy_score`` for the validation
    predictions.
    """
    # Fall back to the notebook global only when the caller did not pass the
    # validation labels explicitly.
    if label_valid is None:
        label_valid = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score takes (y_true, y_pred).
    return metrics.accuracy_score(label_valid, predictions)

In [20]:
# Baseline: a fresh MultinomialNB is trained on each feature representation
# and scored on the matching validation matrix.

# Raw word counts
accuracy = train_model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=xtrain_count,
    label=train_y,
    feature_vector_valid=xvalid_count,
)
print("NB, Count Vectors: ", accuracy)

# Word-level TF-IDF
accuracy = train_model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=xtrain_tfidf,
    label=train_y,
    feature_vector_valid=xvalid_tfidf,
)
print("NB, WordLevel TF-IDF: ", accuracy)

# Word n-gram TF-IDF
accuracy = train_model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=xtrain_tfidf_ngram,
    label=train_y,
    feature_vector_valid=xvalid_tfidf_ngram,
)
print("NB, N-Gram Vectors: ", accuracy)

# Character n-gram TF-IDF
accuracy = train_model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=xtrain_tfidf_ngram_chars,
    label=train_y,
    feature_vector_valid=xvalid_tfidf_ngram_chars,
)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.648962030239681
NB, WordLevel TF-IDF:  0.652438135057611
NB, N-Gram Vectors:  0.6211167290582916
NB, CharLevel Vectors:  0.6066653702173174


In [21]:
# Baseline: a fresh LogisticRegression (default settings) is trained on each
# feature representation and scored on the matching validation matrix.

# Raw word counts
accuracy = train_model(
    classifier=linear_model.LogisticRegression(),
    feature_vector_train=xtrain_count,
    label=train_y,
    feature_vector_valid=xvalid_count,
)
print("LR, Count Vectors: ", accuracy)

# Word-level TF-IDF
accuracy = train_model(
    classifier=linear_model.LogisticRegression(),
    feature_vector_train=xtrain_tfidf,
    label=train_y,
    feature_vector_valid=xvalid_tfidf,
)
print("LR, WordLevel TF-IDF: ", accuracy)

# Word n-gram TF-IDF
accuracy = train_model(
    classifier=linear_model.LogisticRegression(),
    feature_vector_train=xtrain_tfidf_ngram,
    label=train_y,
    feature_vector_valid=xvalid_tfidf_ngram,
)
print("LR, N-Gram Vectors: ", accuracy)

# Character n-gram TF-IDF
accuracy = train_model(
    classifier=linear_model.LogisticRegression(),
    feature_vector_train=xtrain_tfidf_ngram_chars,
    label=train_y,
    feature_vector_valid=xvalid_tfidf_ngram_chars,
)
print("LR, CharLevel Vectors: ", accuracy)



LR, Count Vectors:  0.6476250668481696
LR, WordLevel TF-IDF:  0.6573848996062035
LR, N-Gram Vectors:  0.6170329136078565
LR, CharLevel Vectors:  0.6408794788273615


Using the MultinomialNB classifier, word-level TF-IDF gives the highest accuracy (0.652), followed closely by CountVectorizer (0.649).
Using logistic regression, word-level TF-IDF also gives the highest accuracy (0.657), followed by CountVectorizer (0.648), then character-level TF-IDF (0.641).

It is hard to reach a conclusion on whether to use CountVectorizer or TfidfVectorizer to vectorize the text. Each vectorizer has a set of parameters which, if tuned properly, can improve the model performance. Hence, we will proceed to tune each vectorizer using the same classifier and compare the results.

### Tuning TFIDF

In [24]:
# #############################################################################
# Grid search over the TF-IDF normalization term (`norm`), with a default
# MultinomialNB as the classifier.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

# Adding more entries here widens the search but multiplies the number of
# fits combinatorially.
parameters = {
    "tfidf__norm": ["l1", "l2", None],
}


if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__norm': ['l1', 'l2', None]}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  1.8min remaining:  2.3min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  2.4min finished


done in 189.435s

Best score: 0.634
Best parameters set:
	tfidf__norm: 'l2'


In [10]:
# #############################################################################
# Grid search over `ngram_range`, with norm fixed to 'l2'.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(norm="l2")),
    ("clf", MultinomialNB()),
])

# Unigrams only versus unigrams plus bigrams.
parameters = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
}


if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__ngram_range': [(1, 1), (1, 2)]}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  2.1min finished


done in 150.539s

Best score: 0.634
Best parameters set:
	tfidf__ngram_range: (1, 1)


In [12]:
# Grid search over the document-frequency cut-offs `min_df` and `max_df`,
# with norm='l2' and unigrams fixed.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(norm="l2", ngram_range=(1, 1))),
    ("clf", MultinomialNB()),
])

# 4 x 3 = 12 candidate combinations.
parameters = {
    "tfidf__min_df": (1, 3, 5, 10),
    "tfidf__max_df": [0.25, 0.5, 0.75],
}

if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__max_df': [0.25, 0.5, 0.75], 'tfidf__min_df': (1, 3, 5, 10)}
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.4min finished


done in 414.242s

Best score: 0.648
Best parameters set:
	tfidf__max_df: 0.75
	tfidf__min_df: 5


In [13]:
# Grid search over `sublinear_tf` and `smooth_idf`, with norm, ngram_range,
# max_df and min_df fixed to the values used in the vectorizer below.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(norm="l2", ngram_range=(1, 1), max_df=0.75, min_df=5)),
    ("clf", MultinomialNB()),
])

# Both switches on/off: 2 x 2 = 4 candidate combinations.
parameters = {
    "tfidf__sublinear_tf": [True, False],
    "tfidf__smooth_idf": [True, False],
}

if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__smooth_idf': [True, False], 'tfidf__sublinear_tf': [True, False]}
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  2.0min remaining:   23.5s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.0min finished


done in 146.016s

Best score: 0.648
Best parameters set:
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False


In [14]:
# Grid search over `max_features`, including None (no vocabulary cap), with
# the other vectorizer settings fixed.
pipeline = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            norm="l2",
            ngram_range=(1, 1),
            max_df=0.75,
            min_df=5,
            sublinear_tf=False,
            smooth_idf=True,
        ),
    ),
    ("clf", MultinomialNB()),
])

parameters = {
    "tfidf__max_features": [None, 5000, 10000, 20000, 30000],
}

if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__max_features': [None, 5000, 10000, 20000, 30000]}
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.9min finished


done in 141.698s

Best score: 0.648
Best parameters set:
	tfidf__max_features: None


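The step-by-step searches above select TfidfVectorizer(norm='l2', ngram_range=(1, 1), max_df=0.75, min_df=5, sublinear_tf=False, smooth_idf=True, max_features=None), with a best 3-fold cross-validation accuracy of about 0.648 using the MultinomialNB classifier with default parameters.
(Note: an earlier version of this summary listed norm=None, ngram_range=(1, 2), max_df=0.25, min_df=3, sublinear_tf=True, smooth_idf=True and a testing accuracy of 67.152%; those values do not match the outputs shown above and should be re-verified.)
Now, re-tune the remaining parameters with the selected values held fixed to find the best combination.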

In [15]:
# Grid search over `max_features` restricted to finite caps (5000-30000),
# with the other vectorizer settings fixed.
pipeline = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            norm="l2",
            ngram_range=(1, 1),
            max_df=0.75,
            min_df=5,
            sublinear_tf=False,
            smooth_idf=True,
        ),
    ),
    ("clf", MultinomialNB()),
])

parameters = {
    "tfidf__max_features": [5000, 10000, 20000, 30000],
}

if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__max_features': [5000, 10000, 20000, 30000]}
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  2.6min remaining:   31.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.6min finished


done in 185.129s

Best score: 0.647
Best parameters set:
	tfidf__max_features: 30000


In [16]:
# Grid search over larger `max_features` caps (30000-80000), with the other
# vectorizer settings fixed.
pipeline = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            norm="l2",
            ngram_range=(1, 1),
            max_df=0.75,
            min_df=5,
            sublinear_tf=False,
            smooth_idf=True,
        ),
    ),
    ("clf", MultinomialNB()),
])

parameters = {
    "tfidf__max_features": [30000, 50000, 80000],
}

if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__max_features': [30000, 50000, 80000]}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  1.8min remaining:  2.3min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  2.4min finished


done in 171.170s

Best score: 0.648
Best parameters set:
	tfidf__max_features: 50000


In [18]:
# Grid search over `ngram_range` again, this time with the vocabulary capped
# at 50000 features and the other vectorizer settings fixed.
pipeline = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            norm="l2",
            max_df=0.75,
            min_df=5,
            max_features=50000,
            sublinear_tf=False,
            smooth_idf=True,
        ),
    ),
    ("clf", MultinomialNB()),
])

# Unigrams only, up to bigrams, and up to trigrams.
parameters = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("pipeline:", [step_name for step_name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  3.7min remaining:  4.6min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  8.7min finished


done in 550.082s

Best score: 0.648
Best parameters set:
	tfidf__ngram_range: (1, 1)


In [19]:
# Refined grid search over `min_df` and `max_df` around the earlier winners,
# with the vocabulary capped at 50000 features and the other vectorizer
# settings fixed.
pipeline = Pipeline([
    # The keyword is `sublinear_tf`; the previous spelling `sublinear_df` is
    # not a TfidfVectorizer argument and raises TypeError when the cell runs.
    ('tfidf', TfidfVectorizer(norm='l2', max_features=50000, ngram_range=(1, 1),
                              sublinear_tf=False, smooth_idf=True)),
    ('clf', MultinomialNB()),
])

# 3 x 3 = 9 candidate combinations.
parameters = {
    'tfidf__min_df': (4, 5, 6),
    'tfidf__max_df': [0.6, 0.75, 0.8]}

if __name__ == "__main__":
    # The search fans out to parallel workers (n_jobs=-1); multiprocessing
    # requires that to happen inside a __main__-protected block.
    grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    # Time the whole search on the training split.
    t0 = time()
    grid_search.fit(train_x, train_y)
    print("done in %0.3fs" % (time() - t0))
    print()

    # Report the best score and the winning value of each searched parameter.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'tfidf__max_df': [0.6, 0.75, 0.8], 'tfidf__min_df': (4, 5, 6)}
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  3.8min finished


done in 252.014s

Best score: 0.648
Best parameters set:
	tfidf__max_df: 0.6
	tfidf__min_df: 5


## test the model

In [7]:
# TF-IDF vectorizer configured with the values chosen by the searches above.
vect = TfidfVectorizer(
    norm="l2",
    max_features=50000,
    ngram_range=(1, 1),
    max_df=0.6,
    min_df=5,
    sublinear_tf=False,
    smooth_idf=True,
)

# Fit on the training split only; the validation split reuses that vocabulary.
train_x_dtm = vect.fit_transform(train_x)
valid_x_dtm = vect.transform(valid_x)

In [12]:
# MultinomialNB with alpha=10 and no fitted class prior, trained on the
# TF-IDF matrix built above.
nb_clf = MultinomialNB(alpha=10, fit_prior=False)
nb_clf.fit(train_x_dtm, train_y)

# Hard class predictions and class probabilities for the validation rows.
pred_class = nb_clf.predict(valid_x_dtm)
pred_prob = nb_clf.predict_proba(valid_x_dtm)

# Per-class precision / recall / F1 on the validation split.
print(metrics.classification_report(y_true=valid_y, y_pred=pred_class))

              precision    recall  f1-score   support

           0       0.67      0.43      0.52     37695
           1       0.63      0.82      0.71     44581

    accuracy                           0.64     82276
   macro avg       0.65      0.63      0.62     82276
weighted avg       0.65      0.64      0.63     82276



In [13]:
# RandomForestClassifier
# Random forest with 100 trees on the TF-IDF matrix.
# random_state is fixed (same seed as the train/validation split) so the
# reported metrics are reproducible across runs; without it each run builds
# a different forest.
text_clf = Pipeline([
                     ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
                     ])

text_clf.fit(train_x_dtm, train_y)

# Score on the held-out validation split.
predicted = text_clf.predict(valid_x_dtm)
print(metrics.classification_report(valid_y, predicted))
# f1_score with its default settings, as called here.
print('Oveall F1-score: ', f1_score(valid_y, predicted))

              precision    recall  f1-score   support

           0       0.62      0.43      0.51     37695
           1       0.62      0.78      0.69     44581

    accuracy                           0.62     82276
   macro avg       0.62      0.61      0.60     82276
weighted avg       0.62      0.62      0.61     82276

Oveall F1-score:  0.6902682965550087


In [10]:
# MultinomialNB
# Default MultinomialNB wrapped in a single-step pipeline.
text_clf = Pipeline(steps=[("clf", MultinomialNB())])

# Train on the TF-IDF training matrix.
text_clf.fit(train_x_dtm, train_y)

# Score on the held-out validation split.
predicted = text_clf.predict(valid_x_dtm)

print(metrics.classification_report(y_true=valid_y, y_pred=predicted))
print("Oveall F1-score: ", f1_score(y_true=valid_y, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.67      0.47      0.55     37695
           1       0.64      0.81      0.72     44581

    accuracy                           0.65     82276
   macro avg       0.66      0.64      0.63     82276
weighted avg       0.66      0.65      0.64     82276

Oveall F1-score:  0.7161288400226454


In [9]:
# GradientBoostingClassifier
# Gradient boosting with 100 boosting stages on the TF-IDF matrix.
# random_state is fixed (same seed as the train/validation split) so the
# reported metrics are reproducible across runs.
text_clf = Pipeline([
                     ('clf', GradientBoostingClassifier(n_estimators=100, random_state=42)),
                     ])

text_clf.fit(train_x_dtm, train_y)


# Score on the held-out validation split.
predicted = text_clf.predict(valid_x_dtm)

print(metrics.classification_report(valid_y, predicted))
# f1_score with its default settings, as called here.
print('Oveall F1-score: ', f1_score(valid_y, predicted))

              precision    recall  f1-score   support

           0       0.64      0.32      0.42     37695
           1       0.60      0.85      0.70     44581

    accuracy                           0.61     82276
   macro avg       0.62      0.58      0.56     82276
weighted avg       0.62      0.61      0.57     82276

Oveall F1-score:  0.7011144322668733


In [8]:
# XGBClassifier
# XGBoost classifier with its default settings, wrapped in a single-step pipeline.
text_clf = Pipeline(steps=[("clf", xgb.XGBClassifier())])

# Train on the TF-IDF training matrix.
text_clf.fit(train_x_dtm, train_y)

# Score on the held-out validation split.
predicted = text_clf.predict(valid_x_dtm)

print(metrics.classification_report(y_true=valid_y, y_pred=predicted))
print("Oveall F1-score: ", f1_score(y_true=valid_y, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.64      0.45      0.53     37695
           1       0.63      0.78      0.70     44581

    accuracy                           0.63     82276
   macro avg       0.63      0.62      0.61     82276
weighted avg       0.63      0.63      0.62     82276

Oveall F1-score:  0.6965658585353661
