In [1]:
import re
import warnings
import numpy as np
import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup

# import nltk
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

from catboost import Pool, CatBoostClassifier
%matplotlib inline

In [2]:
# Read positive corpus
pos_df = pd.read_fwf("positive_corpus.txt", names=['Phrase'], header=None)
pos_df['Sentiment'] = 1
pos_df.head()

Unnamed: 0,Phrase,Sentiment
0,I am so grateful for the loving relationships ...,1
1,I feel accomplished after completing a challen...,1
2,I am happy to be alive and healthy.,1
3,I love learning and expanding my knowledge.,1
4,I am excited for the adventures that lie ahead.,1


In [3]:
# Read negative corpus
neg_df = pd.read_fwf("negative_corpus.txt", names=['Phrase'], header=None)
neg_df['Sentiment'] = 0
neg_df.head()

Unnamed: 0,Phrase,Sentiment
0,I am feeling anxious and overwhelmed.,0
1,I am frustrated with the lack of progress in m...,0
2,I feel angry when people disrespect me.,0
3,I am disappointed with the choices I have made...,0
4,I am worried about the future and what it holds.,0


In [4]:
'''
# Preprocess text data
def preprocess_data(df):
    phrases = []
    for raw in tqdm(df['Phrase']):
        warnings.filterwarnings('ignore', category=UserWarning, module='bs4')
        text = BeautifulSoup(raw, 'lxml').get_text()
        only_text = re.sub('[^a-zA-Z]', ' ', text)
        words = word_tokenize(only_text.lower())
        stops = set(stopwords.words('english'))
        non_stopwords = [word for word in words if not word in stops]
        lemmatizer = WordNetLemmatizer()
        lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]    
        phrases.append(lemma_words)
    return phrases

# Tokenize text data
def tokenizer_preprocess(list_X_train, list_X_val):
    unique_words = set()
    len_max = 0
    for sent in tqdm(list_X_train):
        unique_words.update(sent)
        if len_max < len(sent):
            len_max = len(sent)

    tokenizer = Tokenizer(num_words=len(list(unique_words)))
    tokenizer.fit_on_texts(list(list_X_train))
     
    X_train = tokenizer.texts_to_sequences(list_X_train)
    X_train = pad_sequences(X_train, maxlen=len_max)

    X_val = tokenizer.texts_to_sequences(list_X_val)
    X_val = pad_sequences(X_val, maxlen=len_max)

    return X_train, X_val, tokenizer, len_max
'''



In [39]:
# TfidfVectorize dataset from pandas DataFrame, split dataset for training and validation, and convert to numpy array
def TfidfVectorizeDataframe(X_df, Y_df, seed, use_pca=True):
    # Setup stemmer for English language
    stemmer = SnowballStemmer(language='english')

    # Function to create tokenizer
    def tokenize(text):
        return [stemmer.stem(token) for token in word_tokenize(text)]
    
    # Create stopword for English language
    eng_stopword = stopwords.words('english')

    # Create vectorizer
    vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                stop_words=eng_stopword,
                                ngram_range=(1,2),
                                max_features=500)

    # Fit training dataset in vectorizer and transform training and validation dataset
    vectorizer.fit(X_df.Phrase)
    X_dataset = vectorizer.transform(X_df.Phrase)
    Y_dataset = Y_df.Sentiment.values

    # Split dataset for training and validation
    X_train, X_val, Y_train, Y_val = train_test_split(X_dataset, Y_dataset, test_size=0.2, stratify=Y_dataset, random_state=seed)

    # Turn sparse matrix to numpy array
    X_train = X_train.toarray()
    X_val = X_val.toarray()

    # Use PCA to reduce feature number to increase training speed and avoid overfitting
    # Fit training dataset in pca and transform training and validation dataset
    if use_pca:
        pca = PCA(n_components=120)
        _ = pca.fit(X_train)    
    
        X_train = pca.transform(X_train)
        X_val = pca.transform(X_val)
    
    return X_train, X_val, Y_train, Y_val

In [40]:
# Merge positive and negative datasets
df = pd.concat([pos_df, neg_df], axis=0)
df = df.sample(frac=1).reset_index(drop=True)
df.Sentiment.value_counts()

0    100
1    100
Name: Sentiment, dtype: int64

In [41]:
# Get X
X_df = df['Phrase'].to_frame()
X_df.head()

Unnamed: 0,Phrase
0,I am sad about the loss of a job or financial ...
1,I love the feeling of accomplishing something ...
2,I am tired of the constant pressure to conform...
3,"I am grateful for the little things in life, l..."
4,I feel grateful for the moments of peace and c...


In [42]:
# Get Y
Y_df = df['Sentiment'].to_frame()
Y_df.head()

Unnamed: 0,Sentiment
0,0
1,1
2,0
3,1
4,1


In [43]:
# Get metric scores
def get_metric_results(test_label, predict_label):
    metric_results = {}
    metric_results['accuracy'] = accuracy_score(test_label, predict_label)
    metric_results['f1_score'] = f1_score(test_label, predict_label, average="macro")
    metric_results['precision'] = precision_score(test_label, predict_label, average="macro")
    metric_results['recall'] = recall_score(test_label, predict_label, average="macro")
    return metric_results

## Bernoulli Naive Bayes

In [60]:
accuracy = []
f1score = []
precision = []
recall = []

for i in range(5):

    # Get train and test dataset after preprocessing
    X_train, X_test, Y_train, Y_test = TfidfVectorizeDataframe(X_df, Y_df, seed=i**2, use_pca=False)
    
    # GridSearchCV can do exhaustive search over specified parameter values for an estimator.
    # Parameters for GridSearchCV
    parameters = {
        'alpha': [0.6, 0.8, 1.0, 1.2, 1.4],
        'binarize': [0.0, 0.1, 0.2],
    }

    # Define Bernoulli Naive Bayes model
    nb_clf = BernoulliNB()

    # Do GridSearchCV on model
    nb_clf = GridSearchCV(
        nb_clf, 
        parameters,
        cv=5,
        scoring='accuracy',
        refit=True,
    )

    # Train model
    nb_clf.fit(X_train, Y_train)
    
    Y_predicted = nb_clf.predict(X_test)
    metric_result = get_metric_results(Y_test, Y_predicted)
    
    accuracy.append(metric_result['accuracy'])
    f1score.append(metric_result['f1_score'])
    precision.append(metric_result['precision'])
    recall.append(metric_result['recall'])



In [61]:
print('Accuracy:', ['%.2f' % val for val in accuracy])
print('F1score:', ['%.2f' % val for val in f1score])
print('Precision:', ['%.2f' % val for val in precision])
print('Recall:', ['%.2f' % val for val in recall])
print('Average accuracy:', sum(accuracy) / len(accuracy))
print('Average f1score:', sum(f1score) / len(f1score))
print('Average precision:', sum(precision) / len(precision))
print('Average recall:', sum(recall) / len(recall))

Accuracy: ['0.97', '1.00', '1.00', '1.00', '1.00']
F1score: ['0.97', '1.00', '1.00', '1.00', '1.00']
Precision: ['0.98', '1.00', '1.00', '1.00', '1.00']
Recall: ['0.97', '1.00', '1.00', '1.00', '1.00']
Average accuracy: 0.9949999999999999
Average f1score: 0.9949968730456537
Average precision: 0.9952380952380953
Average recall: 0.9949999999999999


## Gaussian Naive Bayes

In [52]:
accuracy = []
f1score = []
precision = []
recall = []

for i in range(5):

    # Get train and test dataset after preprocessing
    X_train, X_test, Y_train, Y_test = TfidfVectorizeDataframe(X_df, Y_df, seed=i**2, use_pca=False)

    # GridSearchCV can do exhaustive search over specified parameter values for an estimator.
    # Parameters for GridSearchCV
    parameters = {
        'var_smoothing': np.logspace(0, -9, num=100),
    }

    # Define Gaussian Naive Bayes model
    nb_clf = GaussianNB()

    # Do GridSearchCV on model
    nb_clf = GridSearchCV(
        nb_clf, 
        parameters,
        cv=5,
        scoring='accuracy',
        refit=True,
    )

    # Train model
    nb_clf.fit(X_train, Y_train)
    
    Y_predicted = nb_clf.predict(X_test)
    metric_result = get_metric_results(Y_test, Y_predicted)
    
    accuracy.append(metric_result['accuracy'])
    f1score.append(metric_result['f1_score'])
    precision.append(metric_result['precision'])
    recall.append(metric_result['recall'])



In [53]:
print('Accuracy:', ['%.2f' % val for val in accuracy])
print('F1score:', ['%.2f' % val for val in f1score])
print('Precision:', ['%.2f' % val for val in precision])
print('Recall:', ['%.2f' % val for val in recall])
print('Average accuracy:', sum(accuracy) / len(accuracy))
print('Average f1score:', sum(f1score) / len(f1score))
print('Average precision:', sum(precision) / len(precision))
print('Average recall:', sum(recall) / len(recall))

Accuracy: ['0.95', '1.00', '1.00', '1.00', '1.00']
F1score: ['0.95', '1.00', '1.00', '1.00', '1.00']
Precision: ['0.95', '1.00', '1.00', '1.00', '1.00']
Recall: ['0.95', '1.00', '1.00', '1.00', '1.00']
Average accuracy: 0.99
Average f1score: 0.9899749373433584
Average precision: 0.990909090909091
Average recall: 0.99


## Catboost

In [62]:
accuracy = []
f1score = []
precision = []
recall = []

for i in range(5):

    # Get train and test dataset after preprocessing
    X_train, X_test, Y_train, Y_test = TfidfVectorizeDataframe(X_df, Y_df, seed=i**2, use_pca=False)
    
    # Fit directly without GridSearchCV

    # Build train and validation dataset for Catboost
    train_dataset = Pool(data=X_train,
                         label=Y_train,
                         cat_features=[])

    eval_dataset = Pool(data=X_test,
                        label=Y_test,
                        cat_features=[])

    # Get CatBoostClassifier model
    model = CatBoostClassifier(iterations=4000,
                               learning_rate=0.3,
                               l2_leaf_reg=5,
                               depth=10,
                               use_best_model=True,
                               early_stopping_rounds=300,
                               task_type="GPU",
                               devices='0:1')
    # Fit model
    model.fit(train_dataset, eval_set=eval_dataset, early_stopping_rounds=25, verbose=25, plot=True)
    
    Y_predicted = model.predict(X_test)
    metric_result = get_metric_results(Y_test, Y_predicted)
    
    accuracy.append(metric_result['accuracy'])
    f1score.append(metric_result['f1_score'])
    precision.append(metric_result['precision'])
    recall.append(metric_result['recall'])



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4949401	test: 0.5480378	best: 0.5480378 (0)	total: 154ms	remaining: 10m 16s
25:	learn: 0.0284820	test: 0.1100447	best: 0.1088473 (19)	total: 4s	remaining: 10m 11s
50:	learn: 0.0098009	test: 0.0918190	best: 0.0885681 (47)	total: 7.83s	remaining: 10m 6s
bestTest = 0.08856805563
bestIteration = 47
Shrink model to first 48 iterations.




MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4433585	test: 0.4840234	best: 0.4840234 (0)	total: 157ms	remaining: 10m 28s
25:	learn: 0.0271437	test: 0.0895531	best: 0.0895531 (25)	total: 3.99s	remaining: 10m 9s
50:	learn: 0.0092032	test: 0.0684329	best: 0.0684329 (50)	total: 7.88s	remaining: 10m 9s
75:	learn: 0.0046491	test: 0.0601508	best: 0.0585462 (72)	total: 11.7s	remaining: 10m 5s
100:	learn: 0.0034374	test: 0.0532043	best: 0.0532043 (100)	total: 15.6s	remaining: 10m 1s
125:	learn: 0.0028817	test: 0.0468677	best: 0.0468677 (125)	total: 19.5s	remaining: 9m 58s
150:	learn: 0.0023676	test: 0.0445382	best: 0.0445382 (150)	total: 23.4s	remaining: 9m 55s
175:	learn: 0.0019049	test: 0.0436619	best: 0.0432358 (173)	total: 27.2s	remaining: 9m 51s
200:	learn: 0.0017335	test: 0.0417444	best: 0.0412882 (193)	total: 31.1s	remaining: 9m 47s
225:	learn: 0.0015352	test: 0.0402440	best: 0.0402440 (225)	total: 35s	remaining: 9m 44s
250:	learn: 0.0013499	test: 0.0403870	best: 0.0400966 (237)	total: 38.9s	remaining: 9m 41s
275:	learn



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4982421	test: 0.5061041	best: 0.5061041 (0)	total: 157ms	remaining: 10m 29s
25:	learn: 0.0378152	test: 0.0814033	best: 0.0814033 (25)	total: 4.07s	remaining: 10m 21s
50:	learn: 0.0149497	test: 0.0509351	best: 0.0501968 (48)	total: 7.97s	remaining: 10m 17s
75:	learn: 0.0089017	test: 0.0409852	best: 0.0391332 (71)	total: 11.9s	remaining: 10m 12s
100:	learn: 0.0066343	test: 0.0358322	best: 0.0347946 (88)	total: 15.8s	remaining: 10m 9s
bestTest = 0.03479456604
bestIteration = 88
Shrink model to first 89 iterations.




MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4992900	test: 0.5734735	best: 0.5734735 (0)	total: 157ms	remaining: 10m 28s
25:	learn: 0.0312606	test: 0.1227329	best: 0.1227329 (25)	total: 3.96s	remaining: 10m 5s
50:	learn: 0.0114300	test: 0.1221505	best: 0.1109685 (37)	total: 7.87s	remaining: 10m 9s
bestTest = 0.1109684944
bestIteration = 37
Shrink model to first 38 iterations.




MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4634016	test: 0.5939896	best: 0.5939896 (0)	total: 157ms	remaining: 10m 28s
25:	learn: 0.0292620	test: 0.2103268	best: 0.1670114 (21)	total: 4.06s	remaining: 10m 21s
50:	learn: 0.0117644	test: 0.1594817	best: 0.1404703 (35)	total: 7.98s	remaining: 10m 18s
bestTest = 0.1404702783
bestIteration = 35
Shrink model to first 36 iterations.


In [63]:
print('Accuracy:', ['%.2f' % val for val in accuracy])
print('F1score:', ['%.2f' % val for val in f1score])
print('Precision:', ['%.2f' % val for val in precision])
print('Recall:', ['%.2f' % val for val in recall])
print('Average accuracy:', sum(accuracy) / len(accuracy))
print('Average f1score:', sum(f1score) / len(f1score))
print('Average precision:', sum(precision) / len(precision))
print('Average recall:', sum(recall) / len(recall))

Accuracy: ['0.97', '1.00', '1.00', '0.97', '0.93']
F1score: ['0.97', '1.00', '1.00', '0.97', '0.92']
Precision: ['0.98', '1.00', '1.00', '0.98', '0.93']
Recall: ['0.97', '1.00', '1.00', '0.97', '0.93']
Average accuracy: 0.975
Average f1score: 0.9749088937971525
Average precision: 0.977432712215321
Average recall: 0.975


In [53]:
'''
# GridSearchCV can do exhaustive search over specified parameter values for an estimator.
# Parameters for GridSearchCV
parameters = {
    'iterations': [750, 800],
    'learning_rate': [0.03, 0.1],
    'depth': [9, 11],
}

# Define classifier model
catboost_model = CatBoostClassifier(
    verbose=50,
    cat_features=[],
)

# Do GridSearchCV on model
catboost_model = GridSearchCV(
    catboost_model, 
    parameters,
    cv=5,
    scoring='accuracy',
    refit=True
)

# Train model
catboost_model.fit(X_train, Y_train)

print(catboost_model.best_params_)
print(catboost_model.best_score_)

Y_predicted = catboost_model.predict(X_test)
print(get_metric_results(Y_test, Y_predicted))
'''

0:	learn: 0.6668904	total: 42.8ms	remaining: 6.37s
50:	learn: 0.2174723	total: 690ms	remaining: 1.34s
100:	learn: 0.1029530	total: 1.32s	remaining: 641ms
149:	learn: 0.0630265	total: 1.94s	remaining: 0us
0:	learn: 0.6747668	total: 21.9ms	remaining: 3.27s
50:	learn: 0.2147404	total: 655ms	remaining: 1.27s
100:	learn: 0.1046700	total: 1.3s	remaining: 631ms
149:	learn: 0.0637458	total: 1.95s	remaining: 0us
0:	learn: 0.6658787	total: 23.1ms	remaining: 3.44s
50:	learn: 0.2111713	total: 672ms	remaining: 1.3s
100:	learn: 0.1056493	total: 1.31s	remaining: 634ms
149:	learn: 0.0634637	total: 1.93s	remaining: 0us
0:	learn: 0.6743976	total: 54.8ms	remaining: 8.16s
50:	learn: 0.2169110	total: 694ms	remaining: 1.35s
100:	learn: 0.1057434	total: 1.32s	remaining: 639ms
149:	learn: 0.0641518	total: 1.93s	remaining: 0us
0:	learn: 0.6616703	total: 35.7ms	remaining: 5.32s
50:	learn: 0.2147718	total: 670ms	remaining: 1.3s
100:	learn: 0.1051345	total: 1.3s	remaining: 633ms
149:	learn: 0.0645236	total: 1.93s

100:	learn: 0.1051345	total: 1.36s	remaining: 2.01s
150:	learn: 0.0640505	total: 2.04s	remaining: 1.34s
200:	learn: 0.0407870	total: 2.72s	remaining: 663ms
249:	learn: 0.0235120	total: 3.39s	remaining: 0us
0:	learn: 0.6090003	total: 23.1ms	remaining: 5.75s
50:	learn: 0.0550017	total: 740ms	remaining: 2.89s
100:	learn: 0.0130246	total: 1.47s	remaining: 2.17s
150:	learn: 0.0067074	total: 2.17s	remaining: 1.43s
200:	learn: 0.0045200	total: 2.86s	remaining: 697ms
249:	learn: 0.0036022	total: 3.54s	remaining: 0us
0:	learn: 0.6342793	total: 18.4ms	remaining: 4.58s
50:	learn: 0.0564160	total: 700ms	remaining: 2.73s
100:	learn: 0.0133907	total: 1.38s	remaining: 2.04s
150:	learn: 0.0067162	total: 2.07s	remaining: 1.36s
200:	learn: 0.0044364	total: 2.77s	remaining: 676ms
249:	learn: 0.0035854	total: 3.46s	remaining: 0us
0:	learn: 0.6061202	total: 21.8ms	remaining: 5.43s
50:	learn: 0.0511483	total: 698ms	remaining: 2.72s
100:	learn: 0.0117278	total: 1.37s	remaining: 2.02s
150:	learn: 0.0064683	to

150:	learn: 0.0107427	total: 30.6s	remaining: 9.93s
199:	learn: 0.0070337	total: 40.6s	remaining: 0us
0:	learn: 0.6343569	total: 203ms	remaining: 40.3s
50:	learn: 0.0632829	total: 10.2s	remaining: 29.8s
100:	learn: 0.0166445	total: 19.8s	remaining: 19.4s
150:	learn: 0.0101548	total: 29.8s	remaining: 9.65s
199:	learn: 0.0068139	total: 39.4s	remaining: 0us
0:	learn: 0.5939373	total: 4.48ms	remaining: 891ms
50:	learn: 0.0618561	total: 9.98s	remaining: 29.2s
100:	learn: 0.0149913	total: 20s	remaining: 19.6s
150:	learn: 0.0080017	total: 30.1s	remaining: 9.78s
199:	learn: 0.0056226	total: 40.1s	remaining: 0us
0:	learn: 0.4685518	total: 199ms	remaining: 39.6s
50:	learn: 0.0086989	total: 9.78s	remaining: 28.6s
100:	learn: 0.0040383	total: 19.6s	remaining: 19.2s
150:	learn: 0.0024790	total: 29.6s	remaining: 9.61s
199:	learn: 0.0018148	total: 39.1s	remaining: 0us
0:	learn: 0.5312433	total: 200ms	remaining: 39.7s
50:	learn: 0.0096865	total: 9.92s	remaining: 29s
100:	learn: 0.0046548	total: 20.1s	

100:	learn: 0.0199141	total: 1m 23s	remaining: 40.3s
149:	learn: 0.0120707	total: 2m 8s	remaining: 0us
0:	learn: 0.5939373	total: 8.22ms	remaining: 1.22s
50:	learn: 0.0645882	total: 35.9s	remaining: 1m 15s
100:	learn: 0.0191727	total: 1m 18s	remaining: 39.8s
149:	learn: 0.0108569	total: 2m 2s	remaining: 0us
0:	learn: 0.4843930	total: 956ms	remaining: 2m 22s
50:	learn: 0.0104614	total: 43.7s	remaining: 1m 24s
100:	learn: 0.0048565	total: 1m 28s	remaining: 42.8s
149:	learn: 0.0033103	total: 2m 13s	remaining: 0us
0:	learn: 0.5290074	total: 968ms	remaining: 2m 24s
50:	learn: 0.0148703	total: 44.1s	remaining: 1m 25s
100:	learn: 0.0063446	total: 1m 30s	remaining: 43.7s
149:	learn: 0.0037810	total: 2m 14s	remaining: 0us
0:	learn: 0.5035273	total: 941ms	remaining: 2m 20s
50:	learn: 0.0116214	total: 43.5s	remaining: 1m 24s
100:	learn: 0.0058041	total: 1m 26s	remaining: 42.1s
149:	learn: 0.0036079	total: 2m 13s	remaining: 0us
0:	learn: 0.5317485	total: 968ms	remaining: 2m 24s
50:	learn: 0.013307

200:	learn: 0.0074855	total: 2m 50s	remaining: 42.4s
249:	learn: 0.0057722	total: 3m 36s	remaining: 0us
0:	learn: 0.4843930	total: 952ms	remaining: 3m 57s
50:	learn: 0.0104614	total: 43.8s	remaining: 2m 50s
100:	learn: 0.0048565	total: 1m 28s	remaining: 2m 10s
150:	learn: 0.0032777	total: 2m 15s	remaining: 1m 28s
200:	learn: 0.0023035	total: 3m 2s	remaining: 44.4s
249:	learn: 0.0018318	total: 3m 48s	remaining: 0us
0:	learn: 0.5290074	total: 963ms	remaining: 3m 59s
50:	learn: 0.0148703	total: 44.1s	remaining: 2m 51s
100:	learn: 0.0063446	total: 1m 30s	remaining: 2m 13s
150:	learn: 0.0037495	total: 2m 15s	remaining: 1m 29s
200:	learn: 0.0027436	total: 2m 59s	remaining: 43.9s
249:	learn: 0.0021519	total: 3m 45s	remaining: 0us
0:	learn: 0.5035273	total: 961ms	remaining: 3m 59s
50:	learn: 0.0116214	total: 43.5s	remaining: 2m 49s
100:	learn: 0.0058041	total: 1m 26s	remaining: 2m 8s
150:	learn: 0.0035489	total: 2m 14s	remaining: 1m 28s
200:	learn: 0.0025309	total: 2m 57s	remaining: 43.2s
249: