# Text Classification

### Libraries

In [1]:
# import libraries

import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import KeyedVectors
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import cross_validate
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### Datasets

In [2]:
# load review sentiment data
# Each row holds a review_id, the raw review text and three indicator columns
# (positive / neutral / negative); these columns are what later cells read.
# NOTE(review): the 'Unnamed: 0' column in the displayed frame looks like a row
# index that was written into the CSV - confirm, and consider dropping it.

review_df = pd.read_csv('review_sentiment.csv')

review_df

Unnamed: 0,review_id,text,positive,neutral,negative
0,NvusujU9_5pIUbn9SZ6hMA,Stopped by to munch a burger during today's Se...,1,0,0
1,vHOeBa7aMA_na4rfS2Db5A,"Yelp doesn't allow to leave 0 star review, so ...",0,0,1
2,hG9RTxxivb0ZXzEk4JXTXA,I find it hard to believe there are so many pe...,0,0,1
3,zIVkwgahZjOneChZFUYY4g,Love this place! Almost all of their menu item...,1,0,0
4,DLczAuvMAlAnY5EeDGhTVg,Excellent customer service. I wish I could ren...,1,0,0
...,...,...,...,...,...
63446,OgoBp7fbXnLSKvsQb4O_tw,"I really loved the food and service. I mean, t...",1,0,0
63447,Q7e8EtZMmdknDrQE7huMoQ,Their Grove location was the bomb. Delicious f...,1,0,0
63448,zzMW6zbsFaQMjoGu2bGVdA,A nice ean BBQ joint right across from some ne...,1,0,0
63449,scgoa60EvhW2Mz7JMqLYGw,The perfect Hookah bar. I'm not sure what they...,1,0,0


In [3]:
# load corpus data (one preprocessed review per line)

# Use a context manager so the file handle is closed as soon as it is read.
with open('corpus.txt', 'r') as corpus_file:
    corpus = corpus_file.read().split('\n')

# A file that ends with a newline leaves one empty trailing element after the
# split. Drop it only when it really is empty, so a file without a final
# newline does not silently lose its last review.
if corpus and corpus[-1] == '':
    corpus.pop()

In [4]:
# preview the first five reviews of the corpus, each preceded by a rule line

separator = '-' * 50
for review in corpus[:5]:
    print(separator, review, sep='\n')

--------------------------------------------------
stop munch burger today seahawk saint game place unsurprisingli pack good reason burger order fantast sat right next door get chanc get six feet place soon got meal place start get busier busier work way world shortest peopl maze get guess mean first person hear place go back might go say lunch tuesday less busi
--------------------------------------------------
yelp allow leav star review see one star wife move ny south california contact differ move compani sent initi email unit van line soon got email back virtual survey confirm ladi virtual survey meticul profession screen whole apart minut one contact sent second email almost month ask everyth ok get quot need mention compani sent quot less hour sinc one repli til today call direct phone left messag answer machin one call back hope everyth ok one hurt see reason explan avoid unprofession
--------------------------------------------------
find hard believ mani peopl low standard co

In [5]:
# load corpus data with stopwords (one preprocessed review per line)

# Use a context manager so the file handle is closed as soon as it is read.
with open('corpus_stopwords.txt', 'r') as corpus_stopwords_file:
    corpus_stopwords = corpus_stopwords_file.read().split('\n')

# A file that ends with a newline leaves one empty trailing element after the
# split. Drop it only when it really is empty, so a file without a final
# newline does not silently lose its last review.
if corpus_stopwords and corpus_stopwords[-1] == '':
    corpus_stopwords.pop()

In [6]:
# preview the first five reviews of the stopword-preserving corpus,
# each preceded by a rule line

separator = '-' * 50
for review in corpus_stopwords[:5]:
    print(separator, review, sep='\n')

--------------------------------------------------
stop by to munch a burger dure today s seahawk saint game and the place wa unsurprisingli pack for good reason too the haven burger i order wa fantast i sat right next to the door so i didn t get the chanc to get more than six feet into the place as soon as i got my meal the place start get busier and busier until i had to work my way through the world s shortest peopl maze to get out guess that mean i wasn t the first person to hear about the place i will be go back to the haven might go for say lunch on a tuesday so it s less busi
--------------------------------------------------
yelp doesn t allow to leav star review so that is whi you see one star me and my wife are move from ny to south california and we contact differ move compani i sent my initi email to unit van line on and soon i got email back with virtual survey confirm for the ladi that did virtual survey wa veri meticul and profession and we screen through the whole apart

### Word embeddings

In [7]:
# load word2vec vectors
# `wv` is the embedding lookup handed to text_to_vector, which uses its
# get_vector() method and its vector_size attribute.
# NOTE(review): presumably these vectors were trained on this review data
# (file name "reviews_wv") - confirm which preprocessing the vocabulary uses.

wv = KeyedVectors.load("reviews_wv")

In [8]:
def text_to_vector(embeddings, text, sequence_len, strategy=None):
    '''
    Function to convert text to word embeddings

    The text is split on whitespace and the first `sequence_len` tokens that
    exist in the embedding vocabulary are looked up; unknown tokens are skipped.

    Parameters:
        embeddings: object exposing get_vector(token) (raising KeyError for
            unknown tokens) and a vector_size attribute
        text: string to convert
        sequence_len: maximum number of in-vocabulary tokens to use
        strategy: None, 'mean' or 'max'
            None   -> flat list of sequence_len * vector_size values: the token
                      vectors laid end to end, zero-padded at the end
            'mean' -> element-wise mean of the token vectors (vector_size values)
            'max'  -> element-wise max of the token vectors (vector_size values)
            Any other value behaves like None.

    Returns:
        A flat list for strategy None, otherwise a 1-D numpy array of length
        vector_size (all zeros when no token of the text is in the vocabulary).
    '''
    token_vectors = []
    for token in text.split():
        if len(token_vectors) >= sequence_len:
            break
        try:
            token_vectors.append(embeddings.get_vector(token))
        except KeyError:
            # out-of-vocabulary token: skip it without using up a slot
            continue

    if strategy in ('mean', 'max'):
        # Pool across tokens. The vectors are kept as rows here; pooling a
        # flattened list along axis 0 would collapse everything to one scalar.
        # Padding is left out so it cannot bias the mean or clamp the max.
        if not token_vectors:
            return np.zeros(embeddings.vector_size)
        stacked = np.vstack(token_vectors)
        if strategy == 'mean':
            return stacked.mean(axis=0)
        return stacked.max(axis=0)

    # Default: concatenate the token vectors and zero-pad to a fixed length.
    vec = []
    for token_vector in token_vectors:
        vec.extend(token_vector)
    missing = max(sequence_len - len(token_vectors), 0)
    vec.extend(np.zeros(embeddings.vector_size * missing))
    return vec

In [9]:
# descriptive statistics of the review lengths (whitespace-separated tokens)

lens = [len(review_text.split()) for review_text in corpus]

print('Number of reviews:', len(corpus))

# caption -> statistic applied to the list of review lengths
length_summaries = (
    ('Minimum number of words:', np.min),
    ('Maximum number of words:', np.max),
    ('Average number of words:', np.mean),
    ('Standard deviation of words:', np.std),
    ('Mode of words:', stats.mode),
)
for caption, summarize in length_summaries:
    print(caption, summarize(lens))

Number of reviews: 63451
Minimum number of words: 1
Maximum number of words: 488
Average number of words: 53.2330302122898
Standard deviation of words: 48.55774698364295
Mode of words: ModeResult(mode=13, count=1267)


In [10]:
# convert corpus into dataset with appended embeddings representation

# Keep letters only (everything else becomes a space) and lowercase the text.
# NOTE(review): this uses the raw review text, not the preprocessed `corpus`;
# confirm that the word-vector vocabulary matches this form of the tokens.
simple_corpus = [
    re.sub('[^a-zA-Z]', ' ', review).lower()
    for review in review_df['text']
]

# Each review is represented by its first `word_limit` in-vocabulary tokens.
word_limit = 50

# Store every review as one compact float32 array rather than a Python list
# holding word_limit * vector_size individually boxed numbers; the list form
# costs several times more memory across the whole dataset.
# NOTE(review): float32 is gensim's default vector dtype, so nothing should be
# lost - confirm for the vectors loaded into `wv`.
embeddings_corpus = [
    np.asarray(text_to_vector(wv, review, word_limit), dtype=np.float32)
    for review in simple_corpus
]

### Feature engineering

In [11]:
# vectorizers
# Candidate text featurizers; one of them (or None, for word embeddings) is
# chosen in the cell that builds X.

# raw token counts
bag_of_wors = CountVectorizer()
# binary presence/absence of each token
one_hot = CountVectorizer(binary=True)
# unigram + bigram counts, vocabulary capped at 15000 features
n_grams = CountVectorizer(ngram_range=(1, 2), max_features=15000)
# TF-IDF weighted token features
tf_idf = TfidfVectorizer()

In [12]:
# build the feature matrix X
# - vectorizer is None: use the precomputed word-embedding representation
# - otherwise: fit the chosen vectorizer on the corpus; N-grams use the
#   stopword-preserving corpus so that negation tokens are kept

vectorizer = None

if vectorizer is None:
    X = np.array(embeddings_corpus)
else:
    documents = corpus_stopwords if vectorizer is n_grams else corpus
    X = vectorizer.fit_transform(documents).toarray()

X.shape

(63451, 7500)

### Classifier training

In [13]:
# target labels: one indicator column per sentiment class

label_columns = ['positive', 'neutral', 'negative']
y = review_df[label_columns]

y.shape

(63451, 3)

In [14]:
# split data into training and test sets (80% / 20%)

# Fix the seed so the split, and every metric computed from it, is the same
# on each Restart & Run All. 0 matches the seed used for the classifiers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

print("\nLabel distribution in the training set:")
print(y_train.value_counts())

print("\nLabel distribution in the test set:")
print(y_test.value_counts())

(50760, 7500) (50760, 3)
(12691, 7500) (12691, 3)

Label distribution in the training set:
positive  neutral  negative
1         0        0           34195
0         0        1           11672
          1        0            4893
Name: count, dtype: int64

Label distribution in the test set:
positive  neutral  negative
1         0        0           8485
0         0        1           2987
          1        0           1219
Name: count, dtype: int64


In [15]:
# oversampling to balance the classes (training split only)

# Seed the sampler so the duplicated rows are the same on every run.
oversampler = RandomOverSampler(random_state=0)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train.to_numpy())

print(X_train_resampled.shape, y_train_resampled.shape)

# The resampled labels are a plain array; reuse the column names of y_train
# so they cannot drift from the label definition.
print("\nLabel distribution after oversampling:")
print(pd.DataFrame(y_train_resampled, columns=y_train.columns).value_counts())

(102417, 7500) (102417, 3)

Label distribution after oversampling:
positive  neutral  negative
0         0        1           34139
          1        0           34139
1         0        0           34139
Name: count, dtype: int64


In [16]:
# classifiers
# Candidate base estimators; one of them is wrapped in MultiOutputClassifier
# in the training cell. Randomized estimators are seeded with random_state=0
# so repeated runs give the same model.

nayve_bayes = MultinomialNB()
logistic_regression = LogisticRegression(random_state=0)
decision_tree = DecisionTreeClassifier(random_state=0)
random_forest = RandomForestClassifier(random_state=0)
svm = SVC()
perceptron = Perceptron(tol=1e-3, random_state=0)
xgb = XGBClassifier()

In [None]:
# train a classifier
# MultiOutputClassifier fits one independent copy of the base estimator per
# label column, here on the oversampled training split.
# NOTE(review): the label combinations printed earlier are one-hot (exactly one
# of positive / neutral / negative per review), so independent per-label models
# may predict none or several labels for a review - confirm this is intended.

clf = MultiOutputClassifier(logistic_regression)
clf.fit(X_train_resampled, y_train_resampled)

In [None]:
# get predictions
# Predict the three label indicators for the held-out test split; the result
# is reused by the evaluation cell.

y_pred_clf = clf.predict(X_test)
print(y_pred_clf)

In [None]:
# cross-validation
# 5-fold cross-validation of `clf` on the full, non-resampled X / y, scored
# with accuracy and macro-averaged precision, recall and F1.

cv_scores = cross_validate(clf, X, y, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], cv=5)

# FIXME: this call raises a MemoryError. Suspected cause: the size / dimensionality
# of the feature matrix X (not yet confirmed).

In [None]:
# VADER sentiment analysis
# Score every raw review with VADER and turn the score into a one-hot
# [positive, neutral, negative] prediction.
#
# The label is derived from the normalized `compound` score using the
# thresholds recommended by the VADER authors:
#   compound >=  0.05 -> positive
#   compound <= -0.05 -> negative
#   otherwise         -> neutral
# Picking the largest of the pos / neu / neg proportions is not a usable rule:
# `neu` is the share of neutral words in the text, which is normally the
# largest of the three, so almost every review would be labelled neutral.

vader = SentimentIntensityAnalyzer()
res = []
y_pred_vader = []
for review in review_df['text']:
    sentiment = vader.polarity_scores(review)
    res.append(sentiment)
    compound = sentiment['compound']
    if compound >= 0.05:
        y_pred_vader.append([1, 0, 0])
    elif compound <= -0.05:
        y_pred_vader.append([0, 0, 1])
    else:
        y_pred_vader.append([0, 1, 0])

# show a small sample instead of dumping one dict per review
res[:5]

### Model performance

In [None]:
def evaluate_model(y_test, y_pred):
    '''
    Print evaluation metrics for multi-label predictions.

    Reports the per-label confusion matrices, the accuracy score, and the
    precision, recall and F1 scores computed with average='weighted'.

    Parameters:
        y_test: ground-truth label indicators
        y_pred: predicted label indicators, same shape as y_test
    '''
    # Compute everything first so nothing is printed if a metric fails.
    confusion = multilabel_confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    weighted_scores = [
        (caption, metric(y_test, y_pred, average='weighted'))
        for caption, metric in (
            ("Precision:", precision_score),
            ("Recall:", recall_score),
            ("F1-score:", f1_score),
        )
    ]

    print("Multilabel Confusion Matrix:")
    print(confusion)
    print("Accuracy:", accuracy)
    for caption, value in weighted_scores:
        print(caption, value)

In [None]:
# performance metrics - classifier
# Compare the classifier's predictions with the labels of the held-out test split.

evaluate_model(y_test, y_pred_clf)

In [None]:
# performance metrics - VADER
# VADER needs no training, so it is scored against the labels of every review.

y_true = review_df[['positive', 'neutral', 'negative']].to_numpy()

evaluate_model(y_true, y_pred_vader)

# Note: these values are probably a consequence of the rule used to turn the
# VADER scores into y_pred_vader.

In [None]:
# dataframe with VADER sentiment scores
# One row of scores per review, relabelled positionally and placed next to the
# original review columns.

vader_scores = pd.DataFrame(res).set_axis(
    ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound'], axis=1
)
vader_df = pd.concat([review_df, vader_scores], axis=1)

vader_df

In [None]:
# visualize VADER compound scores by sentiment

# compound scores of the reviews carrying each ground-truth label
positive_scores = vader_df.loc[vader_df['positive'] == 1, 'vader_compound']
neutral_scores = vader_df.loc[vader_df['neutral'] == 1, 'vader_compound']
negative_scores = vader_df.loc[vader_df['negative'] == 1, 'vader_compound']
all_scores = pd.concat([positive_scores, neutral_scores, negative_scores])

# one label per score, in the same order the groups were concatenated
sentiments = []
for name, scores in (
    ('positive', positive_scores),
    ('neutral', neutral_scores),
    ('negative', negative_scores),
):
    sentiments.extend([name] * len(scores))
sentiment_scores = pd.DataFrame({'sentiment': sentiments, 'vader_compound': all_scores})

fig, ax = plt.subplots()
sns.barplot(data=sentiment_scores, x='sentiment', y='vader_compound', ax=ax)
ax.set_xlabel('Sentiment')
ax.set_ylabel('VADER Compound Score')
ax.set_title('VADER Compound Score by Sentiment')
plt.show()

In [None]:
# cross-validation results
# Display the object returned by cross_validate in the cross-validation cell;
# that cell must have completed for `cv_scores` to exist.

cv_scores