In [1]:
import pandas as pd
import numpy as np
import string
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.metrics import recall_score
from gensim.models import Word2Vec

In [2]:
data = pd.read_csv("C:\\Users\\ASUS\\Desktop\\directed research\\data.csv")
data = data.loc[data.english == '0', ['reviewText', 'Function']]

### Preporcessing: tokenize / remove stopwords / lemmatization / categorical features binarizing

In [3]:
# Customize the stopwords list 
reserved_words = set(['what', 'which', 'but', 'where', 'why', 'how', 'no', 'not', 'don', "don't", 'should', 'shouldn', "shouldn't"
                 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', 
                 "hasn't", 'haven', "haven't", 'isn', "isn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])
removal_list = list(set(stopwords.words('english')).difference(reserved_words))
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
# Assign the part-of-speech for each word to get an accurate lemmatization
def get_wordnet_pos(word):
    treebank_tag = pos_tag([word])[0][1]
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [5]:
# Transform nominal variables into binary format
def feature_binarizing(row):
    if  pd.isnull(row['Function']):
        row['otherfunctional'] = 1
        return row
    if 'featureRequest' in row['Function']:
        row['featureRequest'] = 1
    if 'featureRemoval' in row['Function']:
        row['featureRemoval'] = 1
    if 'functerr' in row['Function']:
        row['functerr'] = 1
    if 'otherfunctional' in row['Function']:
        row['otherfunctional'] = 1
    return row

In [6]:
# Tokenize the reviews and lemmatize each token, remove any non-alphanumeric characters
def tokenize_lemmatize(review):
    tokens = word_tokenize(review)
    tokens = [t.lower() for t in tokens if t.isalnum()]
    tokens = [wordnet_lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in tokens]
    if len(tokens) == 0:
        return 0
    else:
        return tokens

In [7]:
data['featureRequest'] = 0
data['featureRemoval'] = 0
data['functerr'] = 0
data['otherfunctional'] = 0
data = data.apply(feature_binarizing, axis=1)
data['reviewText'] = data.apply(lambda x: tokenize_lemmatize(x.reviewText), axis = 1)
data = data[data.reviewText != 0]
data.reset_index(inplace=True)

### Features representation: tf-idf * word2vec

In [8]:
# Get the tf-idf matrix of our corpus and save it into variable 'matrix', which is used as the features in the model trainging
tfidf = TfidfVectorizer(stop_words=removal_list, analyzer=lambda x: x, max_df=0.9, sublinear_tf=True)
matrix = tfidf.fit_transform(list(data.reviewText)).todense()

In [9]:
# Get the word vectors matrix and save it into variable 'w2v', the dimension of the word vector is 300
model = Word2Vec(data.reviewText, size=300, workers=4, min_count=1)
w2v = dict(zip(model.wv.index2entity, model.wv.vectors))

# Get the sentence vectors: 
# Each sentence vector is equivalent to the sum of the vectors of the words in this sentence, which is weighted by tf-idf matrix
# The result is saved into variable 's2v', which is used as the features in the model training
s2v = np.array([np.array([w2v[word] * matrix[index, tfidf.vocabulary_[word]] for word in sentence]).mean(axis=0) 
                   for index, sentence in data.reviewText.items()])

### Classification: Naive Bayes / SVM(RBF)

In [16]:
# Naive Bayes
# The performance of Naive Bayes is worss than svm
#mnb = MultinomialNB()
#bnb = BernoulliNB()

# SVM(Radial Basis Function (RBF) kernel)
# Use function GridSearchCV() to choose the best parameters
# Best parameters: C=3 and gamma=1 for tfidf matrix, C=5700 and gamma=100 for s2v
'''
parameters = {'C': [5500, 5700, 5900]}
clf = GridSearchCV(svc, parameters, scoring='f1', cv=3)
clf.fit(s2v, data.featureRequest)
clf.best_params_
'''
svc = svm.SVC(kernel='rbf', class_weight= 'balanced', C=5700, gamma=100, decision_function_shape=None)

### K-fold Cross Validation

In [17]:
# 10 fold cross validation
skf = StratifiedKFold(10, shuffle = True).get_n_splits()
scoring = ['precision', 'recall', 'f1']

# Test the performance, 
# Two types of features: tfidf matrix vs sentence vectors
# Two tragets: funtion error vs feature request

print('functerr + tfidf matrix')
scores = cross_validate(svc, matrix, data.functerr, scoring=scoring, cv=skf, return_train_score=False)
print('precision:', scores['test_precision'].mean())
print('recall:', scores['test_recall'].mean())
print('f1:', scores['test_f1'].mean())
print('--------------------------')

print('featureRequest + tfidf matrix')
scores = cross_validate(svc, matrix, data.featureRequest, scoring=scoring, cv=skf, return_train_score=False)
print('precision:', scores['test_precision'].mean())
print('recall:', scores['test_recall'].mean())
print('f1:', scores['test_f1'].mean())
print('--------------------------')

print('functerr + s2v')
scores = cross_validate(svc, s2v, data.functerr, scoring=scoring, cv=skf, return_train_score=False)
print('precision:', scores['test_precision'].mean())
print('recall:', scores['test_recall'].mean())
print('f1:', scores['test_f1'].mean())
print('--------------------------')

print('featureRequest + s2v')
scores = cross_validate(svc, s2v, data.featureRequest, scoring=scoring, cv=skf, return_train_score=False)
print('precision:', scores['test_precision'].mean())
print('recall:', scores['test_recall'].mean())
print('f1:', scores['test_f1'].mean())

featureRequest + s2v
precision: 0.2338730141328776
recall: 0.7636507936507937
f1: 0.3579239529742898
