In [1]:
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import GridSearchCV

In [2]:
# Directory holding the project data. Set the PROJECT3_DIR environment variable
# to run the notebook on another machine; the default is the original local path.
# Joining with '' guarantees a trailing separator, so both
# os.path.join(FOLDER, name) and FOLDER + name produce a valid path.
FOLDER = os.path.join(
    os.environ.get('PROJECT3_DIR',
                   '/Users/fanyang/Dropbox/uiuc/cs598/UIUC_SPL/UIUC_PSL/Project3/'),
    '')
# Name of the training split to load.
data = 'train_0' + '.csv'
df_train = pd.read_csv(os.path.join(FOLDER, data))

## Convert each review to words with the following treatment
- remove HTML
- remove non-letters
- convert to lower case
- remove stop words (English stop-word list from the nltk package)

In [3]:
# Cache for the NLTK stop-word set so the corpus file is read once,
# not once per review.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def review_to_words(raw_review, stops=None):
    """Clean one raw review and return it as a space-separated string of words.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case, split on whitespace, and drop stop words.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML.
    stops : collection of str, optional
        Words to remove. Defaults to the NLTK English stop-word list.

    Returns
    -------
    str
        The remaining words joined by single spaces (may be empty).
    """
    if stops is None:
        stops = _english_stopwords()
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    # 5. Join the words back into one string separated by spaces
    return " ".join(meaningful_words)

In [4]:
num_reviews = df_train['review'].size
# Clean every review. Iterating over the column values is positional, so it
# does not rely on df_train having a default 0..n-1 integer index the way
# label lookup df_train["review"][j] does.
ini_clean_train_reviews = [review_to_words(raw_review) for raw_review in df_train['review']]

## Convert words to vectors
- use 1- to 4-grams
- keep the top 30k n-grams by frequency.

In [None]:
# Bag-of-n-grams counts over the cleaned reviews: 1- to 4-grams, capped at
# the 30,000 most frequent terms. Cleaning was already done by
# review_to_words, so no tokenizer/preprocessor/stop-word list is passed.
train_vectorizer = CountVectorizer(analyzer="word",
                                   tokenizer=None,
                                   preprocessor=None,
                                   stop_words=None,
                                   ngram_range=(1, 4),
                                   max_features=30000
                                   )

ini_train_data_features = train_vectorizer.fit_transform(ini_clean_train_reviews)
# Later cells index this matrix with boolean row masks and call np.mean/np.var
# on it, so it is converted from sparse to a dense array here.
ini_train_data_features = ini_train_data_features.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(train_vectorizer, "get_feature_names_out"):
    vocab = list(train_vectorizer.get_feature_names_out())
else:
    vocab = train_vectorizer.get_feature_names()
# Row i holds the term for column i of ini_train_data_features.
features = pd.DataFrame(vocab, columns=['features'])

## Approach 1 - run lasso (L1-penalized logistic regression) to select words

- This approach selected 2126 words, so we decided to explore other approaches.

In [8]:
X = ini_train_data_features
y = df_train['sentiment']

# Fixed seed: the liblinear solver uses random_state when shuffling the data,
# so without it the set of non-zero coefficients may differ between runs.
LASSO_SEED = 0

# Cross-validation over C, the inverse regularization strength
# (smaller C = stronger L1 penalty), scored by ROC AUC.
lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=LASSO_SEED)
alphas = np.logspace(-1, 0, 10)
tuned_parameters = [{'C': alphas}]
n_folds = 5
# refit=False: only the best C is needed here; the final model is fit below.
clf = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=False, scoring='roc_auc')
clf.fit(X, y)
best_alpha = clf.best_params_['C']

# Fit on the full training set with the selected C.
lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C=best_alpha,
                                 random_state=LASSO_SEED)
lasso_model.fit(X, y)
# One coefficient per feature column; the frame's index is the column id.
df_model_coef = pd.DataFrame(lasso_model.coef_.reshape(-1, ), columns=['coef']).sort_values('coef', ascending=False)
# Features the L1 penalty did not shrink to exactly zero.
lasso_var = df_model_coef[df_model_coef['coef'].abs() > 0].index.tolist()
lasso_vocab = features.loc[lasso_var, 'features'].tolist()

In [13]:
# Save the lasso-selected vocabulary, one term per line.
lasso_vocab_path = os.path.join(FOLDER, 'myvocab_lasso.txt')
lasso_vocab_text = '\n'.join(lasso_vocab)
with open(lasso_vocab_path, '+w') as f:
    f.write(lasso_vocab_text)

## Approach 2 - run a two-sample t-test to select words
We selected two sets of words, the top 1000 and the top 2000 by absolute value of the test statistic; both lead to similar performance (AUC ~0.94).

In [None]:
# Two-sample (Welch) t statistic per feature column:
#   t = (mean1 - mean2) / sqrt(var1 / n1 + var2 / n2)
# Group 1 is rows with y == 1, group 2 is rows with y == 0.
pos_mask = np.asarray(y == 1)
neg_mask = np.asarray(y == 0)
# Group sizes are the actual row counts, so they stay consistent with the
# rows used for the means and variances.
n1 = pos_mask.sum()
n2 = neg_mask.sum()

# Slice each class once and reuse it for both the mean and the variance;
# the dense slice is released before the next class is sliced.
class_rows = ini_train_data_features[pos_mask, :]
mean1 = np.mean(class_rows, axis=0)
# ddof=1: the t statistic uses the unbiased sample variance.
var1 = np.var(class_rows, axis=0, ddof=1)
class_rows = ini_train_data_features[neg_mask, :]
mean2 = np.mean(class_rows, axis=0)
var2 = np.var(class_rows, axis=0, ddof=1)
del class_rows

t_num = mean1 - mean2
t_den = np.sqrt(var1 / n1 + var2 / n2)
t_result = t_num / t_den
abs_result = np.abs(t_result)
# Index = feature column id; sorted so the strongest features come first.
df_result = pd.DataFrame({'t_test': t_result, 'abs_value': abs_result}).sort_values('abs_value', ascending=False)

In [17]:
# Save the per-feature t statistics. The path is built with os.path.join like
# every other path in this notebook, so it does not depend on FOLDER ending
# with a separator.
df_result.to_csv(os.path.join(FOLDER, 't_test_result.csv'))

In [19]:
# df_result is sorted by |t| descending, so its leading index labels are the
# feature ids with the largest absolute t statistics.
word_id = df_result.index[:2000].tolist()
word_id_1000 = word_id[:1000]
# Map the feature ids back to their terms.
t_test_2000 = features['features'].loc[word_id].tolist()
t_test_1000 = features['features'].loc[word_id_1000].tolist()

In [26]:
# Save both t-test vocabularies, one term per line.
for vocab_file, vocab_words in (('myvocab_t_test2000.txt', t_test_2000),
                                ('myvocab_t_test1000.txt', t_test_1000)):
    with open(os.path.join(FOLDER, vocab_file), '+w') as f:
        f.write('\n'.join(vocab_words))

## Approach 3 - find words that are selected by both lasso and the t-test
Fewer than 1000 words are selected and performance is comparable to approach 2, so this is the final word list we selected.

In [27]:
# Feature ids selected by both the top-2000 t-test list and the lasso.
# Sorted so the output file has a deterministic order.
lasso_t2000 = sorted(set(word_id) & set(lasso_var))
# The ids are integers; str.join needs strings, and the vocabulary file must
# contain the terms themselves, so map ids back to terms before writing.
lasso_t2000_vocab = features.loc[lasso_t2000, 'features'].tolist()
with open(os.path.join(FOLDER, 'myvocab.txt'), '+w') as f:
    f.write('\n'.join(lasso_t2000_vocab))
len(lasso_t2000)

866