In [1]:
import csv
import pandas as pd
import numpy as np
import sklearn
import nltk
from nltk.corpus import stopwords 
import matplotlib.pyplot as plt

%matplotlib widget

In [17]:
def _parse_float_vector(text):
    """Parse a bracketed vector string into a list of floats.

    Accepts both whitespace-separated ('[1.0 2.0]') and comma-separated
    ('[1.0, 2.0]') layouts; an empty vector ('[]') yields an empty list.
    """
    cleaned = text.replace('[', '').replace(']', '').replace(',', ' ')
    return [float(val) for val in cleaned.split()]


def read_tsne_embeddings(fp):
    """Load tokens, t-SNE coordinates and embeddings from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    following non-empty row must provide, in order: the token, the t-SNE
    vector as a bracketed string, and the embedding vector as a bracketed
    string.

    Parameters
    ----------
    fp : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    dict
        'tokens'     : list of str, column 0 of every row
        'tsne'       : list of list of float, parsed from column 1
        'embeddings' : list of list of float, parsed from column 2
        'tsne_array' : np.ndarray built from 'tsne'

    Raises
    ------
    ValueError
        If a vector cell contains something that is not a float.
    IndexError
        If a non-empty row has fewer than three columns.
    """
    dikt = {'tokens': [], 'tsne': [], 'embeddings': []}
    with open(fp, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to parse.
                continue
            dikt['tokens'].append(row[0])
            dikt['tsne'].append(_parse_float_vector(row[1]))
            dikt['embeddings'].append(_parse_float_vector(row[2]))
    dikt['tsne_array'] = np.asarray(dikt['tsne'])
    return dikt

In [18]:
# Read the token / t-SNE / embedding table from disk into a dict of lists.
emb_dict = read_tsne_embeddings('word_embeddings/tsne_embeddings.csv')
# Sanity check: list which fields the loader produced.
print("emb_dict keys: ", emb_dict.keys())

emb_dict keys:  dict_keys(['tokens', 'tsne', 'embeddings', 'tsne_array'])


In [19]:
# Load the tab-separated training set; later cells read its SentenceId,
# Phrase and Sentiment columns.
reviews = pd.read_csv('data/train.tsv', sep='\t')

In [20]:
def process_reviews(reviews, remove_stop_words=None):
    """Tokenise one phrase per sentence and record its token count.

    Only the first row seen for each SentenceId is kept; later rows with the
    same id are skipped. The kept phrase is lower-cased and tokenised with
    nltk.word_tokenize. 'num_words' is the token count *before* any stop-word
    removal.

    Parameters
    ----------
    reviews : pd.DataFrame
        Must provide 'SentenceId', 'Phrase' (str) and 'Sentiment' columns.
    remove_stop_words : bool, optional
        When true, NLTK's English stop words are dropped from the stored
        token list. Defaults to the module-level REMOVE_STOP_WORDS flag.

    Returns
    -------
    pd.DataFrame
        Columns 'SentenceId', 'Phrase' (list of tokens), 'Sentiment',
        'num_words'; one row per distinct SentenceId, in first-seen order.
    """
    if remove_stop_words is None:
        remove_stop_words = REMOVE_STOP_WORDS

    # Load the stop-word list once. Calling stopwords.words() inside the
    # comprehension re-read it for every single token.
    stop_words = set(stopwords.words('english')) if remove_stop_words else set()

    # A set gives O(1) duplicate checks; scanning the growing id list was quadratic.
    seen_sentence_ids = set()
    dikt = {'SentenceId': [], 'Phrase': [], 'Sentiment': [], 'num_words': []}

    rows = zip(reviews['SentenceId'], reviews['Phrase'], reviews['Sentiment'])
    for sentence_id, phrase, sentiment in rows:
        if sentence_id in seen_sentence_ids:
            continue
        seen_sentence_ids.add(sentence_id)

        tokens = nltk.word_tokenize(phrase.lower())
        dikt['SentenceId'].append(sentence_id)
        dikt['num_words'].append(len(tokens))
        if remove_stop_words:
            tokens = [word for word in tokens if word not in stop_words]
        dikt['Phrase'].append(tokens)
        dikt['Sentiment'].append(sentiment)

    return pd.DataFrame.from_dict(dikt, orient='columns')

In [21]:
# Flag read by process_reviews: when True, English stop words are dropped
# from each tokenised phrase.
REMOVE_STOP_WORDS = True
# NOTE: this rebinds `reviews` from the raw frame to the processed one, so
# re-running this cell without re-running the read_csv cell first will fail
# (process_reviews calls .lower() on 'Phrase', which is a token list afterwards).
reviews = process_reviews(reviews)

In [22]:
def reviews_2_tsne(reviews, emb_dict):
    """Attach t-SNE coordinates to every review phrase.

    For each token list in reviews['Phrase'], the t-SNE vectors of the tokens
    found in emb_dict['tokens'] are collected (unknown tokens are skipped) and
    averaged with calc_mean_tsne.

    Parameters
    ----------
    reviews : pd.DataFrame
        Must provide a 'Phrase' column holding lists of tokens.
    emb_dict : dict
        Must provide 'tokens' and 'tsne' as index-aligned sequences.

    Returns
    -------
    pd.DataFrame
        The same `reviews` object, modified in place, with two new columns:
        'tsne_phrase' (list of t-SNE vectors per phrase) and 'mean_tsne'
        (their mean as returned by calc_mean_tsne).
    """
    # Build the token -> row lookup once. setdefault keeps the first
    # occurrence, matching what list.index() returned, while avoiding a
    # full vocabulary scan for every token of every phrase.
    first_index = {}
    for position, known_token in enumerate(emb_dict['tokens']):
        first_index.setdefault(known_token, position)

    tsne_vectors = emb_dict['tsne']
    tsne_embeddings = []
    mean_tsne = []
    for phrase in reviews['Phrase']:
        tsne_phrase = [
            tsne_vectors[first_index[token]]
            for token in phrase
            if token in first_index
        ]
        tsne_embeddings.append(tsne_phrase)
        mean_tsne.append(calc_mean_tsne(tsne_phrase))

    reviews['tsne_phrase'] = tsne_embeddings
    reviews['mean_tsne'] = mean_tsne
    return reviews

def calc_mean_tsne(tsne_phrase):
    """Return the element-wise mean of a sequence of t-SNE vectors.

    Parameters
    ----------
    tsne_phrase : sequence of sequences of float
        One vector per token; may be empty.

    Returns
    -------
    np.ndarray or np.float64
        Mean taken over axis 0. For empty input the result is NaN (a scalar
        for a plain empty list), the same value np.mean would produce, but
        without emitting NumPy's empty-slice RuntimeWarnings.
    """
    tsne_array = np.asarray(tsne_phrase)
    if tsne_array.shape[0] == 0:
        # Indexing with [()] turns the 0-d result for a flat empty input into
        # a scalar and leaves an n-d NaN array untouched.
        return np.full(tsne_array.shape[1:], np.nan)[()]
    return np.mean(tsne_array, axis=0)

In [23]:
# Adds the 'tsne_phrase' and 'mean_tsne' columns to `reviews`.
reviews = reviews_2_tsne(reviews, emb_dict)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [24]:
# Preview the first rows to check the newly added columns.
reviews.head()

Unnamed: 0,SentenceId,Phrase,Sentiment,num_words,tsne_phrase,mean_tsne
0,1,"[series, escapades, demonstrating, adage, good...",1,37,"[[3.9838614, 1.3062168], [-1.8750219, 0.683443...","[2.292721174117647, -0.687569988235294]"
1,2,"[quiet, ,, introspective, entertaining, indepe...",4,11,"[[1.6589055, -2.3012042], [5.167644, -2.138526...","[2.7907658625000002, -2.2070358724999997]"
2,3,"[even, fans, ismail, merchant, 's, work, ,, su...",1,21,"[[3.9352236, -2.5438602], [2.5099235, -0.37221...","[3.9178892066666666, -1.443636528]"
3,4,"[positively, thrilling, combination, ethnograp...",3,26,"[[0.611687, -4.0485334], [0.25725746, 0.166778...","[1.2054162, -0.7178533618749998]"
4,5,"[aggressive, self-glorification, manipulative,...",1,7,"[[1.4676963, -4.8337884], [-2.7965553, 1.76203...","[0.4232988000000001, -1.827653325]"


In [25]:
# Scratch inspection: the t-SNE points of the first sentiment-0 review as an array.
# NOTE(review): `temp` and `a` are not read by any later cell visible here.
temp = reviews.loc[reviews['Sentiment']==0]
a = temp['tsne_phrase'].values[0]
a = np.asarray(a)

In [26]:
def _collect_tsne_points(reviews, sentiment):
    """Concatenate the 'tsne_phrase' lists of all rows with the given sentiment."""
    points = []
    for phrase_points in reviews.loc[reviews['Sentiment'] == sentiment, 'tsne_phrase']:
        # extend() appends in place; `points = points + ...` copied the whole
        # accumulated list on every iteration.
        points.extend(phrase_points)
    return points


def get_tsne_4_display(reviews, include_1and3=None):
    """Gather per-token t-SNE points for negative and positive reviews.

    Parameters
    ----------
    reviews : pd.DataFrame
        Must provide 'Sentiment' and 'tsne_phrase' (list of vectors) columns.
    include_1and3 : bool, optional
        When true, sentiment 1 is added to the negative group and sentiment 3
        to the positive group. Defaults to the module-level INCLUDE_1and3 flag.

    Returns
    -------
    (np.ndarray, np.ndarray)
        `negatives` holds the points of sentiment 0 rows followed by
        sentiment 1 rows (if included); `positives` holds sentiment 4 rows
        followed by sentiment 3 rows (if included). Row order inside each
        sentiment follows `reviews`.
    """
    if include_1and3 is None:
        include_1and3 = INCLUDE_1and3

    negatives = _collect_tsne_points(reviews, 0)
    positives = _collect_tsne_points(reviews, 4)
    if include_1and3:
        negatives.extend(_collect_tsne_points(reviews, 1))
        positives.extend(_collect_tsne_points(reviews, 3))

    return np.asarray(negatives), np.asarray(positives)

In [27]:
# Flag read by get_tsne_4_display: when True, sentiments 1 and 3 are merged
# into the negative and positive groups respectively.
INCLUDE_1and3 = False
negatives, positives = get_tsne_4_display(reviews)
# Confirm both results are arrays and show their shapes.
print('Negatives', type(negatives), np.shape(negatives))
print('Positives', type(positives), np.shape(positives))

Negatives <class 'numpy.ndarray'> (12335, 2)
Positives <class 'numpy.ndarray'> (15102, 2)


In [63]:
def get_top_100_tokens(reviews, sent, emb_dict, top_n=100, pool_size=300):
    """Return the most frequent embedded tokens of one sentiment class.

    Tokens are counted over every phrase whose 'Sentiment' equals `sent`.
    The `pool_size` most frequent tokens are kept as candidates, those
    without an entry in emb_dict['tokens'] are dropped, and the `top_n` most
    frequent survivors are returned.

    The earlier slices ([-301:-1] and [-101:-1]) stopped one element short
    and therefore discarded the single most frequent token at both stages;
    the slices below include it.

    Parameters
    ----------
    reviews : pd.DataFrame
        Must provide 'Sentiment' and 'Phrase' (list of tokens) columns.
    sent : int
        Sentiment value to select.
    emb_dict : dict
        Must provide 'tokens' and an index-aligned 'tsne_array'.
    top_n : int, optional
        Maximum number of tokens returned (default 100).
    pool_size : int, optional
        Number of most frequent tokens considered before filtering
        (default 300).

    Returns
    -------
    list of [str, np.ndarray]
        [token, t-SNE coordinates] pairs in ascending order of frequency,
        so the most frequent token comes last.
    """
    token_counts = {}
    for phrase in reviews.loc[reviews['Sentiment'] == sent, 'Phrase']:
        for tok in phrase:
            token_counts[tok] = token_counts.get(tok, 0) + 1

    # Ascending by count; sorted() is stable, so ties keep first-seen order.
    ranked = sorted(token_counts.items(), key=lambda item: item[1])
    # Explicit start index: a plain [-n:] would return everything for n == 0.
    candidates = ranked[max(len(ranked) - pool_size, 0):]

    # First-occurrence lookup, equivalent to list.index() without the rescans.
    first_index = {}
    for position, known_token in enumerate(emb_dict['tokens']):
        first_index.setdefault(known_token, position)

    embedded = [
        [tok, emb_dict['tsne_array'][first_index[tok]]]
        for tok, _count in candidates
        if tok in first_index
    ]
    return embedded[max(len(embedded) - top_n, 0):]

In [64]:
# Frequent-token labels (token plus t-SNE position) for the sentiment 0 and
# sentiment 4 reviews; used to annotate the trace plots.
neg_100 = get_top_100_tokens(reviews, 0, emb_dict)
pos_100 = get_top_100_tokens(reviews, 4, emb_dict)

['come', array([ 3.9094543, -2.6707184])]


In [60]:
# Column-wise mean of all negative / positive token points.
# NOTE(review): neg_mean and pos_mean are not read by any later cell visible here.
neg_mean = np.mean(negatives, axis=0)
pos_mean = np.mean(positives, axis=0)

In [102]:
def plot_embedding_trace(ax, points, labelled_tokens, title, color):
    """Draw one embedding-trace panel on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    points : np.ndarray
        Array indexed as points[:, 0] (x) and points[:, 1] (y).
    labelled_tokens : iterable of (str, xy)
        Token text and the position at which to annotate it.
    title : str
        Panel title.
    color : str
        Colour used for both the markers and the connecting line.

    Returns
    -------
    tuple
        The objects returned by ax.scatter and ax.plot, in that order.
    """
    ax.set_title(title)
    # Tick labels are blanked before plotting, as in the original cell.
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.grid(True)

    scatter = ax.scatter(points[:, 0], points[:, 1], s=0.01, c=color)
    # The very thin line joins consecutive points in array order.
    lines = ax.plot(points[:, 0], points[:, 1], linewidth=0.008, c=color)
    for token, xy in labelled_tokens:
        ax.annotate(token, xy=xy, alpha=0.75, fontsize=6)
    return scatter, lines


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,6))

sc1, pl1 = plot_embedding_trace(
    ax1, negatives, neg_100, 'Negative Review Embedding Trace', 'red')
sc2, pl2 = plot_embedding_trace(
    ax2, positives, pos_100, 'Positive Review Embedding Trace', 'blue')

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [83]:
def calc_mean_words(reviews):
    """Compute the mean of 'num_words' for each sentiment value 0-4.

    Parameters
    ----------
    reviews : pd.DataFrame
        Must provide 'Sentiment' (values 0-4) and 'num_words' columns.

    Returns
    -------
    dict
        'sentiments' : the five display labels, where the label at index i
                       belongs to sentiment value i
        'mean_data'  : list of float, mean 'num_words' per sentiment; NaN for
                       a sentiment with no rows (this used to raise
                       ZeroDivisionError)
    """
    sentiments = ['Negative', 'Somewhat Negative', 'Neutral', 'Somewhat Positive', 'Positive']
    mean_data = []
    for indx in range(len(sentiments)):
        word_counts = reviews.loc[reviews['Sentiment'] == indx, 'num_words']
        if len(word_counts):
            mean_data.append(float(word_counts.mean()))
        else:
            # No review carries this sentiment: report "no data", do not crash.
            mean_data.append(float('nan'))
    return {'sentiments': sentiments, 'mean_data': mean_data}

In [84]:
# Mean word count per sentiment label, printed for a quick numeric check.
mean_words_by_sent = calc_mean_words(reviews)
print(mean_words_by_sent)

{'sentiments': ['Negative', 'Somewhat Negative', 'Neutral', 'Somewhat Positive', 'Positive'], 'mean_data': [19.092350746268657, 19.121363636363636, 18.108157099697884, 19.630331753554504, 18.795472287275565]}


In [100]:
def plot_mean_words(mean_dict):
    """Show a bar chart of the mean word count per sentiment.

    Parameters
    ----------
    mean_dict : dict
        'sentiments' supplies the tick labels and 'mean_data' the bar
        heights; both are read in the same order.
    """
    sentiment_names = mean_dict['sentiments']
    mean_counts = mean_dict['mean_data']

    bar_width = 0.4
    positions = np.arange(len(sentiment_names))

    fig, ax = plt.subplots()

    # Titles and axis captions.
    ax.set_title('Average Number of Words per Review by Sentiment')
    ax.set_ylabel('Number of Words per Review')
    ax.set_xlabel('Review Sentiment')

    # One tick per sentiment, labelled with its name.
    ax.set_xticks(positions)
    ax.set_xticklabels(sentiment_names, fontsize=8)

    ax.bar(positions, mean_counts, bar_width, color='green')

    plt.show()

In [101]:
# Render the per-sentiment mean word counts as a bar chart.
plot_mean_words(mean_words_by_sent)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …