In [1]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import statistics as sts
import matplotlib.pyplot as plt

from scipy.stats import spearmanr
from sklearn.model_selection import RepeatedKFold, KFold
from sklearn.linear_model import LinearRegression


np.set_printoptions(threshold=sys.maxsize, suppress=True)
pd.set_option('display.max_columns', None, 'display.max_rows', None)

sns.set_context("notebook", font_scale=1.0,
                rc={"lines.linewidth": 2.5})

In [3]:
# Six datasets based on corpus x model

#data = pd.read_csv("word2vec_wiki_norms_vectors_annos_categories.csv")

#data = pd.read_csv("word2vec_giga_norms_vectors_annos_categories.csv")

data = pd.read_csv("word2vec_ukwac_norms_vectors_annos_categories.csv")

#data = pd.read_csv("fasttext_wiki_norms_vectors_annos_categories.csv")

#data = pd.read_csv("fasttext_giga_norms_vectors_annos_categories.csv")

#data = pd.read_csv("fasttext_ukwac_norms_vectors_annos_categories.csv")

data.head()

Unnamed: 0,norm,annotation,vector,happy_sad,healthy_unhealthy,cheap_expensive,environ_friendly,casual_posh,local_foreign,location_1,location_2
0,almond,nut,"[0.453611, -0.366176, -0.141542, -0.893425, 0....",2.105263,2.071429,5.210526,2.736842,3.421053,4.842105,1.263158,2.166667
1,anchovy,fish,"[0.365828, -0.516123, -0.256799, -1.276984, 0....",5.666667,3.619048,5.190476,4.761905,4.952381,5.0,2.142857,3.35
2,apple,fruit,"[0.456778, -0.059501, 0.326614, -0.704594, 0.1...",2.166667,1.25,1.916667,1.75,1.541667,1.583333,1.5,4.681818
3,apricot,fruit,"[0.311568, -0.07296, -0.075732, -0.840292, 0.3...",2.25,1.7,4.7,2.25,3.85,4.4,1.2,4.0625
4,artichoke,vegetable,"[0.002574, -0.632022, -0.020574, -1.310475, 0....",4.846154,2.0,4.461538,2.769231,5.153846,3.461538,2.0,2.25


In [4]:
# Get norm and annotation values        

norms = data['norm']
annotations = data['annotation']

norms = [norm for norm in norms.values]
annotations = [anno for anno in annotations.values]

In [5]:
def map_annotations(annotations):
    
    annotations_map = {x: index for index, x in enumerate(np.unique(annotations), start=0)}
    annotation = np.asarray([annotations_map[annotation] for annotation in annotations])
    return annotation

In [6]:
# Get vectors for each category. Converts each str repr of vector to vector

vectors = data['vector']

x = [np.array(np.matrix(vector)).ravel() for vector in vectors.values]
x

[array([ 0.453611, -0.366176, -0.141542, -0.893425,  0.465599,  0.392229,
        -0.507111, -0.694245, -0.032502, -0.07994 , -0.070681, -0.27096 ,
        -0.003226,  0.288651,  0.328331, -0.336236, -0.042413, -0.301928,
        -0.163257, -0.094638,  0.462432,  0.3566  ,  0.045621, -0.085892,
         0.467205,  0.45132 ,  0.348264, -0.314366,  0.195946, -0.099086,
        -0.179906, -0.539623, -0.383221, -0.075982,  0.068214, -0.741099,
        -0.34751 , -0.479687, -0.030655, -0.186941,  0.705656,  0.51782 ,
        -0.178176,  0.255177,  0.436854,  0.08742 , -0.193607, -0.075158,
        -0.686341,  0.257652,  0.19651 , -0.096867, -0.173365,  0.420043,
        -0.095786, -0.200198,  0.137489, -0.558253, -0.813881, -0.199163,
         0.266628, -0.588016,  0.049463,  0.001984,  0.510038, -0.323774,
         0.773042, -0.568593, -0.026827,  0.23736 ,  0.010912, -0.010211,
         0.681059, -0.131046, -0.311557,  1.119529,  0.195949, -0.733487,
        -0.63452 , -0.628856,  0.14170

In [7]:
# Get scores for each category

happy_scores = [score for score in data['happy_sad']]
healthy_scores = [score for score in data['healthy_unhealthy']]
cheap_scores = [score for score in data['cheap_expensive']]
environ_scores = [score for score in data['environ_friendly']]
casual_scores = [score for score in data['casual_posh']]
local_scores = [score for score in data['local_foreign']]

In [8]:
# Transform x, y to numpy arrays for each category

x, y = np.array(x), np.array(casual_scores)
x

array([[ 0.453611, -0.366176, -0.141542, -0.893425,  0.465599,  0.392229,
        -0.507111, -0.694245, -0.032502, -0.07994 , -0.070681, -0.27096 ,
        -0.003226,  0.288651,  0.328331, -0.336236, -0.042413, -0.301928,
        -0.163257, -0.094638,  0.462432,  0.3566  ,  0.045621, -0.085892,
         0.467205,  0.45132 ,  0.348264, -0.314366,  0.195946, -0.099086,
        -0.179906, -0.539623, -0.383221, -0.075982,  0.068214, -0.741099,
        -0.34751 , -0.479687, -0.030655, -0.186941,  0.705656,  0.51782 ,
        -0.178176,  0.255177,  0.436854,  0.08742 , -0.193607, -0.075158,
        -0.686341,  0.257652,  0.19651 , -0.096867, -0.173365,  0.420043,
        -0.095786, -0.200198,  0.137489, -0.558253, -0.813881, -0.199163,
         0.266628, -0.588016,  0.049463,  0.001984,  0.510038, -0.323774,
         0.773042, -0.568593, -0.026827,  0.23736 ,  0.010912, -0.010211,
         0.681059, -0.131046, -0.311557,  1.119529,  0.195949, -0.733487,
        -0.63452 , -0.628856,  0.14170

In [9]:
# Build LinearRegression model instance for multilinear regression

model = LinearRegression()

In [22]:
# 10 Fold Cross-Validation: Split into train and test sets (each split = 19 vectors, 19 scores)

predictions = []
y_test_lst = []

kf = KFold(n_splits=10)
kf.get_n_splits(x)

for k, (train_index, test_index) in enumerate(kf.split(x, y)):

    x_train, x_test, y_train, y_test = x[train_index], x[test_index], y[train_index], y[test_index]

    model.fit(x_train, y_train)

    cv_y_pred = model.predict(x_test)
    
    predictions.append(cv_y_pred)
    y_test_lst.append(y_test)

print(predictions)

[array([4.31269781, 5.00135427, 3.44086883, 1.40455785, 4.06720328,
       5.81442979, 4.08207388, 2.11727744, 5.3071059 , 5.58519795,
       3.4475461 , 0.57784826, 4.4902382 , 2.00288621, 3.42518456,
       3.17412859, 3.38084598, 2.91337976, 2.37247592, 3.1551803 ]), array([2.52560599, 2.76121753, 1.77403474, 4.33484236, 4.9515539 ,
       2.6445249 , 2.06136429, 6.81265225, 1.24924578, 7.55271351,
       2.16348738, 6.73845265, 2.41125085, 2.51321669, 3.03336963,
       4.30665572, 2.02774951, 5.14953959, 3.24031312, 3.56200914]), array([2.91268389, 2.98276473, 3.29858793, 3.51164121, 2.68555889,
       5.37707204, 6.03084055, 3.75151635, 1.26309016, 2.08025484,
       3.94004556, 5.32562254, 0.82024393, 4.46426824, 2.09941095,
       4.46093842, 0.69652597, 3.10867791, 3.93000656, 3.9420461 ]), array([ 5.35694452,  3.34872474,  4.40609539,  6.08251001,  2.30409775,
        2.85067007,  5.79897798,  5.060091  ,  4.19030632,  5.54774665,
        0.21380597, -0.0398384 ,  5.73000945,

In [21]:
cv_pred = [p for pred in predictions for p in pred]
y_tests = [y for y_test in y_test_lst for y in y_test]

print(cv_pred)
cv_pred, y_tests = np.array(cv_pred), np.array(y_tests)

rho, pval = spearmanr(cv_pred, y)

#print('Spearman Rho:', rho, "pvalue:", pval)
#print('Standard Deviation (cv_pred):', sts.stdev(cv_pred), '\n')
#print('Mean (cv_pred):', cv_pred.mean())
#print('Max (cv_pred):', cv_pred.max())
#print('Min (cv_pred):', cv_pred.min(), '\n')



[4.312697806889399, 5.0013542688237935, 3.44086882825461, 1.4045578520927604, 4.067203278718876, 5.814429789825306, 4.0820738783105, 2.1172774362579183, 5.307105902731608, 5.58519795190502, 3.4475460982551205, 0.5778482594524466, 4.490238199197787, 2.0028862074186753, 3.42518455914462, 3.1741285852937224, 3.380845981510774, 2.91337976269378, 2.3724759176640386, 3.155180297480401, 2.5256059931496138, 2.7612175336042695, 1.7740347379110206, 4.334842358206843, 4.951553899691114, 2.6445248978055425, 2.061364285430326, 6.812652246649778, 1.2492457768023693, 7.552713506365374, 2.163487381711641, 6.738452650812836, 2.411250851245489, 2.5132166904015145, 3.033369632178522, 4.306655717125075, 2.0277495113617903, 5.149539586028087, 3.2403131190038454, 3.562009135590367, 2.912683886617409, 2.98276472998801, 3.298587928984304, 3.5116412101751497, 2.685558890098526, 5.377072041404115, 6.030840548544951, 3.751516350372122, 1.2630901644216577, 2.0802548401444048, 3.940045560771577, 5.325622537371015,

In [14]:
# Get metric for each fold in 10 Fold Cross Validation

cor_lst = list(zip(predictions, y_test_lst))
spearman_lst = []

for cv_pred, y in cor_lst:
    spearman_lst.append(spearmanr(cv_pred, y)[0])

print('Mean Folds:', np.mean(spearman_lst))
print('Max Folds:', np.max(spearman_lst))
print('Min Folds:', np.min(spearman_lst))

print('Standard Deviation Folds:', str(sts.stdev(np.array(spearman_lst))) + '\n')

for i, s in enumerate(spearman_lst):
    print(s)

Mean Folds: 0.6544129609121199
Max Folds: 0.8280701754385964
Min Folds: 0.29559987047190067
Standard Deviation Folds: 0.16519738095720563

0.524812030075188
0.8180451127819548
0.29559987047190067
0.5227529515978905
0.6694246430534136
0.7637323456234041
0.6631578947368421
0.7634928334822793
0.6950417518597302
0.8280701754385964


In [17]:
# Prepare for plotting

cv_pred = [p for pred in predictions for p in pred]

y = np.array(casual_scores)
y

array([3.42105263, 4.95238095, 1.54166667, 3.85      , 5.15384615,
       5.55      , 5.1875    , 2.28571429, 3.        , 3.71428571,
       1.8       , 2.46666667, 3.35      , 2.22222222, 4.30434783,
       2.75      , 2.86363636, 2.6       , 2.52380952, 3.55555556,
       2.54545455, 2.        , 2.125     , 2.56521739, 2.81818182,
       2.68181818, 2.19047619, 5.16666667, 1.52380952, 6.75      ,
       2.22727273, 8.57142857, 1.63636364, 3.71428571, 3.86363636,
       2.7826087 , 2.05555556, 4.625     , 3.        , 3.04545455,
       3.625     , 7.1875    , 5.21428571, 3.72      , 4.15      ,
       3.61111111, 2.66666667, 3.3125    , 1.6       , 2.23529412,
       3.95238095, 6.5       , 3.31818182, 2.31578947, 2.13043478,
       3.47058824, 1.73913044, 6.83333333, 2.66666667, 5.8       ,
       4.54545455, 1.59090909, 4.27272727, 7.75      , 4.2       ,
       4.6       , 5.15789474, 4.94444444, 3.35      , 7.16666667,
       3.61904762, 3.5       , 4.53846154, 5.23809524, 6.625  

In [None]:
# Setup Seaborn plotstyle

sns.set_style("darkgrid", {"axes.facecolor": ".9"})
sns.set_context("notebook", font_scale=1.7,
                rc={"lines.linewidth": 2.5})

plt.figure(figsize=(17, 17))

# Choose color palette with seaborn 
num_classes = len(np.unique(annotations))
palette = np.array(sns.color_palette("husl", num_classes))

# Create labels and annotations and zip with palette
annos = list(zip(palette, sorted(list(set(annotations)))))

# Create scatter plot for points
ax = plt.subplot(aspect='equal')
ax.scatter(y, cv_pred, c=palette[map_annotations(annotations).astype(np.int)])

plt.gca()
plt.text(0.25, 9.7, 'Spearman ρ = ' + str(round(rho, 3)), color='black', bbox=dict(facecolor='none', edgecolor='gray', 
        boxstyle='round, pad=.5'), fontsize=13)

# Create regression plot for line
plt.plot([-1,100], [-1,100], c='black', linestyle='--')
plt.annotate('ρ = 1', xy=(0.20, 0.60), color='black', fontsize=13)

#Creat second legend for colormapped points
for i, anno in enumerate(annos):
    plt.plot([], [], ' ', c = palette[i], marker = 'o', label=str(anno[1]).capitalize())
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.07), fontsize=14, frameon = 1, ncol=7)

plt.title('Actual vs Predicted Scores for "casual_posh" on Word2Vec Vectors trained on UkWac Corpus', y=1.01)
plt.xlabel('Actual Scores')
plt.ylabel('Predicted Scores')

plt.xlim([0, 10])
plt.ylim([0, 10])
    
# Annotate points by norm
for i, txt in enumerate(norms):
    plt.annotate(txt, (y[i], cv_pred[i]), fontsize=14)

In [None]:
# Advanced linear regression for advanced statistical parameters 

import statsmodels.api as sm

x = sm.add_constant(x)

In [None]:
predictions = []
y_test_lst = []

kf = KFold(n_splits=10)
kf.get_n_splits(x)

for k, (train_index, test_index) in enumerate(kf.split(x, y)):
    x_train, x_test, y_train, y_test = x[train_index], x[test_index], y[train_index], y[test_index]

In [None]:
# Create advanced model and fit

model = sm.OLS(y_train, x_train)

In [None]:
# Fit results

results = model.fit()

In [None]:
# Extract evaluation metrics

#print(results.summary())
#print('coefficient of determination r_2:', results.rsquared)
print('adjusted coefficient of determination adj r_2:', results.rsquared_adj)

In [None]:
rho, pval = spearmanr(y, cv_pred)

sns.set_style("darkgrid", {"axes.facecolor": ".9"})
plt.figure(figsize=(16, 12))

points = plt.scatter(y, cv_pred, c=y, cmap='Blues')#(x, y)
plt.colorbar(points)
sns.regplot(y, cv_pred, scatter=False)
plt.title('Predicted Scores for "local_foreign" on fastText Vectors in ukWac and Gigaword Corpus')
plt.xlabel('Predicted ukWac')
plt.ylabel('Predicted Gigaword')
plt.legend(['ukWac', 'Gigaword', 'Spearman ρ = ' + str(round(rho, 2))])
plt.xlim([1, 9])
plt.ylim([1, 9])
for i, txt in enumerate(norms):
    plt.annotate(txt, (y[i], cv_pred[i]))