In [1]:
import re
import ast
import sys
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects

# Allow numpy to print full array without scientific notation

np.set_printoptions(threshold=sys.maxsize, suppress=True)

%matplotlib inline

# Set random variable for reproducibility

RS = 123

In [2]:
""" 
Load norms, vectors, annotations from annotations_csv.
Note: Vectors must be converted to np.array of float values
"""  

def load_norms_vectors(annotations_csv, kind='train'):
    
    # Load csv
    annotations_csv = pd.read_csv(annotations_csv, sep = "\t", names = ['norm', 'annotation', 'vector'], skipinitialspace=True)

    # Convert vectors from str representation to list[float]
    annotations_csv['vector'] = annotations_csv['vector'].apply(ast.literal_eval)
    
    # Create norms, vectors dataframe subset
    norms_vectors = annotations_csv[['norm', 'vector']]
    
    # Create annotations np.array subset
    annotations = np.array(annotations_csv['annotation'])
    
    return norms_vectors, annotations


#annotations_csv = "norms_vectors_word2vec_wiki_annotations.csv.vec100"
#annotations_csv = "norms_vectors_fasttext_wiki_annotations.csv.vec100"

#annotations_csv = "norms_vectors_word2vec_gigaword_annotations.csv.vec100"
#annotations_csv = "norms_vectors_fasttext_gigaword_annotations.csv.vec100"

annotations_csv = "norms_vectors_word2vec_ukwac_annotations.csv.vec100"
#annotations_csv = "norms_vectors_fasttext_ukwac_annotations.csv.vec100"

In [3]:
# Map annotations to integers and replace

def map_annotations(annotations):
    
    annotations_map = {x: index for index, x in enumerate(np.unique(annotations), start=0)}
    annotation = np.asarray([annotations_map[annotation] for annotation in annotations])
    return annotation

_, annotations = load_norms_vectors(annotations_csv)

In [4]:
# Load x_train, y_train to norms_vectors and annotations using above function

x_train, y_train = load_norms_vectors(annotations_csv, kind='train')

y_train = map_annotations(y_train)
norms_df = x_train['norm'].reset_index()

In [5]:
# Analyse data dimensions with shape attribute for x_train

norms = [norm for norm in x_train['norm'].values]

In [6]:
# Get unique subsets for norms, vectors, annotations

def get_unique_subset():
    
    x_subset_norm, x_subset_vector, y_subset = x_train['norm'].to_numpy(), x_train['vector'], y_train
    vectors_array = [vector for vector in x_subset_vector]    
    vectors_pca = np.array(vectors_array)
    
    return vectors_pca, y_subset

In [7]:
from sklearn.decomposition import PCA

vectors_pca, y_subset = get_unique_subset()

time_start = time.time()

pca = PCA(n_components = 2)

pca_result = pca.fit_transform(vectors_pca) # Transform of all 200 vectors

print('PCA done! Time elapsed: {} seconds'.format(time.time() - time_start))

PCA done! Time elapsed: 0.015656471252441406 seconds


In [8]:
""" View y_train: Should be format = array([],...)
    and contain every label of all data points. """

print(y_train)

print("Unique classes: {}".format(np.unique(y_train)))
print("Number of unique classes: {}".format(len(np.unique(y_train))))

[10  4  5  5 13 13  5 11  0 13  5  7  8  9 11 13  5  9  5  5  0  0 13  0
 13  3 13 11 13  4 13  4  0  2  5 11  9 13 13 12 11  4 12 12 10  4 12  8
  0  2  0  4  2 13 13  5 12 11  0  5  4 13 11 11  9 13  5 11  2 11 11  2
  0  2 11  5  5  2  9  5  4  4 11 10  4  3 11 13  9  5 13  5 11 13  5  8
  9 13  5  5  4  4  5  5  3  5  4  0  6  4  1 11  0  0  4  3 13  5  0  5
  8 13  5  0 13  5  5 13 11 11  4  5  4  5  5  5  5 11 13  8 11  4  2 11
  5  0 13 11  2 11 13 13  5  5  0  0  0 11  4  3  4 11  4  4  3 13  4 11
  1  9 13 13  4  4  5  3 13  4 12  0  5 12  4  6  4  4 11 11 11  3 10  5
  7  0  4  2 13]
Unique classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
Number of unique classes: 14


In [10]:
# Store the 4 principle components in a dataframe in 4 separate columns

pca_df = pd.DataFrame(columns = ['pca1','pca2','pca3','pca4','pca5','pca6','pca7','pca8','pca9','pca10'])

pca_df['pca1'] = pca_result[:,0]
pca_df['pca2'] = pca_result[:,1]
pca_df['pca3'] = pca_result[:,2]

IndexError: index 2 is out of bounds for axis 1 with size 2

In [11]:
# Setup Seaborn plotstyle

sns.set_style("darkgrid", {"axes.facecolor": ".9"})
sns.set_palette('muted')
sns.set_context("notebook", font_scale=2.2,
                rc={"lines.linewidth": 2.5})

def scatter_plot2(x, colors):
   
    # Choose color palette with seaborn 
    num_classes = len(np.unique(colors))
    palette = np.array(sns.color_palette("husl", num_classes))
    
    # Create labels and annotations and zip with palette
    annos = list(zip(palette, sorted(list(set(annotations)))))
    
    # Create scatter plot
    plt.figure(figsize=(30, 17))
    
    ax = plt.subplot(aspect='equal')
    ax.scatter(x[:,0], x[:,1], lw = 0, s = 40, c = palette[colors.astype(np.int)])
        
    for i, anno in enumerate(annos):
        plt.plot([], [], ' ', c = palette[i], marker = 'o', label=str(anno[1]).capitalize()) # str(anno[-1]) + ': ' +
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fontsize=22, frameon = 1, ncol=7)
    plt.title('TSNE Visualization for Word2Vec Vectors trained on ukWac Corpus', y=1.01)
    
    # Set lims
    plt.xlim(-15, 25)
    plt.ylim(-15, 25)  
    
    #Set axis
    ax.axis('tight')
    ax.grid(True)   
    
    # Annotate norms 
    for i, txt in enumerate(norms):
        plt.annotate(txt, (x[:,0][i], x[:,1][i]), fontsize=20)

In [None]:
# Take two top components

top_2_components = pca_df[['pca1', 'pca2']] # Shows per annotations (0-13)

# PCA using top 2 components

scatter_plot2(top_2_components.values, y_subset) 

In [None]:
def draw_vectors(transformed_features, components_, columns):
    """
    This funtion will project your *original* features
    onto your principal component feature-space, so that you can
    visualize how "important" each one was in the
    multi-dimensional scaling
    """

    num_columns = len(columns)

    # Scale the principal components by the max value in
    # the transformed set belonging to that component
    xvector = components_[0] * max(transformed_features[:,0])
    yvector = components_[1] * max(transformed_features[:,1])

    ax = plt.axes()

    for i in range(num_columns):
    # Use an arrow to project each original feature as a
    # labeled vector on your principal component axes
        plt.arrow(0, 0, xvector[i], yvector[i], color='b', width=0.0005, head_width=0.02, alpha=0.75)
        plt.text(xvector[i]*1.2, yvector[i]*1.2, list(columns)[i], color='b', alpha=0.75)

    return ax

In [None]:
from sklearn.manifold import TSNE

scatter_tsne = TSNE(random_state=RS).fit_transform(vectors_pca)

print('t-SNE done! Time elapse: {} seconds'.format(time.time()-time_start))

# TSNE using all 100 vector components

scatter_plot2(scatter_tsne, y_subset)

In [None]:
# See how to visualize this with regard to categories

pca_10 = PCA(n_components=10)

pca_result_10 = pca_10.fit_transform(vectors_pca)

print('PCA with 10 components done! Time elapsed: {} seconds'.format(time.time()-time_start))

print('Cumulative variaance explained by 6 principle components: {}'.format(np.sum(pca_10.explained_variance_ratio_)))

pca_result_10.shape