Imports.

In [1]:
import numpy as np

import pandas as pd

import gensim
from gensim.models import Word2Vec as w2v

from ast import literal_eval

import time

Load DataFrame.

In [2]:
chaps_df = pd.read_csv('../data/animorphs_chaps.csv')

# The token columns are stored in the csv as stringified lists;
# parse each one back into a real Python list.
for list_col in ('clean', 'clean_no_stops'):
    chaps_df[list_col] = chaps_df[list_col].apply(literal_eval)

chaps_df.head()

Unnamed: 0,book,chapter,text,authenticity,clean,clean_no_stops,vec_clean
0,1,1,"My name is Jake. That's my first name, obvious...",1,"[name, thats, first, name, obviously, cant, te...","[my, name, is, jake, thats, my, first, name, o...","[0.16399288, -0.05242784, 0.1072874, -0.055261..."
1,1,2,"""A flying saucer?"" Marco said. He did laugh. T...",1,"[flying, saucer, laugh, looked, could, feel, h...","[a, flying, saucer, marco, said, he, did, laug...","[0.23453264, -0.108955644, 0.12658253, -0.1080..."
2,1,3,<They have come to destroy you.>\nIt was stran...,1,"[come, destroy, strange, way, knew, telling, t...","[they, have, come, to, destroy, you, it, was, ...","[0.16289029, -0.19454873, 0.11443755, -0.12418..."
3,1,4,<Yeerks!>\nThe twin red lights slowed. They tu...,1,"[yeerks, twin, red, lights, slowed, turned, ci...","[yeerks, the, twin, red, lights, slowed, they,...","[0.2535453, -0.27677158, 0.14230801, -0.217424..."
4,1,5,"The Hork-Bajir pointed his gun, or whatever it...",1,"[hork, bajir, pointed, gun, whatever, around, ...","[the, hork, bajir, pointed, his, gun, or, what...","[0.23739086, -0.29221234, 0.1713094, -0.215393..."


A function to create or load a word vector model.

In [3]:
vsize=300 # size of all w2v vectors

def create_vectors(vector_file, col='clean', 
                   chaps_df=chaps_df, reset=False):
    """
    Load a saved Word2Vec model from `vector_file`, or train a new one
    on a corpus and save it there.

    Parameters
    ----------
    vector_file : str
        Path the model is loaded from and, after training, saved to.
    col : str, default 'clean'
        Column of `chaps_df` whose rows form the training corpus
        (one entry per row).
    chaps_df : pandas.DataFrame
        Frame holding the corpus. The default is bound to the
        module-level `chaps_df` at the moment this function is defined.
    reset : bool, default False
        If True, skip loading and always retrain, overwriting
        `vector_file`.

    Returns
    -------
    The loaded or newly trained Word2Vec model.
    """
    if not reset:
        try: # load word vectors
            return w2v.load(vector_file)
        except Exception:
            # No usable saved model: fall through and train a new one.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    # create word vectors
    t0 = time.time() # this might be a while

    # make corpus
    corpus = list(chaps_df[col])

    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it
    # to `vector_size` -- confirm against the installed version.
    w2vmodel = w2v(corpus,
                   size=vsize, # somewhat arbitrary, plecháč used 100 d
                   # could gridsearch over most of these params, but
                   # would take forever. instead, make informed decision
                   window=5,
                   min_count=2,
                   sg=0, # cbow #sg=1, # skipgram
                   workers=3)

    w2vmodel.save(vector_file)

    print('It took',time.time()-t0,'seconds to generate this model.')

    return w2vmodel

In [4]:
# location of the saved word vector model
vector_file = '../data/animorphs.vector'

# load the saved model if possible, otherwise train and save a new one
w2vmodel = create_vectors(vector_file=vector_file)

It took 15.5559663772583 seconds to generate this model.


In [5]:
# sanity check: the ten words the model scores as most similar to 'planet'
w2vmodel.wv.most_similar(positive=['planet'], topn=10)

[('earth', 0.9860408306121826),
 ('parasitic', 0.9683764576911926),
 ('invaded', 0.9607400894165039),
 ('destroyed', 0.9558642506599426),
 ('invasion', 0.9503868818283081),
 ('conquering', 0.9499269723892212),
 ('loser', 0.947063684463501),
 ('infested', 0.9470416903495789),
 ('kandrona', 0.9450056552886963),
 ('battle', 0.944513738155365)]

The next cell was heavily influenced by the work of Boom Devahastin Na Ayudhya, specifically the function `vectorize_corpus`, available [here](https://github.com/boom-deva/FEMA-Power-Outage-Hotspot-Detection/blob/master/code/4_Preprocessing-and-NLP-Modeling.ipynb).

Define a function to get the average vector of a list of words, ignoring unfamiliar words.

In [6]:
# influenced by boom's vectorize_corpus
def avg_vec_words(words, w2vmodel=w2vmodel):
    """
    Return the average word vector of a list of words.

    Words missing from the model's vocabulary are ignored rather than
    counted as zero vectors, so they do not pull the average toward
    the origin.

    Parameters
    ----------
    words : iterable of str
        Tokens to average.
    w2vmodel : Word2Vec model
        Model supplying the vectors. The default is bound to the
        module-level `w2vmodel` at the moment this function is defined.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known words' vectors. If no word is in
        the vocabulary (including an empty `words`), a zero vector of
        the model's dimensionality is returned instead of NaN, so
        callers always receive an array.
    """
    wv = w2vmodel.wv

    vecs = [wv.word_vec(w)    # vectorize word w
            for w in words    # for all words
            if w in wv.vocab] # that are in the vocab

    if not vecs:
        # np.mean([]) would return a scalar NaN with a RuntimeWarning,
        # which later breaks conversion of the vector to a list.
        return np.zeros(wv.vector_size)

    return np.mean(vecs, axis=0) # return avg of vecs

Vectorize each chapter.

In [7]:
# one averaged word vector per chapter, built from its 'clean' tokens
chaps_df['vec_clean'] = chaps_df['clean'].apply(avg_vec_words)

chaps_df.head()

Unnamed: 0,book,chapter,text,authenticity,clean,clean_no_stops,vec_clean
0,1,1,"My name is Jake. That's my first name, obvious...",1,"[name, thats, first, name, obviously, cant, te...","[my, name, is, jake, thats, my, first, name, o...","[0.050813198, 0.21546867, -0.11217794, 0.05301..."
1,1,2,"""A flying saucer?"" Marco said. He did laugh. T...",1,"[flying, saucer, laugh, looked, could, feel, h...","[a, flying, saucer, marco, said, he, did, laug...","[0.0059409393, 0.23846294, -0.09167185, 0.0067..."
2,1,3,<They have come to destroy you.>\nIt was stran...,1,"[come, destroy, strange, way, knew, telling, t...","[they, have, come, to, destroy, you, it, was, ...","[0.00516426, 0.25513598, -0.040639274, -0.0519..."
3,1,4,<Yeerks!>\nThe twin red lights slowed. They tu...,1,"[yeerks, twin, red, lights, slowed, turned, ci...","[yeerks, the, twin, red, lights, slowed, they,...","[-0.027513022, 0.24088845, 0.00715725, -0.0299..."
4,1,5,"The Hork-Bajir pointed his gun, or whatever it...",1,"[hork, bajir, pointed, gun, whatever, around, ...","[the, hork, bajir, pointed, his, gun, or, what...","[-0.037028935, 0.29484826, 0.030976577, -0.076..."


Create a DataFrame of two potential vectorizations of books.

`book_vec` is the simple average vectorization of every word in the book.
`avg_chap_vec` is the average of the vectorization of each chapter (which is itself the average vectorization of every word in the chapter), which would be inappropriate for normal use but might be relevant in chapter-based classification methods.

In [8]:
books_gb = chaps_df.groupby(by='book')

# Every column below is derived from the same groupby, so all of them
# follow the same group order. (Building `book_vec` from
# chaps_df['book'].unique() would follow first-appearance order and be
# attached positionally, mislabelling books whenever the chapter rows
# are not already sorted by book.)
books_df = pd.DataFrame(data={
    'book':books_gb['book'].mean(), # book number
    'book_vec':[avg_vec_words(
        [w
         for c in chapters  # every chapter in the book
         for w in c])       # every word in the chapter
        for _, chapters in books_gb['clean']], # book vector
    'avg_chap_vec':books_gb['vec_clean'].apply(
        lambda v: np.mean(v, axis=0)), # avg vector of chapters in book
    'authenticity':books_gb['authenticity'].mean() # authenticity
})

books_df.head()

Unnamed: 0_level_0,book,book_vec,avg_chap_vec,authenticity
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,"[0.0010895184, 0.2081184, -0.05931624, 0.01286...","[-0.0027950632, 0.20982857, -0.058019806, 0.01...",1
2,2,"[-0.00226549, 0.22271992, -0.065148115, 0.0113...","[-0.0009056398, 0.22276656, -0.06607905, 0.012...",1
3,3,"[-0.0129256295, 0.21558568, -0.05230921, 0.003...","[-0.015145328, 0.21650073, -0.0493573, 0.00182...",1
4,4,"[-0.0025382496, 0.21817772, -0.069260634, 0.00...","[-0.0047836, 0.21768926, -0.06756582, 0.005677...",1
5,5,"[-0.00024276103, 0.22412194, -0.05454963, 0.00...","[-0.0009346308, 0.22463025, -0.055034954, 0.00...",1


Because `.csv` files store every cell as text, `pandas` cannot round-trip array-valued columns directly: we convert the arrays to plain lists before saving, and then undo the transformation (parsing the text back into lists) when we load them.

In [9]:
# save np arrays as lists of plain Python floats for convenient csv use
def _vec_to_list(vec):
    """
    Convert a vector (array or list) to a list of built-in floats.

    `list(array)` keeps NumPy scalar elements, whose repr under NumPy 2
    is e.g. `np.float32(0.5)`; that text cannot be parsed back with
    `ast.literal_eval` after a csv round trip. `tolist()` yields plain
    Python floats, and `np.asarray` keeps the cell safe to re-run on
    values that are already lists.
    """
    return np.asarray(vec).tolist()

chaps_df['vec_clean'] = chaps_df['vec_clean'].map(_vec_to_list)
books_df['book_vec'] = books_df['book_vec'].map(_vec_to_list)
books_df['avg_chap_vec'] = books_df['avg_chap_vec'].map(_vec_to_list)

Save updated DataFrames.

In [10]:
# write both frames back to disk, leaving the index out of the files
for frame, csv_path in ((chaps_df, '../data/animorphs_chaps.csv'),
                        (books_df, '../data/animorphs_books.csv')):
    frame.to_csv(csv_path, index=False)