In [1]:
import gensim as gn
import numpy as np
import pandas as pd
import swifter

In [2]:
from tqdm.notebook import tqdm

In [3]:
#For your purpose, all that is needed to be done is to initialize the model
#Make sure you downloaded the word2vec-sentences-of-reports.model that is within the shared google drive
model = gn.models.Word2Vec.load("word2vec-sentences-of-reports.model")

In [1]:
#using new dataframe from Daniel

In [4]:
df = pd.read_csv("processed_new.csv")

In [2]:
#def getVector gets us the vector of a word found within the word2vec model. We set an array of 100 zeros if it does not exist within the model to not mess up numpy calculations

In [6]:
def getVector(word):
    try:
        vec = model.wv[word]
        return vec
    except:
        vec = np.zeros(100)          #set array of zeros to not mess with calculations
        return vec

In [3]:
#using the weights we gained from tfidf, we check if the dictionary of weights for that sentence holds the word. return the tfidf for that word in the setnence else, return 0

In [7]:
def getWeight(word, weights):
    try:
        return weights[word]
    except:
        return 0

In [4]:
#the following block is just us initializing the pickled wordweight file that holds our tfidf values

In [9]:
import pickle
with open("wordWeights", "rb") as fp:   # Unpickling
    wordWeights = pickle.load(fp)

In [14]:
wordWeights

[[('concerned', 0.5638089776039124),
  ('phone', 0.49839696288108826),
  ('spoke', 0.5584638118743896),
  ('twr', 0.3490530848503113)],
 [('alt', 0.14173442125320435),
  ('apch', 0.12614770233631134),
  ('approx', 0.30283322930336),
  ('bay', 0.230533629655838),
  ('chart', 0.20041970908641815),
  ('contact', 0.16111139953136444),
  ('ctl', 0.15174627304077148),
  ('current', 0.20623920857906342),
  ('freq', 0.16819427907466888),
  ('mi', 0.1494668573141098),
  ('nw', 0.22639894485473633),
  ('proceeded', 0.1768011450767517),
  ('san', 0.6872854828834534),
  ('tca', 0.22865056991577148),
  ('vor', 0.17225180566310883)],
 [('changed', 0.41996049880981445),
  ('ctlr', 0.3126370906829834),
  ('freqs', 0.5156309008598328),
  ('quickly', 0.4285743534564972),
  ('routine', 0.5256805419921875)],
 [('apch', 0.2026223987340927),
  ('coming', 0.31288543343544006),
  ('contact', 0.25878217816352844),
  ('ctlr', 0.2267381101846695),
  ('land', 0.2574431598186493),
  ('nearly', 0.3869044780731201),

In [16]:
wordWeights[0]

[('concerned', 0.5638089776039124),
 ('phone', 0.49839696288108826),
 ('spoke', 0.5584638118743896),
 ('twr', 0.3490530848503113)]

In [7]:
#def getSenVec takes in the current row as its parameter. This is doen to obtain both the current index as well as the current sentence within the dataframe that is using .apply()

#also, the purpose of this funciton is to ultimately get the "vector representation" for a sentence. We accomplish this with 3 steps
# 1. Get the word vector for each word (that exists in our word2vec model)
# 2. Get the scale of each word (that exists within the tfidf dictionary) and multiply the word vector by it
# 3. Sum up each of the scaled word vectors and average them by dividing by the amount of words present in the sentence.

#we then return this vector

In [10]:
def getSenVec(row):

    thisSentence = row.sentence
    index = row.name           #Grab the INDEX



    weights = {}
    sentenceRep = []           #this will be our representive vector for a sentence
    currentPos = 0             #beginning of the sentence
    sentenceLength = len(thisSentence)
    
    for pair in wordWeights[index]:                                         #HOLD WEIGHT DICTIONARY
        weights[pair[0]] = pair[1]

    for word in thisSentence.split():                                      #We have to initialize our sentence rep first and then we can do math after
        if currentPos==0:
            sentenceRep = getVector(word) * getWeight(word,weights)                          #We have to convert word vector to a series 
        else:
            scaledVec = getVector(word) * getWeight(word,weights)
            
        if currentPos != 0:
            sentenceRep = sentenceRep + scaledVec               #Update vector by addition
            
        currentPos += 1

    sentenceRep = sentenceRep / sentenceLength              #we divide the vector by the amount of words in the sentence to avg
    return sentenceRep                                     #we will return the sentence vector back as an array

In [11]:
dftesting = df

In [9]:
#so, now with the function being created, we will use .apply(function=getSenVec, axis=1 <- for rows)

#.swifter is used to optimize the application as much as possible as well as providing us with a progress bar as well

In [13]:
dftesting['SentenceVec'] = df.swifter.apply(getSenVec,axis=1)

Pandas Apply:   0%|          | 0/2328179 [00:00<?, ?it/s]

In [10]:
#what we result in is with a data frame that still holds the original data frame as well as the sentence vectors of each of them in a respective column.

In [124]:
dftesting

Unnamed: 0,sentence,ACN,positive_sentiment_score,neutral_sentiment_score,negative_sentiment_score,SentenceVec
0,gentleman spoke lindbergh tower phone polite s...,75085,0.814141,,,"[-0.04881222, -0.1671472, -0.0915354, 0.028442..."
1,approximately mile north west mission bay vor ...,75085,,0.033765,,"[0.1053383, -0.17976266, -0.08720298, 0.027233..."
2,approximately attempts able reach controller,75085,,0.256700,,"[-0.2270008, -0.100482486, -0.020965075, -0.10..."
3,without hesitation quickly changed frequencies...,75085,,0.139586,,"[-0.019198608, -0.029624831, 0.035234645, 0.11..."
4,trying contact third san approach controller n...,75085,,0.046918,,"[-0.021005306, -0.08504546, -0.077642426, -0.0..."
...,...,...,...,...,...,...
2329217,let us fly approach vectored commuter front us...,1925323,,0.246761,,"[0.10059753, -0.18382819, -0.18872885, -0.1427..."
2329218,could avoided provided information prior conta...,1925323,,0.039105,,"[-0.062782235, -0.042367324, 0.059841324, -0.1..."
2329219,digital atis need clearly tell crews approach ...,1925323,,0.042575,,"[0.2530189, 0.30727804, 0.13685602, -0.0440030..."
2329220,multiple aircraft landed multiple years fbo fr...,1925480,,0.054096,,"[0.0880418, -0.27884012, -0.09044636, 0.212590..."


In [11]:
#dftesting.to_pickle("./weightedDF&SenVec.pkl")             we save this dataframe just in case

In [12]:
#NOW, in order for us to be able to use sklearn libraries, we must reformat the SentenceVec column to be a matrix (ie dataframe) of its own

In [None]:
#initialize the dataframed we created
file = open('weightedDF&SenVec.pkl', 'rb')
weightedDBSenVec = pkl.load(file)

In [None]:
#hold only the column that holds the sentence vectors
data = weightedDBSenVec['SentenceVec']

In [None]:
list = []                #will hold our vectors
columns = []             #will hold our columns which in this case is 0 through 99

#why we are doing this is so that we can later apply a dataframe function that automatically formats itself by taking lists as row x col

In [None]:
for vec in data:                              #append vectors
    list.append(vec.tolist())

col = np.arange(100).tolist()                 #append 0 through 99

In [None]:
matrix = pd.DataFrame(list, columns = col)   #function that automatically formats dataframe

In [None]:
#matrix                                       #just checking if correct

In [None]:
matrix['sentence'] = weightedDBSenVec['sentence']         
#for future purposes, we want to hold the original sentences also within our matrix, just have to make sure to only allow sklearn to see columns 0-99 and not column 100 which will
#hold the original sentence (rn its the "fixed" sentence but we will fix this issue soon)

In [None]:
#matrix.to_pickle("./WeightedMatrixSenVec.pkl")          we saved this matrix as a file already so it is able to be initialized in other files

In [None]:
#The matrix should allow us to obtain better clusters