In [1]:
import pandas as pd
from glob import glob
from os import listdir, chdir
from re import compile as rcompile
from path import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
trex = rcompile('[^a-zA-Z 0-9]')
tokenize = lambda x: trex.sub('', x.lower().replace(',', ' ').replace('-', ' '))

def read_transcript(file_name):
    file_names = glob(f'{file_name}*')
    ret = ''
    for fname in file_names:
        with open(fname) as fd:
            ret += fd.read()+'\n\n'
    return ret

reference_transcripts_dir = '../data/transcripts'
google_transcripts_dir = '../data/google_transcripts'

reference_transcripts_files = listdir(reference_transcripts_dir)
google_transcripts_files = listdir(google_transcripts_dir)
reference_transcripts_files.sort()
google_transcripts_files.sort()

In [3]:
print(reference_transcripts_files)
print(google_transcripts_files)

['.ipynb_checkpoints', '3101.txt', '3102.txt', '3103.txt', '3104.txt', '3105.txt', '3106.txt', '3107.txt', '3108.txt', '3109.txt', '3110.txt', '3111.txt', '3112.txt', '3113.txt', '3114.txt', '3115.txt', '3116.txt', '3117.txt', '3118.txt', '3119.txt', '3120.txt', '3121.txt', '3122.txt', '3123.txt', '3124.txt', '3125.txt', '3126.txt', '3127.txt', '3128.txt', '3129.txt', '3130.txt', '3131.txt', '3132.txt', '3201.txt', '3202.txt', '3203.txt', '3204.txt', '3205.txt', '3206.txt', '3207.txt', '3208.txt', '3209.txt', '3210.txt', '3211.txt', '3212.txt', '3213 (missing page 2).txt', '3214.txt', '3215 (missing page 2).txt', '3216.txt', '3217.txt', '3218.txt', '3219.txt', '3220.txt', '3221.txt', '3222.txt', '3223.txt', '3224.txt', '3225.txt', '3226.txt', '3227.txt', '3228.txt', '3229.txt', '3230.txt', '3231.txt', '3232.txt', '3234.txt', '3235.txt', '3236.txt', '3237.txt', '3238.txt', '3239.txt', '3240.txt', '3241.txt', '3243.txt', '3244.txt', '3245.txt', '3246.txt', '3247.txt', '3248.txt', '5101.t

In [4]:
#Some of the reference transcripts have missing information and those files have
#filename with '(missing page <number>)' substrings - filter these files out for
#now.
rex = rcompile('[0-9]+.txt')
reference_transcripts_file_roots = [x.split('.')[0] for x in reference_transcripts_files if rex.match(x) is not None]
print(reference_transcripts_file_roots)

['3101', '3102', '3103', '3104', '3105', '3106', '3107', '3108', '3109', '3110', '3111', '3112', '3113', '3114', '3115', '3116', '3117', '3118', '3119', '3120', '3121', '3122', '3123', '3124', '3125', '3126', '3127', '3128', '3129', '3130', '3131', '3132', '3201', '3202', '3203', '3204', '3205', '3206', '3207', '3208', '3209', '3210', '3211', '3212', '3214', '3216', '3217', '3218', '3219', '3220', '3221', '3222', '3223', '3224', '3225', '3226', '3227', '3228', '3229', '3230', '3231', '3232', '3234', '3235', '3236', '3237', '3238', '3239', '3240', '3241', '3243', '3244', '3245', '3246', '3247', '3248', '5101', '5102', '5103', '5104', '5105', '5106', '5107', '5108', '5109', '5110', '5111', '5112', '5113', '5114', '5115', '5116', '5117', '5118', '5119', '5120', '5121', '5122', '5123', '5124', '5125', '5126', '5129', '5130', '5131', '5132', '5202', '5203', '5204', '5205', '5206', '5207', '5208', '5209', '5210', '5213', '5214', '5215', '5216', '5217', '5218', '5219', '5220', '5221', '5222',

In [5]:

with Path(reference_transcripts_dir):
    reference_transcripts = [read_transcript(f'{x}.txt') for x in reference_transcripts_file_roots]

print(len(reference_transcripts),'\n', reference_transcripts[0],'\n', reference_transcripts[1])

with Path(google_transcripts_dir):
    google_transcripts = [read_transcript(x) for x in reference_transcripts_file_roots]
    
print(len(google_transcripts),'\n', google_transcripts[0],'\n', google_transcripts[1])

reference_transcripts = list(map(tokenize, reference_transcripts))
print(len(reference_transcripts),'\n', reference_transcripts[0],'\n', reference_transcripts[1])

google_transcripts = list(map(tokenize, google_transcripts))
print(len(google_transcripts),'\n', google_transcripts[0],'\n', google_transcripts[1])


165 
 Once apon a time there was a girl named Mary. On a warm 
sunny day Mary was walking through the woods near hear nouse 
to look for some critters to take pictures of. She loved animals 
and nature all her life even though she was only nine years old 
She thinks that she is going to die soon. She does go to school 
but she isn't that smart. For example Mary recycled a pie 
even though the pie was not even bitten. Another thing she did 
was leave her muffin in the woods while she was traveling with  
her family to Andrea's house which is one of Mary's friends at 
school and Mary's muffin was eaten by a bear that sniffed it :q!
:
from far away. Yes. Mary could be a little weird but that is 
just how she is. Back to the real story now. Mary found a 
fox, took a picture she also found a rabbit took a picture 
she found a squirrel and took a picture before it ran away. 
But sooner or later she saw two eyes peeking through a bush 
on the side of the path she was walking on. Then the 
cre

In [6]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english',
                        ngram_range=(1,2),
                        min_df=1
                       )
# Create a vocabulary and get word counts per document
ref_dtm = tfidf.fit_transform(reference_transcripts)

features = tfidf.get_feature_names()
#display(len(features), features[:50])

# Get feature names to use as dataframe column headers
ref_dtm = pd.DataFrame(ref_dtm.todense(), columns=features)
print(ref_dtm.shape)
display(ref_dtm.head())


(165, 22296)


Unnamed: 0,000,000 000,000 5203,000 math,000 rocket,000 tommorow,000 words,0001,0001 5216,10,...,zoes accound,zoes friend,zoes social,zoomed,zoomed planted,zoomed shop,zrm,zrm jedis,zzzzzz,zzzzzz wake
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052521,0.052521
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
ishape = ref_dtm.shape[1]
# Create Model 
input_img = Input(shape=(ishape, ))

x = Dense(512)(input_img)

x = Dense(256)(x)

x = Dense(128)(x)

encoded = Dense(64)(x)

x = Dense(128)(encoded)

x = Dense(256)(x)

x = Dense(512, activation='sigmoid')(x)
decoded = Dense(ishape, activation='sigmoid')(x)


rmodel = Model(input_img, decoded)
rmodel.compile(loss='mse', optimizer=Adam(learning_rate=0.01))

rmodel.fit(ref_dtm, ref_dtm, batch_size=512, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f9459f0d370>

In [8]:
encoder = Model(input_img, encoded)

encoded_ref_dtm = encoder.predict(ref_dtm)
print(encoded_ref_dtm.shape)
display(encoded_ref_dtm)

(165, 64)


array([[ 1.4990647 ,  0.98012966, -1.3534468 , ...,  0.470708  ,
        -0.90270114,  0.84063077],
       [ 1.5463108 ,  0.99925774, -1.348777  , ...,  0.521043  ,
        -0.92188436,  0.85927   ],
       [ 1.4620095 ,  0.94183564, -1.2930222 , ...,  0.48530713,
        -0.88439316,  0.796235  ],
       ...,
       [ 1.4187019 ,  0.8924032 , -1.238866  , ...,  0.4494997 ,
        -0.8336204 ,  0.79506457],
       [ 1.4080077 ,  0.8775796 , -1.2466266 , ...,  0.47411922,
        -0.82748294,  0.7817882 ],
       [ 1.3298484 ,  0.87903994, -1.1690811 , ...,  0.46021232,
        -0.7725303 ,  0.7473995 ]], dtype=float32)

In [9]:
def get_vector(doc):
    vec = tfidf.transform([tokenize(doc)]).todense()
    return encoder.predict(vec)

In [10]:
print(get_vector(google_transcripts[0]))

[[ 1.5016049   0.97205496 -1.3459889   0.4551152  -1.7292242   1.2013485
  -1.4958045   1.8624494  -0.9326757  -0.954323    1.1047379   0.12093814
   1.5400263   0.31621033 -1.8757991  -0.49392912 -0.43462443  2.2810898
  -0.49175274 -0.5201553  -1.1880847  -0.6612964   0.63556176  1.6796058
   1.7924833   1.5975866   1.5442698   0.8153668   1.9369153   1.3197103
  -1.2814026   0.6152996   0.26179093 -1.0687301   0.30968785 -0.98631656
  -0.31112486 -0.4294312  -1.6195368   1.4637537  -0.40564954  1.796835
  -0.77639407  0.27288294  0.33176598 -1.1098183   1.285609    0.00735259
  -1.2460188  -0.13671783  2.0164964   1.4723079  -1.4743842  -0.19632255
  -0.64653987  0.09914749 -0.48715913  0.51950663  0.03424577 -0.4790276
   0.9096251   0.47012222 -0.8985676   0.83741385]]


In [11]:
def get_encoded_dtm(transcripts):
    dtm = tfidf.transform(transcripts).todense()
    print(dtm.shape)
    encoded_dtm = encoder.predict(dtm)
    print(encoded_dtm.shape)
    display(encoded_dtm)
    return encoded_dtm

encoded_google_dtm = get_encoded_dtm(google_transcripts)
print(encoded_google_dtm.shape)
display(encoded_google_dtm)

(165, 22296)
(165, 64)


array([[ 1.501605  ,  0.972055  , -1.3459889 , ...,  0.47012237,
        -0.89856744,  0.83741385],
       [ 1.568002  ,  0.9906531 , -1.3552194 , ...,  0.5056087 ,
        -0.9083303 ,  0.88360643],
       [ 1.4836593 ,  0.94291997, -1.3041081 , ...,  0.48810276,
        -0.87959397,  0.8221138 ],
       ...,
       [ 1.4152523 ,  0.890223  , -1.2332716 , ...,  0.4497346 ,
        -0.842205  ,  0.7872576 ],
       [ 1.4309719 ,  0.8889978 , -1.2674681 , ...,  0.4814803 ,
        -0.8408581 ,  0.7974877 ],
       [ 1.2677284 ,  0.83859813, -1.0655831 , ...,  0.4182424 ,
        -0.7452989 ,  0.67814314]], dtype=float32)

(165, 64)


array([[ 1.501605  ,  0.972055  , -1.3459889 , ...,  0.47012237,
        -0.89856744,  0.83741385],
       [ 1.568002  ,  0.9906531 , -1.3552194 , ...,  0.5056087 ,
        -0.9083303 ,  0.88360643],
       [ 1.4836593 ,  0.94291997, -1.3041081 , ...,  0.48810276,
        -0.87959397,  0.8221138 ],
       ...,
       [ 1.4152523 ,  0.890223  , -1.2332716 , ...,  0.4497346 ,
        -0.842205  ,  0.7872576 ],
       [ 1.4309719 ,  0.8889978 , -1.2674681 , ...,  0.4814803 ,
        -0.8408581 ,  0.7974877 ],
       [ 1.2677284 ,  0.83859813, -1.0655831 , ...,  0.4182424 ,
        -0.7452989 ,  0.67814314]], dtype=float32)

In [12]:
cosine_similarity([encoded_ref_dtm[0]], [encoded_google_dtm[0]])

array([[0.99999493]], dtype=float32)

In [13]:
def get_cossim(encoded_dtm):
    cossim = []
    for idx,ref in enumerate(encoded_ref_dtm):
        cossim.append(cosine_similarity([ref], [encoded_dtm[idx]])[0][0]) 
    return cossim

cossim = get_cossim(encoded_google_dtm)
print(cossim)
mean_cossim = sum(cossim)/len(cossim)
print(mean_cossim, round((1-mean_cossim)*1e5,4))

[0.99999493, 0.99997103, 0.99994373, 0.9999001, 0.999973, 0.99998695, 0.99997216, 0.9999799, 0.9999726, 0.9999608, 0.9999876, 0.9999627, 0.99996996, 0.99997437, 0.99995995, 0.9999772, 0.99991226, 0.999967, 0.9999092, 0.99998283, 0.9999788, 0.99999326, 0.99998426, 0.9999369, 0.9999989, 0.99997264, 0.9999955, 0.999931, 0.99997985, 0.9999907, 0.9999861, 0.9999834, 0.99994993, 0.99998045, 0.99998945, 0.9999855, 0.99999464, 0.9999825, 0.9999807, 0.9999834, 0.99999326, 0.99997795, 0.9999818, 0.9999683, 0.99999285, 0.9999257, 0.9999329, 0.999961, 0.99999034, 0.99997365, 0.9999645, 0.9999719, 0.9999815, 0.9999936, 0.99999213, 0.9999685, 0.99999094, 0.9998902, 0.99998355, 0.9999846, 0.99999475, 0.9999913, 0.9999827, 0.999873, 0.99978524, 0.9999881, 0.9999867, 0.9999946, 0.9999005, 0.999997, 0.99996835, 0.9999763, 0.9999872, 0.9999633, 0.99999243, 0.9999983, 0.99996984, 0.99996436, 0.9999803, 0.9999384, 0.9999704, 0.99999404, 0.999989, 0.9999848, 0.9999418, 0.9999503, 0.9999826, 0.99999356, 0.99

In [14]:
from joblib import dump
MODELS_DIR = '../models/'

TFIDF = MODELS_DIR+'tfidf.pkl'
ENCODED_REF_DTM = MODELS_DIR+'encoded_ref_dtm.pkl'
ENCODER = MODELS_DIR+'encoder.h5'
FILE_ROOTS = MODELS_DIR+'file_roots.pkl'

dump(tfidf, TFIDF)
encoder.save(ENCODER)
dump(encoded_ref_dtm, ENCODED_REF_DTM)
dump(reference_transcripts_file_roots, FILE_ROOTS)

['../models/file_roots.pkl']