### Imports and setup

In [None]:
# imports
import pickle
import pandas as pd

# setup for logging
import logging
from datetime import datetime

# write logs with time to log folder
import os

# logging does not expand '~' on its own, so resolve it to the user's home
# directory first; otherwise the handler would try to open a literal
# '~/log/...' path relative to the working directory.
LOG_FILENAME = os.path.expanduser(
    datetime.now().strftime('~/log/logfile_%H_%M_%S_%d_%m_%Y.log'))

# the file handler cannot create missing directories, so make sure it exists
os.makedirs(os.path.dirname(LOG_FILENAME), exist_ok=True)

# basicConfig is a no-op when the root logger already has handlers (e.g. when
# the cell is re-run), so drop any existing ones first
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)

# open file
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced by a trusted preprocessing run.
with open('df_preprocessed.pkl', 'rb') as f:
    datastore = pickle.load(f)

# make dataframe
df_in = pd.DataFrame(datastore)

# arrangements: 'slot' omitted - that's the 2 hour windows in the json
cols = ['idx', 'text', 'timestamp', 'ts', 'combined', 'ner_all', 'documents']
# take an explicit copy so the column assignment below modifies an independent
# frame instead of a slice of df_in (avoids pandas' chained-assignment warning)
df = df_in[cols].copy()

# replace values in timestamp with proper timestamp readable by Python
df['timestamp'] = df['timestamp'].astype('datetime64[ns]')

# sort by timestamp
df = df.sort_values('ts', ascending=True)

logging.info('Dataframe created')

### Split into 1 h time slots

In [None]:
# make slots
# tmin and tmax are 1,209,600 apart (14 days of seconds - presumably Unix
# epoch timestamps like 'ts'), which gives 336 one-hour bins.
tmin = 1369699200
tmax = 1370908800

# bin edges every 3600 s, with tmax itself included as the final edge
bins = list(range(tmin, tmax + 3600, 3600))
# one integer label per bin: 0 .. len(bins) - 2
labels = list(range(len(bins) - 1))

# include_lowest=True keeps rows whose ts equals tmin inside slot 0
df['slot'] = pd.cut(df['ts'], bins=bins, labels=labels, include_lowest=True)

# function that splits df by time frames
def splitter(frame):
    """Split ``frame`` into one sub-frame per slot label.

    Args:
        frame: DataFrame with a ``slot`` column holding integer slot labels.

    Returns:
        A list whose element ``i`` contains the rows of ``frame`` with
        ``slot == i``, for every ``i`` from 0 through the largest slot present
        in ``frame``. Slots without rows yield an empty DataFrame, so list
        positions stay aligned with slot numbers. An empty list is returned
        when ``frame`` has no (non-missing) slot values.
    """
    # Take the upper bound from the argument rather than the module-level
    # ``df``, so the function works for any frame it is handed.
    top_slot = frame['slot'].max()
    if pd.isna(top_slot):
        return []
    return [frame.loc[frame['slot'] == slot] for slot in range(int(top_slot) + 1)]

# apply
# frames[i] holds the rows of df that fell into slot i
frames = splitter(df)

logging.info('Split complete')

### Vectorize

In [None]:
# imports
import sklearn
import sklearn.feature_extraction

# generates the arrays
def vectorboy_arrays(lizt):
    """Build a dense term-count matrix for every slot frame in ``lizt``.

    Each item must expose ``.shape`` and a ``documents`` column of strings.
    A separate CountVectorizer is fitted per item; a term is kept only if it
    occurs in at least ``max(int(0.0025 * n_rows), 8)`` documents of that item.

    Returns:
        A list with one dense count array per item, in input order.
    """
    def _dense_counts(slot_frame):
        # minimum document frequency: 0.25 % of the slot's rows, but never below 8
        threshold = max(int(slot_frame.shape[0] * 0.0025), 8)
        counter = sklearn.feature_extraction.text.CountVectorizer(min_df=threshold)
        texts = slot_frame.documents.tolist()
        return counter.fit_transform(texts).toarray()

    return [_dense_counts(slot_frame) for slot_frame in lizt]

# generates the corresponding vocabularies
def vectorboy_vocabs(lizt):
    """Fit a CountVectorizer per slot frame and collect the vocabularies.

    Uses the same ``min_df`` rule as ``vectorboy_arrays``
    (``max(int(0.0025 * n_rows), 8)``) on the same ``documents`` column.

    Args:
        lizt: Iterable of DataFrames, each with a ``documents`` column of
            strings.

    Returns:
        A list with one ``vocabulary_`` dict (term -> column index) per item,
        in input order.
    """
    vocabs = []
    for item in lizt:
        num = max(int(item.shape[0] * 0.0025), 8)
        vectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df=num)
        # Only the learned vocabulary is needed here, so just fit: this skips
        # the dense .toarray() copy that was previously built and discarded.
        vectorizer.fit(item.documents.tolist())
        vocabs.append(vectorizer.vocabulary_)
    return vocabs

# get vector matrix arrays list
# one dense count matrix per slot frame
rays = vectorboy_arrays(frames)

# get slot vocabs
# note: this fits a second vectorizer per slot with the same settings
vocabs = vectorboy_vocabs(frames)

logging.info('Vector matrices success')

### Scaling & Normalizing

In [None]:
# scaling and normalizing the vector matrices
from sklearn import preprocessing
import numpy as np

def scaler(lizt):
    """Min-max scale every matrix in ``lizt`` to the range [0, 1].

    A fresh MinMaxScaler is fitted on each matrix independently, and the
    result is stored as float16.

    Returns:
        A list of scaled float16 arrays, in input order.
    """
    def _rescale(matrix):
        min_max = preprocessing.MinMaxScaler(feature_range=(0, 1))
        return min_max.fit_transform(matrix).astype('float16')

    return [_rescale(matrix) for matrix in lizt]

def normalizer(lizt):
    """Normalize every matrix in ``lizt`` with ``preprocessing.normalize``.

    The library defaults are used for the norm and axis, and each result is
    stored as float16.

    Returns:
        A list of normalized float16 arrays, in input order.
    """
    return [preprocessing.normalize(matrix).astype('float16') for matrix in lizt]

# scaling
# min-max scale each slot's count matrix
rays_scaled = scaler(rays)

# normalizing
# normalize the already-scaled matrices ("sn" = scaled + normalized)
rays_sn = normalizer(rays_scaled)

logging.info('Normalizing success')

### Compute pairwise cosine similarity

In [None]:
# now pairwise similarity by cosine distance
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

def cosine_sim(lizt):
    """Compute a pairwise cosine distance matrix for every array in ``lizt``.

    Despite the name, each result is ``1 - cosine_similarity``, i.e. a
    distance: 0 for rows pointing in the same direction.

    Args:
        lizt: Iterable of 2-D numeric arrays (one row per document).

    Returns:
        A list of square float16 arrays, one per input array, in input order.
    """
    output = []
    for array in lizt:
        # copy=False avoids duplicating matrices that are already float16
        half = array.astype('float16', copy=False)
        # call the function imported alongside this definition instead of
        # reaching through the ``sklearn`` package namespace
        distance = 1 - cosine_similarity(half)
        output.append(distance.astype('float16'))
    return output

# partitioning data for memory efficiency
# the slot matrices are processed in three batches: [0, 92), [92, 200), [200, end)
rays1, rays2, rays3 = rays_sn[:92], rays_sn[92:200], rays_sn[200:]

In [None]:
# PARTITION 1
# cosine distance matrices for the first batch of slots (rays1)

import time

# start timer
t0 = time.time()

# apply function
result1 = cosine_sim(rays1)

# record the time
t1 = time.time()
# elapsed wall-clock seconds for this partition
print("Time: {}".format(t1-t0))

logging.info('Partition 1 success')

In [None]:
# PARTITION 2
# cosine distance matrices for the second batch of slots (rays2)

# start timer
t0 = time.time()

# apply function
result2 = cosine_sim(rays2)

# record the time
t1 = time.time()
# elapsed wall-clock seconds for this partition
print("Time: {}".format(t1-t0))

# log completion the same way partition 1 does, so the log file shows progress
logging.info('Partition 2 success')

In [None]:
# PARTITION 3
# cosine distance matrices for the last batch of slots (rays3)

# start timer
t0 = time.time()

# apply function
result3 = cosine_sim(rays3)

# record the time
t1 = time.time()
# elapsed wall-clock seconds for this partition
print("Time: {}".format(t1-t0))

# log completion the same way partition 1 does, so the log file shows progress
logging.info('Partition 3 success')

In [None]:
# AGGREGATE & File dump

# aggregate
# the partitions were contiguous slices of rays_sn, so concatenating them in
# order keeps list position == slot index
rays_cs = result1 + result2 + result3 

# saving arrays separately - very large files
def saver(lizt, out_dir='/home/wgrambozambo/arrays1/'):
    """Save every array in ``lizt`` to its own ``.npy`` file.

    Files are named ``array<index>.npy``, where ``<index>`` is the array's
    position in ``lizt``.

    Args:
        lizt: Iterable of numpy arrays to write.
        out_dir: Directory that receives the files. It is created if missing.
            Defaults to the location this notebook has always written to.
    """
    import os

    # np.save does not create directories, so make sure the target exists
    os.makedirs(out_dir, exist_ok=True)
    for index, item in enumerate(lizt):
        np.save(os.path.join(out_dir, 'array' + str(index) + '.npy'), item)
    print("Job complete, files in {}".format(out_dir))

# dump
# writes one .npy file per slot distance matrix
saver(rays_cs)

logging.info('Arrays saved in arrays1')

In [None]:
# frames, rays, and vocabs dump
# Each `with` block closes its file on exit, so no explicit close() is needed.

import pickle

# per-slot dataframes
with open('frames.pkl', 'wb') as f:
    pickle.dump(frames, f)

logging.info('List of dataframes by time slot saved as frames.pkl')

# per-slot count matrices (unscaled)
with open('rays.pkl', 'wb') as f:
    pickle.dump(rays, f)

logging.info('List of slot arrays saved as rays.pkl')

# per-slot vocabularies
with open('vocabs.pkl', 'wb') as f:
    pickle.dump(vocabs, f)

logging.info('List of slot vocabs saved as vocabs.pkl')