### Imports and setup

In [None]:
# imports
import pickle
import pandas as pd
import time

# setup for logging
import logging
from datetime import datetime

# write logs with time to log folder
# NOTE: the log directory is hard-coded and must already exist; basicConfig
# opens the file immediately and raises if the folder is missing.
LOG_FILENAME = datetime.now().strftime('/home/wgrambozambo/log/logfile_%H_%M_%S_%d_%m_%Y.log')

# basicConfig is a no-op while the root logger still has handlers (e.g. from
# an earlier run of this cell), so clear them first
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)

# open file
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles written by a trusted step of this pipeline.
with open('data_clustered.pkl', 'rb') as f:
    df = pickle.load(f)
# (the with-statement has already closed the file; no explicit close needed)

# sort by timestamp
df = df.sort_values('ts', ascending=True)

logging.info('Dataframe ready')

### Split into time slots

In [None]:
# function that splits df by time frames

def splitter(frame):
    """Split ``frame`` into one DataFrame per time slot.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an integer ``'slot'`` column.

    Returns
    -------
    list of pandas.DataFrame
        Entry ``i`` holds the rows whose slot equals ``i``, for every ``i``
        from 0 up to the largest slot in ``frame``. Slots without rows yield
        an empty DataFrame so list positions stay aligned with slot ids.
        An empty ``frame`` yields an empty list.
    """
    if frame.empty:
        return []
    # take the upper bound from the argument itself, not from a global frame
    last_slot = frame['slot'].max()
    return [frame.loc[frame['slot'] == slot] for slot in range(last_slot + 1)]

# split into frames: list position i holds the rows of slot i
frames = splitter(df)

logging.info('Dataframe split')

### Vectorize
This needs to be redone because of the clusterids

In [None]:
import sklearn
import sklearn.feature_extraction

def vectorboy_arrays(lizt):
    """Return one dense term-count matrix per DataFrame in ``lizt``.

    Each frame's ``documents`` column is vectorized with a fresh
    ``CountVectorizer`` whose ``min_df`` is 0.25% of the frame's row count
    (truncated to an int), but never less than 8.
    """
    def _count_matrix(frame):
        min_docs = max(int(frame.shape[0] * 0.0025), 8)
        counter = sklearn.feature_extraction.text.CountVectorizer(min_df=min_docs)
        texts = frame.documents.tolist()
        return counter.fit_transform(texts).toarray()

    return [_count_matrix(frame) for frame in lizt]

def vectorboy_vocabs(lizt):
    """Return the vocabulary (list of terms) for each DataFrame in ``lizt``.

    Uses the same vectorizer settings as ``vectorboy_arrays`` so that the
    i-th term corresponds to the i-th column of the matching count matrix:
    ``min_df`` is 0.25% of the frame's rows (truncated), at least 8.

    Returns
    -------
    list of list of str
    """
    vocabs = []
    for item in lizt:
        num = max(int(item.shape[0] * 0.0025), 8)
        vectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df=num)
        # only the vocabulary is needed, so skip building the dense matrix
        vectorizer.fit(item.documents.tolist())
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it
        if hasattr(vectorizer, 'get_feature_names_out'):
            names = vectorizer.get_feature_names_out().tolist()
        else:
            names = vectorizer.get_feature_names()
        vocabs.append(names)
    return vocabs

# generate slot vector matrices list (one dense count matrix per slot frame)
rays = vectorboy_arrays(frames)

# generate corresponding vocabs list (terms naming the matrix columns)
# NOTE: this fits a second vectorizer per slot with the same settings
vocabs = vectorboy_vocabs(frames)

logging.info('Arrays and vocabs done')

### Compute df-idf_t weights

In [None]:
# df idft
from collections import OrderedDict
import time 
import numpy as np
import math

def tf(voc, arr):
    """Pair every vocabulary with the column sums of its count matrix.

    ``voc`` and ``arr`` are walked in lockstep; for each pair the matrix is
    summed over axis 0 and zipped with the terms.

    Returns
    -------
    list of OrderedDict
        One ``term -> total count`` mapping per (vocabulary, matrix) pair.
    """
    return [
        OrderedDict(zip(terms, np.sum(matrix, axis=0).tolist()))
        for terms, matrix in zip(voc, arr)
    ]

# start timer
t0 = time.time()

# simple term frequencies
# NOTE: this rebinds the name `tf` from the function to its result, so this
# cell cannot be re-run without first re-running the function definition
tf = tf(vocabs, rays)

# previous df average per word in vocab of time slot
def tf_pre(lizt):
    """Average each term's frequency over the four preceding slots.

    Parameters
    ----------
    lizt : list of dict
        Per-slot ``term -> frequency`` mappings in slot order.

    Returns
    -------
    list of dict
        For slot ``i >= 4``: every term of slot ``i`` mapped to the mean of
        its frequency in slots ``i-1 .. i-4`` (a missing term counts as 0).
        For the first four slots every term maps to 0.
    """
    history = []
    for position, current in enumerate(lizt):
        if position >= 4:
            lookback = [lizt[position - step] for step in (1, 2, 3, 4)]
            averaged = {}
            for term in current:
                n1, n2, n3, n4 = (earlier.get(term, 0) for earlier in lookback)
                averaged[term] = (n1 + n2 + n3 + n4) / 4
        else:
            # not enough history yet
            averaged = dict.fromkeys(current, 0)
        history.append(averaged)
    return history

# term frequencies in previous slots (mean over the four preceding slots,
# 0 for the first four slots)
previous = tf_pre(tf)

def dfidft(now, prev):
    """Weight current term frequencies against their recent history.

    For every term of every slot the weight is
    ``(tf + 1) / (log(ptf + 1) + 1)`` where ``tf`` is the value in ``now``
    and ``ptf`` the value for the same term in ``prev``.

    Parameters
    ----------
    now, prev : list of dict
        Parallel lists; each ``prev`` dict must contain every term of the
        matching ``now`` dict.

    Returns
    -------
    list of dict
        One ``term -> weight`` mapping per slot.
    """
    weighted = []
    for current, history in zip(now, prev):
        weighted.append({
            term: (count + 1) / (math.log(history.get(term) + 1) + 1)
            for term, count in current.items()
        })
    return weighted
                             
# dfidft: weight every slot's term frequencies against the preceding slots
weighted_vocabs = dfidft(tf, previous)

# record the time (covers term frequencies, history averages and weighting)
t1 = time.time()
print("Time: {}".format(t1-t0))

logging.info('Dfidft complete')

### Increase weights for named entities

In [None]:
from itertools import chain

# ner list with unique values: flatten the per-row entity lists of the
# 'ner_all' column and keep each entity once
ner_list = list(set(chain.from_iterable(df['ner_all'].tolist())))

# with ner list

def nert(lizt, entities=None, boost=2.5):
    """Boost the weight of terms that are named entities.

    Parameters
    ----------
    lizt : list of dict
        Per-slot ``term -> weight`` mappings.
    entities : iterable of hashable, optional
        Terms to boost. Defaults to the module-level ``ner_list``.
    boost : float, optional
        Factor applied to the weight of every entity term (default 2.5).

    Returns
    -------
    list of dict
        New mappings; non-entity weights are copied unchanged.
    """
    if entities is None:
        entities = ner_list
    # build the lookup once: set membership instead of a list scan per term
    boosted = frozenset(entities)
    return [
        {
            term: weight * boost if term in boosted else weight
            for term, weight in vocab.items()
        }
        for vocab in lizt
    ]

# weighted slot vocabularies final (terms found in ner_list are boosted)
wvoc_final = nert(weighted_vocabs)

logging.info('Weighted vocabs updated with NER')

### Get list of clusters

In [None]:
import uuid

def clusterman(lizt, min_size=10):
    """Extract every sufficiently large cluster as its own DataFrame.

    Parameters
    ----------
    lizt : list of pandas.DataFrame
        Frames carrying a ``'clusterid'`` column.
    min_size : int, optional
        Minimum number of rows a cluster needs to be kept (default 10).

    Returns
    -------
    list of pandas.DataFrame
        One frame per kept cluster, in input-frame order and, within a
        frame, in ascending cluster-id order.
    """
    output = []
    for item in lizt:
        cluster_ids = np.array(item['clusterid'].values.tolist())
        # np.unique returns the ids sorted, together with their row counts
        ids, counts = np.unique(cluster_ids, return_counts=True)
        for cid, count in zip(ids.tolist(), counts.tolist()):
            if count >= min_size:
                output.append(item[item['clusterid'].isin([cid])])
    return output

# list of clusters: one DataFrame per cluster with at least 10 rows,
# collected slot by slot
clusterlist = clusterman(frames)

logging.info('List of clusters generated')

### Get weighted cluster vocabs

In [None]:
from itertools import chain

# with clusterlist as lizt1 and weighted_vocabs as lizt2
def dictgetter(lizt1, lizt2):
    """Restrict each slot vocabulary to the words used in a cluster.

    Parameters
    ----------
    lizt1 : list of pandas.DataFrame
        Cluster frames with a ``'slot'`` column and a ``'combined'`` column
        holding an iterable of words per row.
    lizt2 : list of dict
        Per-slot ``term -> weight`` mappings, indexed by slot id.

    Returns
    -------
    list of dict
        For each cluster, the entries of its slot's vocabulary whose term
        occurs in the cluster, in vocabulary order.
    """
    output = []
    for item in lizt1:
        # positional access: the frame keeps the index labels of the source
        # data, so a label lookup of 0 fails unless that row is in the cluster
        key = item['slot'].iloc[0]  # get slot id
        vocab = lizt2[key]  # get corresponding vocab
        # unique words of the cluster, as a set for O(1) membership tests
        cluster_words = set(chain.from_iterable(item['combined'].tolist()))
        output.append({
            word: weight
            for word, weight in vocab.items()
            if word in cluster_words
        })
    return output

# weighted cluster vocabs: for every cluster, the NER-boosted weights of
# its slot restricted to the words occurring in the cluster
clustervocabs = dictgetter(clusterlist, wvoc_final)

logging.info('Weighted cluster vocabs success')

### Average term scores

In [None]:
import statistics

def ranker_ats(a,b):
    """Compute the average term score of every cluster.

    Parameters
    ----------
    a : sequence
        The clusters; only their positions are used.
    b : sequence of dict
        ``term -> weight`` mapping for the cluster at the same position.

    Returns
    -------
    list of dict
        One single-entry dict ``{position: mean weight}`` per cluster; the
        score is 0 (and "blop" is printed) when the mapping is empty.
    """
    scores = []
    for position, _cluster in enumerate(a):
        term_weights = list(b[position].values())
        if term_weights:
            scores.append({position: statistics.mean(term_weights)})
        else:
            print("blop")
            scores.append({position: 0})
    return scores

# produce a list of single-entry dicts {cluster index: average term score}
cluster_ats = ranker_ats(clusterlist, clustervocabs) 

logging.info('Average term scores success')

### Get top 20

In [None]:
# function that makes a list of lists: [[clusterlist[i], slotid, cluster_ats value], ... ...]
def lolmaker_clusterrank(a,b):
    """Bundle each cluster with its slot value and its score.

    Parameters
    ----------
    a : sequence of pandas.DataFrame
        Cluster frames.
    b : sequence of dict
        ``{position: score}`` dicts, parallel to ``a``.

    Returns
    -------
    list of list
        ``[frame, first-row value of column 6, score]`` per cluster.
    """
    ranked = []
    for position, cluster in enumerate(a):
        score = b[position].get(position)
        # NOTE(review): column 6 is read by position and is presumably
        # 'slot' — confirm against the column order of the input data
        slot_id = cluster.iloc[0, 6]
        ranked.append([cluster, slot_id, score])
    return ranked
        
# one [cluster frame, slot id, average term score] entry per cluster
clusterframes = lolmaker_clusterrank(clusterlist, cluster_ats) 

# for clusterframes, add the weighted cluster dictionary to cluster - just for final results
def voc_add(a,b):
    """Return copies of the entries of ``a`` with the matching ``b`` appended.

    Parameters
    ----------
    a : sequence of list
        Cluster entries.
    b : sequence
        Value to append to the entry at the same position.

    Returns
    -------
    list of list
        New lists; the entries of ``a`` are left untouched, so calling the
        function again does not append a second time.
    """
    return [list(entry) + [b[position]] for position, entry in enumerate(a)]

# append each cluster's weighted vocabulary to its entry
clusterframes2 = voc_add(clusterframes, clustervocabs)

from operator import itemgetter 

def top20(a, top_n=20):
    """Keep the highest-scoring entries of every slot.

    Parameters
    ----------
    a : iterable of sequence
        Entries whose element 1 is the slot id and element 2 the score.
    top_n : int, optional
        Number of entries kept per slot (default 20).

    Returns
    -------
    list
        Entries grouped by ascending slot id; within a slot sorted by
        descending score, ties keeping their input order.

    Notes
    -----
    Slots are taken from the data rather than from a fixed ``range(312)``,
    so entries of higher-numbered slots are no longer dropped.
    """
    by_slot = {}
    for item in a:
        by_slot.setdefault(item[1], []).append(item)

    output = []
    for slot in sorted(by_slot):
        ranked = sorted(by_slot[slot], key=itemgetter(2), reverse=True)
        output.extend(ranked[:top_n])
    return output

# this is a list of lists: the top 20 ranked by slot, each entry holding:
# df, slotid, average weight, term dictionary.
ranked_by_slot = top20(clusterframes2)

logging.info('Top 20 extracted')

### Extract headlines with weight score

In [None]:
# use ranked_by_slot to make a dataframe of all the first tweets of the clusters

def rower(a):
    """Collect the first row of every cluster into one DataFrame.

    Parameters
    ----------
    a : sequence of sequence
        Entries whose element 0 is the cluster's DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per entry, labelled with the entry's position, with the
        columns idx, text, ts, combined, ner_all, documents and slot.
    """
    headline_columns = ['idx', 'text', 'ts', 'combined', 'ner_all', 'documents', 'slot']
    out = pd.DataFrame(columns=headline_columns)
    for position, entry in enumerate(a):
        cluster = entry[0]
        out.loc[position] = cluster.iloc[0]
    return out

# the df to perform hierarchical clustering on: the first row of every
# top-ranked cluster
headlines = rower(ranked_by_slot)

def scorer(a, b):
    """Add a ``'weight'`` column: the summed term weights of each row.

    Parameters
    ----------
    a : pandas.DataFrame
        Needs a ``'combined'`` column (iterable of words per row) and a
        ``'slot'`` column. Modified in place.
    b : sequence of dict
        Per-slot ``term -> weight`` mappings, indexed by slot.

    Returns
    -------
    pandas.DataFrame
        The same object as ``a``, now carrying the ``'weight'`` column.
        Words missing from the slot's mapping contribute nothing.
    """
    row_words = a['combined'].tolist()
    row_slots = a['slot'].tolist()
    weights = []
    for words, slot in zip(row_words, row_slots):
        slot_vocab = b[slot]
        total = 0
        for word in words:
            if word in slot_vocab:
                total = total + slot_vocab.get(word)
        weights.append(total)

    a['weight'] = pd.Series(weights).values
    return a

# add the scores (scorer writes the 'weight' column into headlines itself
# and returns the same object)
headlines_wscore = scorer(headlines, wvoc_final)

logging.info('Scores added to cluster')

### Divide by days

In [None]:
# function that splits df by time frames

def splitter(frame):
    """Split ``frame`` into one DataFrame per day.

    Returns a list whose entry ``i`` holds the rows with ``day == i``, for
    every ``i`` from 0 to the largest day in ``frame``; days without rows
    yield an empty DataFrame. This redefines the earlier slot-based
    ``splitter``.
    """
    last_day = frame['day'].max()
    return [frame.loc[frame['day'] == day] for day in range(0, last_day + 1)]

# split into one frame per day
# NOTE(review): none of the code shown adds a 'day' column to the headlines
# frame (rower's columns do not include it) — presumably it is created
# elsewhere; confirm before running
hl_days = splitter(headlines_wscore)

logging.info('Day split done')

### Vectorizing

In [None]:
import sklearn
import sklearn.feature_extraction

def vectorboy_arrays(lizt):
    """Vectorize the ``documents`` column of each frame into dense counts.

    A new ``CountVectorizer`` is fitted per frame with ``min_df`` set to
    0.25% of the frame's rows (truncated to an int) or 8, whichever is
    larger. Returns the list of dense count matrices, in input order.
    """
    matrices = []
    for frame in lizt:
        threshold = max(int(frame.shape[0] * 0.0025), 8)
        counter = sklearn.feature_extraction.text.CountVectorizer(min_df=threshold)
        sparse_counts = counter.fit_transform(frame.documents.tolist())
        matrices.append(sparse_counts.toarray())
    return matrices

def vectorboy_vocabs(lizt):
    """Return the vocabulary (list of terms) for each DataFrame in ``lizt``.

    Uses the same vectorizer settings as ``vectorboy_arrays`` so that the
    i-th term corresponds to the i-th column of the matching count matrix:
    ``min_df`` is 0.25% of the frame's rows (truncated), at least 8.

    Returns
    -------
    list of list of str
    """
    vocabs = []
    for item in lizt:
        num = max(int(item.shape[0] * 0.0025), 8)
        vectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df=num)
        # only the vocabulary is needed, so skip building the dense matrix
        vectorizer.fit(item.documents.tolist())
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it
        if hasattr(vectorizer, 'get_feature_names_out'):
            names = vectorizer.get_feature_names_out().tolist()
        else:
            names = vectorizer.get_feature_names()
        vocabs.append(names)
    return vocabs

# from here on `frames` refers to the per-day headline frames, replacing
# the per-slot frames used above
frames = hl_days

# generate vector matrices (one dense count matrix per day)
rays = vectorboy_arrays(frames)

# generate vocabs (terms naming the matrix columns)
vocabs = vectorboy_vocabs(frames)

logging.info('Vectorization complete')

### Scale and normalize

In [None]:
# scaling and normalizing the vector matrices
from sklearn import preprocessing
import numpy as np

def scaler(lizt):
    """Min-max scale every matrix to the range [0, 1].

    A separate ``MinMaxScaler`` is fitted on each matrix; results are cast
    to ``float16``. Returns the list of scaled matrices in input order.
    """
    def _rescale(matrix):
        unit_range = preprocessing.MinMaxScaler(feature_range=(0, 1))
        return unit_range.fit_transform(matrix).astype('float16')

    return [_rescale(matrix) for matrix in lizt]

def normalizer(lizt):
    """Normalize every matrix with ``preprocessing.normalize``.

    The function's default settings are used and each result is cast to
    ``float16``. Returns the list of normalized matrices in input order.
    """
    return [
        preprocessing.normalize(matrix).astype('float16')
        for matrix in lizt
    ]

# scaling (per-matrix min-max scaling to [0, 1])
rays_scaled = scaler(rays)

# normalizing the scaled matrices
rays_sn = normalizer(rays_scaled)

logging.info('Normalization complete')

### Cosine similarity

In [None]:
# now pairwise similarity by cosine distance
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

def cosine_sim(lizt):
    """Turn every matrix into a pairwise cosine-distance matrix.

    Each input is cast to ``float16``, its pairwise cosine similarity is
    computed, and ``1 - similarity`` is stored as ``float16``. Returns the
    list of distance matrices in input order.
    """
    distances = []
    for matrix in lizt:
        half_precision = matrix.astype('float16')
        similarity = sklearn.metrics.pairwise.cosine_similarity(half_precision)
        distances.append((1 - similarity).astype('float16'))
    return distances

import time 

# start timer
t0 = time.time()

# apply function
# NOTE(review): this passes the raw count matrices `rays`; the scaled and
# normalized `rays_sn` computed above is never read in the code shown —
# confirm which input is intended
rays_cs = cosine_sim(rays)

# record the time
t1 = time.time()
print("Time: {}".format(t1-t0))

logging.info('Cosine similarity success')

### Re-clustering

In [None]:
# hierarchical clustering for looking at data
import fastcluster
from scipy.cluster.hierarchy import cut_tree

def clusterer(lizt):
    """Hierarchically cluster every array and cut the tree.

    For each array a centroid linkage with cosine metric is built with
    ``fastcluster.linkage`` and cut at height ``3.5 / len(array)`` (rounded
    to 5 decimals). Returns the list of ``cut_tree`` results in input order.

    NOTE(review): the caller passes square distance matrices, which
    ``linkage`` presumably treats as observation vectors rather than as
    precomputed distances — confirm this is intended.
    NOTE(review): an empty array makes the cut-off division fail.
    """
    trees = []
    for matrix in lizt:
        # cluster
        linkage_matrix = fastcluster.linkage(matrix, method='centroid', metric='cosine')

        # the cut height shrinks as the number of rows grows
        threshold = round(3.5 / len(matrix), 5)

        # cut tree and collect the cluster assignment
        trees.append(cut_tree(linkage_matrix, height=threshold))
    return trees

# start timer
t0 = time.time()

# one cluster assignment per day, computed from the cosine-distance matrices
cutrees = clusterer(rays_cs)

# record the time
t1 = time.time()
print("Time: {}".format(t1-t0))

logging.info('Clustering success')

In [None]:
def clustertagger(lizt1, lizt2):
    """Attach cluster ids to copies of the given frames.

    Parameters
    ----------
    lizt1 : sequence of pandas.DataFrame
        Frames to tag.
    lizt2 : sequence of array-like
        Cluster assignment per frame; flattened before use and expected to
        hold one id per row of the matching frame.

    Returns
    -------
    list of pandas.DataFrame
        Copies of the frames with a ``'clusterid'`` column. The inputs are
        not modified: they are slices of a larger frame, and assigning a
        column into such a slice triggers pandas' SettingWithCopyWarning.
    """
    output = []
    for frame, labels in zip(lizt1, lizt2):
        tagged = frame.copy()
        tagged['clusterid'] = np.asarray(labels).ravel()
        output.append(tagged)
    return output

# per-day dataframes tagged with clusterids from the cut trees
daylevel_clustered = clustertagger(frames, cutrees)

logging.info('Cluster tagging success')

In [None]:
def clusterman(lizt, min_size=1):
    """Extract every cluster as its own DataFrame.

    This redefines the earlier ``clusterman``; the default keeps every
    cluster with at least one row.

    Parameters
    ----------
    lizt : list of pandas.DataFrame
        Frames carrying a ``'clusterid'`` column.
    min_size : int, optional
        Minimum number of rows a cluster needs to be kept (default 1).

    Returns
    -------
    list of pandas.DataFrame
        One frame per kept cluster, in input-frame order and, within a
        frame, in ascending cluster-id order.
    """
    output = []
    for item in lizt:
        cluster_ids = np.array(item['clusterid'].values.tolist())
        # np.unique returns the ids sorted, together with their row counts
        ids, counts = np.unique(cluster_ids, return_counts=True)
        for cid, count in zip(ids.tolist(), counts.tolist()):
            if count >= min_size:
                output.append(item[item['clusterid'].isin([cid])])
    return output

# list of day level clusters (every cluster kept, including single rows)
dayclusters = clusterman(daylevel_clustered)

def vectorboy_vocabs(lizt):
    """Return the full vocabulary (list of terms) of each DataFrame.

    This redefines the earlier ``vectorboy_vocabs`` with ``min_df=1``, so
    every term occurring in a frame's ``documents`` column is kept.

    Returns
    -------
    list of list of str
    """
    vocabs = []
    for item in lizt:
        vectorizer = sklearn.feature_extraction.text.CountVectorizer(min_df=1)
        # only the vocabulary is needed, so skip building the dense matrix
        vectorizer.fit(item.documents.tolist())
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it
        if hasattr(vectorizer, 'get_feature_names_out'):
            names = vectorizer.get_feature_names_out().tolist()
        else:
            names = vectorizer.get_feature_names()
        vocabs.append(names)
    return vocabs

# corresponding vocabs: all terms of each day-level cluster
clustervocs = vectorboy_vocabs(dayclusters)

logging.info('Clusters prepared')

### Extract earliest and top ranked

In [None]:
# for each cluster, keep only the top ranked one and the earliest. Add column that specifies r t
def keeper(x):
    """Keep the earliest and the top-ranked rows of every cluster.

    Parameters
    ----------
    x : sequence of pandas.DataFrame
        Cluster frames with at least a ``'ts'`` and a ``'weight'`` column.
        The frames are not modified.

    Returns
    -------
    pandas.DataFrame
        Per cluster, first the rows with the smallest ``ts`` (``type``
        ``"e"``), then the rows with the largest ``weight`` (``type``
        ``"r"``). ``cdex`` holds the cluster's position in ``x``. The index
        is renumbered from 0.
    """
    columns = ['idx', 'text', 'ts', 'combined', 'ner_all', 'documents',
               'slot', 'weight', 'day', 'clusterid', 'cdex', 'type']
    pieces = []
    for i, item in enumerate(x):
        # assign() returns new frames, so neither the input clusters nor
        # the row selections below are written to
        tagged = item.assign(cdex=i)
        earliest = tagged[tagged['ts'] == tagged['ts'].min()].assign(type="e")
        ranked = tagged[tagged['weight'] == tagged['weight'].max()].assign(type="r")
        pieces.append(earliest)
        pieces.append(ranked)

    if not pieces:
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    out = pd.concat(pieces, ignore_index=True, sort=False)
    # expected columns first, any additional input columns after them
    extra = [name for name in out.columns if name not in columns]
    return out.reindex(columns=columns + extra)

money = keeper(dayclusters)
# explicit copy: the 'ts' column of events is overwritten further down, and
# assigning into a bare column selection of money would trigger pandas'
# SettingWithCopyWarning
events = money[['idx','day','ts','text','type','slot','cdex']].copy()

from datetime import datetime

def dtconvert(x):
    """Convert the POSIX timestamp ``x`` with ``datetime.fromtimestamp``.

    No timezone is passed, so the result is a naive ``datetime`` expressed
    in the local time of the machine running the code.
    """
    return datetime.fromtimestamp(x)

# fix timestamp: replace the raw 'ts' values with datetime objects
events['ts'] = events['ts'].apply(dtconvert)

logging.info('Table of final headline Tweets generated')

### Dump all to files

In [None]:
# file exports: final headline table (events), per-slot term weights
# (wvoc_final) and per-cluster vocabularies (clustervocs)
for path, payload in (
    ('results_headlines.pkl', events),
    ('slot_termweights.pkl', wvoc_final),
    ('cluster_vocabs.pkl', clustervocs),
):
    # the with-statement closes each file; no explicit close is needed
    with open(path, 'wb') as f:
        pickle.dump(payload, f)

logging.info('Files out')