In [None]:
import model
import data
import util

from pathlib import Path
import csv
import pandas as pd
import numpy as np
import scipy
import scipy.stats
from scipy.linalg import svd

import torch
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

# Render floats with thousands separators and three decimals in displayed tables.
pd.set_option('display.float_format', '{:,.3f}'.format)

# Directory that the fields, the per-model outputs and the result CSVs are read from.
model_family_dir = Path('model/reddit')
fields = data.load_fields(model_family_dir)
# Plain-list copy of the community vocabulary; later cells use its order as the
# row order for embeddings and as plot labels.
comms = list(fields['community'].vocab.itos)

# CCLM perplexity and information gain (§2)

## PPL/IG over all examples

LSTMs perform better overall. There is not much difference depending on where the community layer falls.

The middle-layer transformers are worse. Maybe the transformer architecture doesn't like to have intermediate layers for some reason.


In [None]:
# Aggregate results written by `eval_lm test_perplexity`, indexed by model name.
ppl_aggregate_csv = model_family_dir / 'ppl_aggregate.csv'
df = pd.read_csv(ppl_aggregate_csv).set_index('model')

# Models singled out for the detailed analyses in later cells.
best_lstm = 'lstm-3-1'
best_transformer = 'transformer-3-0'

df

### PPL/IG  by community

Somewhat bigger community spread in transformer performance. All models are skewed left (long tail of low perplexity subreddits).

The best LSTM model has more communities with negative information gain on average than the best Transformer (although the mean information gain is lower).

In [None]:
# Per-community perplexities, indexed by community, one column per model.
df = pd.read_csv(model_family_dir/'ppl_by_comm.csv').set_index('community')

# Community-conditioned LMs are named '<arch>-3-<lc>'.
# NOTE(review): <lc> is presumably the position of the community layer — confirm.
cclms = [f'{arch}-3-{lc}' for arch in ('lstm', 'transformer') for lc in range(4)]


def uncond(x):
    """Return the name of the unconditioned counterpart of a CCLM (drops the '-<lc>' suffix)."""
    return x[:-2]


# Information gain is computed here as the ratio of the unconditioned model's
# perplexity to the CCLM's perplexity, per community.
# The loop variable is deliberately not called `model`: a top-level `for model in ...`
# would rebind the imported `model` module for the rest of the notebook.
for cclm in cclms:
    df[f'{cclm}_ig'] = df[uncond(cclm)] / df[cclm]

#### Perplexity

In [None]:
# Long format (one row per community/model pair) so each model gets its own box.
ppl_long = df[cclms].melt(var_name='model')
px.box(ppl_long, x='model', y='value')

#### Information gain

In [None]:
# Same box plot as for perplexity, but over the per-model '<model>_ig' columns.
ig_columns = [f'{name}_ig' for name in cclms]
ig_long = df[ig_columns].melt(var_name='model')
px.box(ig_long, x='model', y='value')

# LMCC indiscernibility & confusion matrix (§3)

In [None]:
def _load_confusion(name):
    """Read `<model_family_dir>/<name>/confusion.csv`, indexed by its 'community' column."""
    return pd.read_csv(model_family_dir / name / 'confusion.csv').set_index('community')


# One confusion table per CCLM, keyed by model name.
confusion = {name: _load_confusion(name) for name in cclms}

def entropy(P, axis=0):
    """Shannon entropy (in nats) of the distributions laid out along `axis` of `P`.

    Zero probabilities contribute nothing (the usual 0 * log 0 = 0 convention)
    instead of producing NaN and a divide-by-zero warning.

    Parameters
    ----------
    P : numpy.ndarray or pandas.DataFrame
        Array of probabilities.
    axis : int, default 0
        Axis that is summed over.

    Returns
    -------
    Same kind of object that `P.sum(axis=axis)` returns (ndarray for an
    ndarray, Series for a DataFrame).
    """
    # Replace non-positive entries by 1 before taking the log so that their
    # term becomes P * log(1) = 0 rather than 0 * -inf = NaN.
    safe_log = np.log(np.where(P > 0, P, 1.0))
    return (-P * safe_log).sum(axis=axis)

# Indiscernibility of a community: exp(entropy) of its confusion-table row,
# divided by the number of communities.
# NOTE(review): the result lies in (0, 1] only if each row is a probability
# distribution — confirm against how confusion.csv is produced.
n_comms = len(comms)
# The loop variable is deliberately not called `model`: that would rebind the
# imported `model` module for the rest of the notebook.
for cclm in cclms:
    indisc = np.exp(entropy(confusion[cclm], axis=1)) / n_comms
    # Re-index on `comms` before storing so the column is keyed by community name.
    df[f"{cclm}_indisc"] = pd.Series(indisc, index=comms)

### Correlation with PPL

In [None]:
# Pearson correlation (r, p-value) between per-community perplexity and
# indiscernibility, one row per CCLM.
ppl_vs_indisc = [
    scipy.stats.pearsonr(df[f"{name}"], df[f"{name}_indisc"])
    for name in cclms
]
pd.DataFrame(ppl_vs_indisc, index=cclms, columns=('r', 'p'))

### Correlation with IG

In [None]:
# Pearson correlation (r, p-value) between per-community information gain and
# indiscernibility, one row per CCLM.
ig_vs_indisc = [
    scipy.stats.pearsonr(df[f"{name}_ig"], df[f"{name}_indisc"])
    for name in cclms
]
pd.DataFrame(ig_vs_indisc, index=cclms, columns=('r', 'p'))

In [None]:
def apply_permutation(permutation, matrix):
    """Reorder the rows and columns of `matrix` by the same permutation.

    Entry (i, j) of the result is `matrix[permutation[i]][permutation[j]]`.
    A new array is returned; `matrix` itself is left untouched.
    """
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    row_source = [permutation[i] for i in range(n_rows)]
    col_source = [permutation[j] for j in range(n_cols)]
    # Open-mesh indexing selects every (row, column) combination in one step.
    return matrix[np.ix_(row_source, col_source)]

def sort_confusion_by_values(confusion, sort_key):
    """Reorder a square confusion table by ascending values of `sort_key`.

    Parameters
    ----------
    confusion : pandas.DataFrame
        Square table whose rows and columns are in the same order.
    sort_key : pandas.Series
        One value per row of `confusion`, carrying the same index.

    Returns
    -------
    labels_sorted : list
        Index labels of `confusion` in the new order.
    confusion_sorted : numpy.ndarray
        Values of `confusion` with rows and columns permuted identically.

    Labels are taken from `confusion.index`, so the result does not depend on
    a global label list being in the same order as the table. Ties keep their
    original relative order, and NaN keys sort last.
    """
    assert all(confusion.index == sort_key.index)
    order = np.argsort(np.asarray(sort_key), kind='stable')
    labels_sorted = confusion.index[order].tolist()
    # Apply the same permutation to rows and columns.
    confusion_sorted = confusion.values[np.ix_(order, order)]
    return labels_sorted, confusion_sorted

def plot_confusion(C, labels):
    """Show matrix `C` as a square heatmap with `labels` on both axes.

    Returns whatever `fig.show()` returns.
    """
    side_px = 2 * 510
    heatmap = go.Heatmap(z=C, x=labels, y=labels)
    fig = go.Figure(data=heatmap)
    # Small font so that a long list of tick labels stays legible.
    fig.update_layout(height=side_px, width=side_px, font={'size': 8}, title='')
    return fig.show()

In [None]:
# Order the best LSTM's confusion table by that model's indiscernibility column.
indisc_key = df[f"{best_lstm}_indisc"]
labels_sorted, confusion_sorted = sort_confusion_by_values(confusion[best_lstm], indisc_key)
# Plotted on a log scale; zero cells become -inf.
plot_confusion(np.log(confusion_sorted), labels_sorted)

# Community embeddings

In [None]:
def normalize(w):
    """Scale every row of the 2-D array `w` to unit Euclidean length."""
    row_norms = np.sqrt((w ** 2).sum(axis=1))
    # Column vector of norms so that the division broadcasts across each row.
    return w / row_norms.reshape(-1, 1)

# Social-network-based ("web") embedding: each CSV row holds a key in its
# first field followed by the vector components.
with open('embedding/web-redditEmbeddings-subreddits.csv', 'r') as f:
    web_vecs = {row[0]: [float(a) for a in row[1:]] for row in csv.reader(f)}
# Stack the vectors in `comms` order (keys are looked up lower-cased), then
# scale every row to unit length.
web_embed = normalize(np.array([web_vecs[comm.lower()] for comm in comms]))

# CCLM community embeddings: one row-normalised matrix per model name.
model_dir = model_family_dir
model_names = [f"{arch}-3-{i}" for arch in ('lstm', 'transformer') for i in range(4)]
cclm_embed = {
    name: normalize(util.extract_comm_embedding(model_dir / name))
    for name in model_names
}

# load the random "embeddings" w/ same shape as the CCLM embeddings
# wget https://www.random.org/integers/?num=8160&min=-100&max=100&col=16&base=10&format=plain&rnd=new
random_embed = {}
for i in range(10):
    # `with` closes each file; the original left all ten handles open.
    with open(f"embedding/random/{i+1}.txt") as f:
        # Each non-blank line is a tab-separated row of integers, rescaled by 1/100.
        # `line.strip()` is needed to skip blank lines: a raw line still carries
        # its newline and is therefore always truthy.
        numbers = [
            [int(x) / 100 for x in line.split('\t')]
            for line in f
            if line.strip()
        ]
    random_embed[i] = normalize(np.array(numbers))

## Alignment values

In [None]:
def embedding_alignment(e1, e2):
    """Return the SVD `(u, s, vh)` of the cross-product matrix `e1.T @ e2`.

    `e1` and `e2` must have the same number of rows.
    """
    cross = np.matmul(e1.T, e2)
    return svd(cross)

### Alignment with random embeddings (for baseline purposes)

In [None]:
# Alignment distance = (number of rows) - (sum of singular values of e1.T @ e2).
# With unit-norm rows the sum of singular values cannot exceed the row count,
# so the distance is non-negative.
# The row count is read from the data rather than hard-coded as 510; the
# matrix product only works when both embeddings have that many rows anyway.
n_comms = web_embed.shape[0]

random_alignments = []
for i, embed in random_embed.items():
    s = embedding_alignment(embed, web_embed)[1]
    dist = n_comms - s.sum()
    random_alignments.append(dist)
    print(f"Web and Random-{i}: {dist:0.2f}")

random_embed_mean = sum(random_alignments) / len(random_alignments)
# With no limits given, tstd is the sample standard deviation (ddof=1).
random_embed_std  = scipy.stats.tstd(random_alignments)

print(f"Mean D(L,S): {random_embed_mean:.4f}")
print(f"Stddev:      {random_embed_std:.4f}")
print(f"mu - 3sd:    {random_embed_mean - (3*random_embed_std):.4f}")

### Alignment of CCLM embeddings to Web embedding

In [None]:
# Distance of each CCLM embedding to the web embedding. The value in
# parentheses is how many random-baseline standard deviations it lies below
# the random-baseline mean.
# The row count is read from the data rather than hard-coded as 510; the
# matrix product only works when both embeddings have that many rows anyway.
n_comms = web_embed.shape[0]
for m in cclms:
    s = embedding_alignment(cclm_embed[m], web_embed)[1]
    dist = n_comms - s.sum()
    print(f"{m} and Web: {dist:0.2f} ({(random_embed_mean - dist) / random_embed_std:0.2f})")

In [None]:
# Bar chart of the singular values of web_embed.T @ cclm_embed[...] for every
# CCLM; LSTMs in blue, transformers in red.
# NOTE(review): 16 is presumably the CCLM embedding dimension (number of
# singular values) — confirm.
value_no = list(range(16))
fig = go.Figure()
# The loop variable is deliberately not called `model`: that would rebind the
# imported `model` module for the rest of the notebook.
for cclm in cclms:
    color = 'blue' if cclm[:4] == 'lstm' else 'red'
    s = embedding_alignment(web_embed, cclm_embed[cclm])[1]
    fig.add_trace(go.Bar(x=value_no,
                    y=list(s),
                    name=cclm,
                    marker_color=color
                    ))
fig.show()

## Cluster & display embeddings

In [None]:
from sklearn.cluster import KMeans

# Cluster the communities in the web embedding space (fixed seed for repeatability).
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(web_embed)

# For every cluster, print the 7 communities closest (cosine distance) to its centre.
cos_dist = scipy.spatial.distance.cosine
for cluster, center in enumerate(kmeans.cluster_centers_):
    dist = np.array([cos_dist(v, center) for v in web_embed])
    nearest = np.argsort(dist)[:7]
    members = ' '.join([comms[i] for i in nearest])
    print(f"{cluster}: {members}")

# Names assigned post-hoc.
# NOTE(review): 10 names are listed but n_clusters is 6, and the list is not
# used in the visible cells — confirm which run it belongs to.
cluster_labels = [
    'console games', 'politics', 'hobby', 'meme', 'discussion',
    'computer games', 'subculture', 'nsfw', 'advice', 'sports',
]

In [None]:
def plot_embedding_against_web(model, kmeans):
    """Scatter-plot the web embedding and one CCLM embedding on their first two alignment axes.

    The axes come from the SVD of `web_embed.T @ cclm_embed[model]`: the web
    embedding is projected on the first two left singular vectors, the CCLM
    embedding on the first two right singular vectors. Both plots are coloured
    by the cluster labels of `kmeans`.

    Parameters
    ----------
    model : str
        Key into `cclm_embed`.
    kmeans : fitted clustering object exposing `labels_`

    Returns
    -------
    pandas.DataFrame
        Columns 'web_PCA0', 'web_PCA1', '<model>_PCA0', '<model>_PCA1',
        'community' and 'web_cluster', with a default integer index.
    """
    model_embed = cclm_embed[model]
    u, s, vh = embedding_alignment(web_embed, model_embed)

    axes = (0, 1)
    columns = {f'web_PCA{k}': np.dot(web_embed, u[:, k]) for k in axes}
    columns.update({f'{model}_PCA{k}': np.dot(model_embed, vh[k]) for k in axes})

    projections = pd.DataFrame(columns)
    projections['community'] = comms
    # Cluster ids are stored as strings so that plotly colours them as categories.
    projections['web_cluster'] = list(map(str, kmeans.labels_))

    axis_pairs = (
        ('web_PCA0', 'web_PCA1'),
        (f'{model}_PCA0', f'{model}_PCA1'),
    )
    for x_col, y_col in axis_pairs:
        fig = px.scatter(projections.reset_index(), x=x_col, y=y_col,
                         hover_name='community', color='web_cluster')
        fig.show()

    return projections

# Model whose embedding is plotted here and exported in the following cells.
M = best_lstm
df_web = plot_embedding_against_web(M, kmeans)
df_web = df_web.set_index('community')

In [None]:
# Attach the projection/cluster columns to the per-community table.
# Any copy of those columns left by a previous run of this cell is dropped
# first, so re-running the cell does not pile up `_x` / `_y` duplicates.
df = (
    df.drop(columns=df_web.columns, errors='ignore')
      .merge(df_web, left_index=True, right_index=True)
)

In [None]:
# Write the per-community table, ordered by the selected model's
# indiscernibility, as a tab-separated file with the index included.
comm_table = df.sort_values(f'{M}_indisc')
comm_table.to_csv('paper/floats/comm.csv', sep='\t', index=True)

# Confusion matrix sorted by embedding PCA0

In [None]:
# First principal axis of the best LSTM's community embedding. Aligning the
# embedding with itself decomposes E.T @ E, whose singular vectors are the
# embedding's own (uncentred) principal directions.
# NOTE(review): the original passed the whole `cclm_embed` dict as the second
# matrix, which cannot be multiplied. If the web-aligned axis was intended,
# use `embedding_alignment(web_embed, lstm_embed)` instead.
lstm_embed = cclm_embed[best_lstm]
u, s, vh = embedding_alignment(lstm_embed, lstm_embed)

# sort_confusion_by_values compares indices, so the projection is wrapped in a
# Series keyed like the confusion table.
# NOTE(review): assumes the embedding rows are in the same community order as
# the table — confirm.
pca0 = pd.Series(np.dot(lstm_embed, vh[0]), index=confusion[best_lstm].index)

labels_sorted, confusion_sorted  = sort_confusion_by_values(confusion[best_lstm], pca0)
# Plotted on a log scale; zero cells become -inf.
plot_confusion(np.log(confusion_sorted), labels_sorted)