In [None]:
import string, csv, pickle
import pandas as pd
import scipy as sy
import numpy as np
import transformers as tr
import gensim.downloader as api

from collections import Counter
from mittens import GloVe, Mittens
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN

In [None]:
# Load the processed repository CSV (path is relative to the working directory).
data = pd.read_csv('../../data/processed/final_repo_english_whatwhy.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# List the available column names.
data.columns

In [None]:
# Text column used by the rest of the notebook
# (presumably README text with markup tags stripped — confirm against the dataset).
readme = data['content_text_w_o_tags']

In [None]:
# Keep only the whitespace-separated tokens that consist entirely of letters.
readme_alpha = readme.apply(
    lambda text: ' '.join(word for word in text.split() if word.isalpha())
)

In [None]:
# Inspect the letters-only text.
readme_alpha

In [None]:
# English stop words as a list; a later cell concatenates it with another list,
# so it must stay a list rather than a set.
# NOTE(review): the `sklearn.feature_extraction.stop_words` module was deprecated
# in newer scikit-learn releases; `ENGLISH_STOP_WORDS` is also exposed by
# `sklearn.feature_extraction.text` — confirm against the installed version.
stop_words_en = list(stop_words.ENGLISH_STOP_WORDS)

In [None]:
# GloVe vectors for out-of-vocabulary (OOV) tokens, produced by the
# fine-tuning section at the bottom of this notebook.
# NOTE(review): that section writes "../out/repo_glove.pkl", while this cell
# reads "repo_glove.pkl" from the working directory — confirm the file is
# copied or the two paths are aligned.
# pickle.load can execute arbitrary code; only load a pickle you produced yourself.
with open("repo_glove.pkl", 'rb') as f:
    repo_glove = pickle.load(f)

In [None]:
# Load the "glove-wiki-gigaword-50" model through gensim's downloader API.
glove_model = api.load("glove-wiki-gigaword-50")

In [None]:
# Combined vocabulary: pretrained GloVe words followed by the fine-tuned OOV words.
vocabs_list = [*glove_model.vocab.keys(), *repo_glove.keys()]

In [None]:
# Dict keyed by vocabulary word (values are a dummy 0), used for fast membership tests.
vocabs_dict = dict.fromkeys(vocabs_list, 0)

In [None]:
# Per-document token lists: lowercased, restricted to words that have a vector,
# with English stop words removed.
# A set makes the stop-word test O(1) instead of scanning the list per token,
# and each token is lowercased only once.
_stop_set = set(stop_words_en)
readme_tokens = [
    [
        lowered
        for lowered in (token.lower() for token in doc.split())
        if lowered in vocabs_dict and lowered not in _stop_set
    ]
    for doc in readme_alpha
]

In [None]:
# For every document keep its (up to) 10 most frequent tokens, most frequent first.
top_tokens = [
    [word for word, _count in Counter(doc_tokens).most_common(10)]
    for doc_tokens in readme_tokens
]

In [None]:
def get_vecs(v):
    """Return the embedding for word ``v``.

    The pretrained ``glove_model`` is consulted first; when it raises
    ``KeyError`` the lookup falls back to ``repo_glove``. A ``KeyError``
    from the fallback propagates to the caller.
    """
    try:
        vector = glove_model[v]
    except KeyError:
        vector = repo_glove[v]
    return vector

In [None]:
# Unique top tokens across all documents.
# Sorted because set iteration order for strings can differ between kernel
# sessions; a stable row order is needed for the seeded clustering below to
# be reproducible.
tokens_vocab = sorted({w for v in top_tokens for w in v})

In [None]:
# One embedding per entry of tokens_vocab, in the same order.
tokens_vecs = list(map(get_vecs, tokens_vocab))

In [None]:
def get_clusters(vocabs, labels):
    """Map each word to its cluster label, ordered by label.

    ``vocabs`` and ``labels`` are paired positionally. The returned dict
    iterates in ascending label order; words sharing a label keep their
    original relative order.
    """
    label_by_word = dict(zip(vocabs, labels))
    ordered_words = sorted(label_by_word, key=label_by_word.get)
    return {word: label_by_word[word] for word in ordered_words}

### KMeans clustering

In [None]:
# K-means with 20 clusters; random_state is fixed so centroid initialisation is repeatable.
kmeans_model = KMeans(n_clusters=20, random_state=66)
# fit() is enough: the distance matrix returned by fit_transform() was discarded.
kmeans_model.fit(tokens_vecs)
kmeans_groups = get_clusters(tokens_vocab, kmeans_model.labels_)

In [None]:
# Word -> cluster label mapping from K-means, ordered by label.
kmeans_groups

### Density-Based Spatial Clustering of Applications with Noise

In [None]:
# Density-based clustering on cosine distance between token vectors.
dbs = DBSCAN(eps=0.2, min_samples=10, metric="cosine")
dbs_labels = dbs.fit(tokens_vecs).labels_
dbs_groups = get_clusters(tokens_vocab, dbs_labels)

In [None]:
# DBSCAN marks noise points with the label -1; that label is not a cluster,
# so it is excluded from the count.
n_dbs_clusters = len(set(dbs_labels) - {-1})
print(f'Number of clusters: {n_dbs_clusters}')

In [None]:
# Word -> DBSCAN label mapping, ordered by label.
dbs_groups

### Spectral clustering

Spectral clustering takes much longer to run than the other models in this notebook.

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
# Spectral clustering with 30 clusters. random_state is fixed (same seed as the
# K-means cell) so repeated runs give the same assignment.
spec_model = SpectralClustering(n_clusters=30, random_state=66)

In [None]:
# Fit on the token vectors; this is the slow step of this section.
spec_model.fit(tokens_vecs)

In [None]:
# Pair each token with its spectral-clustering label, ordered by label.
spec_groups = get_clusters(tokens_vocab, spec_model.labels_)

In [None]:
# Word -> spectral-clustering label mapping, ordered by label.
spec_groups

### Fine tuning GloVe embeddings

In [None]:
# Same model name as loaded earlier in the notebook; reloading here presumably
# lets this section run without the clustering cells — confirm.
glove_model = api.load("glove-wiki-gigaword-50")

In [None]:
# Mittens fine-tuner: n=50 matches the 50-dimensional GloVe vectors used here;
# training is capped at 10 iterations.
mittens_model = Mittens(n=50, max_iter=10)

In [None]:
def glove2dict(glove_filename):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-empty line is expected to hold a token followed by its
    space-separated float components.

    Parameters
    ----------
    glove_filename : str
        Path to the GloVe text file (opened as UTF-8).

    Returns
    -------
    dict
        Maps each token to a 1-D numpy array of floats.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    with open(glove_filename, encoding='utf-8') as f:
        # QUOTE_NONE keeps quote characters as ordinary token text.
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        # Blank rows (e.g. a trailing empty line) come back as [] and would
        # raise IndexError on row[0], so they are skipped.
        embed = {row[0]: np.array(list(map(float, row[1:])))
                 for row in reader if row}
    return embed

# Download the vectors from https://nlp.stanford.edu/projects/glove
# A forward slash resolves on Windows as well as POSIX; the previous backslash
# separator only worked on Windows.
glove_path = "glove.6B/glove.6B.50d.txt"
original_emb = glove2dict(glove_path)

In [None]:
def get_freqmorethan(xdict, val):
    """Return the items of ``xdict`` whose count is at most ``val``.

    ``xdict`` is passed to ``Counter``, so it may be an iterable of items or
    a mapping of item -> count. Despite the function name, the filter is
    ``count <= val`` (the infrequent items), in first-seen order.
    """
    counts = Counter(xdict)
    return [item for item in counts if counts[item] <= val]

# A set makes the stop-word test O(1) instead of scanning the list per token.
_stop_set = set(stop_words_en)
# Lowercased documents with stop words removed (each token lowercased once).
readme_nonstop = [
    ' '.join(word for word in map(str.lower, doc.split()) if word not in _stop_set)
    for doc in readme_alpha
]
# Tokens (with repeats) that the pretrained GloVe vocabulary does not contain.
oov = [token for doc in readme_nonstop for token in doc.split() if token not in glove_model.vocab]
# OOV tokens that occur at most once; they are left out of the fine-tuning vocabulary.
oov_vocab = get_freqmorethan(oov, 1)
stops_oov = oov_vocab + stop_words_en

In [None]:
# Sets give O(1) membership tests; scanning the oov_vocab list for every token
# made this cell quadratic in corpus size.
_stop_set = set(stop_words_en)
_rare_oov = set(oov_vocab)
# NOTE: this rebinds `readme_tokens` from the clustering section; here the
# tokens keep their original case and are not restricted to the vector vocabulary.
readme_tokens = [[token for token in doc.split() if token.lower() not in _stop_set] for doc in readme_alpha]
# Drop the rare OOV tokens, then rebuild one string per document.
short_tokens = [[token for token in doc if token not in _rare_oov] for doc in readme_tokens]
short_readme = [' '.join(t) for t in short_tokens]

In [None]:
# OOV tokens that occur more than once: the vocabulary to fine-tune.
# Sorted because set iteration order for strings can differ between kernel
# sessions, which would otherwise reorder the co-occurrence matrix on every run.
corp_vocab = sorted(set(oov) - set(oov_vocab))

In [None]:
# Document-term counts restricted to the fine-tuning vocabulary (unigrams only).
cv = CountVectorizer(vocabulary=corp_vocab, ngram_range=(1,1))
X = cv.fit_transform(short_readme)
# Term-term matrix: entry (i, j) sums count_i * count_j over all documents.
Xc = X.T @ X
# Zero the diagonal so a term does not co-occur with itself.
Xc.setdiag(0)

In [None]:
# Dense copy of the co-occurrence matrix (one row and column per corp_vocab
# entry); memory grows with the square of the vocabulary size.
coocc_ar = Xc.toarray()

In [None]:
# Fine-tune on the co-occurrence counts, passing the original GloVe vectors as
# the initial embedding dictionary.
new_embeddings = mittens_model.fit(coocc_ar, vocab=corp_vocab, initial_embedding_dict=original_emb)

In [None]:
# Pair each vocabulary word with its fine-tuned vector
# (assumes new_embeddings rows follow corp_vocab order — confirm against Mittens).
newglove = dict(zip(corp_vocab, new_embeddings))

In [None]:
# Persist the fine-tuned vectors. The context manager closes the file even if
# pickle.dump raises.
# NOTE(review): the loading cell near the top of the notebook opens
# "repo_glove.pkl" in the working directory, whereas this writes under
# "../out/" — confirm which location is intended.
with open("../out/repo_glove.pkl", "wb") as f:
    pickle.dump(newglove, f)