In [12]:
########################################################
# STEP 0 - Import necessary files and define functions
########################################################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import math
from gensim.models.doc2vec import Doc2Vec
import numpy, math
from sklearn.cluster import SpectralClustering, DBSCAN, AgglomerativeClustering
import statistics as stats 

mod = Doc2Vec.load('doc2vecR.200.30.20.5.1550908281.eAp.trained')

# Cosine similarity: 1=identical, 0=completely different
def cos_sim (av, bv):
    return (sum(av*bv)/math.sqrt(sum(av*av)*sum(bv*bv)))

# Distinguish docvecs vectors between projects, developers, and languages
def typeCheck(vec):
    field = vec[0]
    maybeDev = False
    isDev = False
    isPro = False
    isLan = False
    for c in field:
        if c == '<' and not maybeDev:
            maybeDev = True
        elif c == '>' and maybeDev:
            isDev = True
        elif c == '_':
            isPro = True
    if isDev:
        return 'dev'
    elif isPro:
        return 'pro'
    else :
        return 'lan'
    
def print_clusters(cluster, names):
    for i in range(max(cluster)+1):
        print('-'*40)
        print("Group " + str(i))
        for j in range(len(cluster)):
            if cluster[j] == i:
                print(names[j])

def array_clusters(cluster, names):
    res = []
    for i in range(max(cluster)+1):
        res.append([])
        for j in range(len(cluster)):
            if cluster[j] == i:
                temp = names[j]
                res[-1].append(temp)
    return res

# Import questions into dataframe
dTypes = {'Language': object, 'Library': object, 'Unnamed: 0': object, 'Id': object, 'PostTypeId': object, 'AcceptedAnswerId': object, 'CreationDate': object, 'Score': object, 'ViewCount': object, 'Body': object, 'LastEditDate': object, 'LastActivityDate': object, 'Title': object, 'Tags': object, 'AnswerCount': object, 'ClosedDate': object}
dateCols = ['CreationDate', 'LastEditDate', 'LastActivityDate', 'ClosedDate']
QuestionLibs = pd.read_csv('SO_BDA_Q_Libraries_in_Skill_Space.csv', dtype=dTypes, parse_dates=dateCols)

QuestionLibs = QuestionLibs[~QuestionLibs.ViewCount.str.contains("-", na=False)]
QuestionLibs = QuestionLibs[~QuestionLibs.PostTypeId.str.contains("-", na=False)]
QuestionLibs = QuestionLibs[~QuestionLibs["Unnamed: 0"].str.contains("-", na=False)]
QuestionLibs = QuestionLibs[~QuestionLibs["Id"].str.contains("-", na=False)]
QuestionLibs = QuestionLibs[~QuestionLibs["AcceptedAnswerId"].str.contains("-", na=False)]
QuestionLibs = QuestionLibs[~QuestionLibs["Score"].str.contains("-", na=False)]
QuestionLibs = QuestionLibs[~QuestionLibs["AnswerCount"].str.contains("-", na=False)]

# Reformat numeric and datetime columns correctly 
QuestionLibs["Unnamed: 0"] = pd.to_numeric(QuestionLibs["Unnamed: 0"])
QuestionLibs["Unnamed: 0.1"] = pd.to_numeric(QuestionLibs["Unnamed: 0.1"])
QuestionLibs["Id"] = pd.to_numeric(QuestionLibs["Id"])
QuestionLibs["PostTypeId"] = pd.to_numeric(QuestionLibs["PostTypeId"])
QuestionLibs["AcceptedAnswerId"] = pd.to_numeric(QuestionLibs["AcceptedAnswerId"])
QuestionLibs["CreationDate"] = pd.to_datetime(QuestionLibs["CreationDate"])
QuestionLibs["Score"] = pd.to_numeric(QuestionLibs["Score"])
QuestionLibs["ViewCount"] = pd.to_numeric(QuestionLibs["ViewCount"])
QuestionLibs["LastEditDate"] = pd.to_datetime(QuestionLibs["LastEditDate"])
QuestionLibs["LastActivityDate"] = pd.to_datetime(QuestionLibs["LastActivityDate"])
QuestionLibs["AnswerCount"] = pd.to_numeric(QuestionLibs["AnswerCount"])
QuestionLibs["ClosedDate"] = pd.to_datetime(QuestionLibs["ClosedDate"])

QuestionLibs = QuestionLibs[~QuestionLibs["CreationDate"].isna()]

# Import answers into dataframe
dTypes = {'Language': object, 'Library': object, 'Unnamed: 0': object, 'Id': object, 'PostTypeId': object, 'ParentId': object, 'CreationDate': object, 'Score': object, 'ViewCount': object, 'Body': object, 'LastEditDate': object, 'LastActivityDate': object, 'CommentCount': object}
dateCols = ['CreationDate', 'LastEditDate', 'LastActivityDate']
AnswerLibs = pd.read_csv('SO_BDA_A_Libraries_in_Skill_Space.csv', dtype=dTypes, parse_dates=dateCols)

AnswerLibs = AnswerLibs[~AnswerLibs.ViewCount.str.contains("-", na=False)]
AnswerLibs = AnswerLibs[~AnswerLibs.PostTypeId.str.contains("-", na=False)]
AnswerLibs = AnswerLibs[~AnswerLibs["Unnamed: 0"].str.contains("-", na=False)]
# AnswerLibs = AnswerLibs[~AnswerLibs["Unnamed: 0.1"].str.contains("-", na=False)]
AnswerLibs = AnswerLibs[~AnswerLibs["Id"].str.contains("-", na=False)]
AnswerLibs = AnswerLibs[~AnswerLibs["ParentId"].str.contains("-", na=False)]
AnswerLibs = AnswerLibs[~AnswerLibs["Score"].str.contains("-", na=False)]
AnswerLibs = AnswerLibs[~AnswerLibs["CommentCount"].str.contains("-", na=False)]

AnswerLibs = AnswerLibs[~AnswerLibs["CreationDate"].isna()]

# Reformat numeric and datetime columns correctly 
AnswerLibs["Unnamed: 0"] = pd.to_numeric(AnswerLibs["Unnamed: 0"])
AnswerLibs["Unnamed: 0.1"] = pd.to_numeric(AnswerLibs["Unnamed: 0.1"])
AnswerLibs["Id"] = pd.to_numeric(AnswerLibs["Id"])
AnswerLibs["PostTypeId"] = pd.to_numeric(AnswerLibs["PostTypeId"])
AnswerLibs["ParentId"] = pd.to_numeric(AnswerLibs["ParentId"])
AnswerLibs["CreationDate"] = pd.to_datetime(AnswerLibs["CreationDate"])
AnswerLibs["Score"] = pd.to_numeric(AnswerLibs["Score"])
AnswerLibs["ViewCount"] = pd.to_numeric(AnswerLibs["ViewCount"])
AnswerLibs["LastEditDate"] = pd.to_datetime(AnswerLibs["LastEditDate"])
AnswerLibs["LastActivityDate"] = pd.to_datetime(AnswerLibs["LastActivityDate"])
AnswerLibs["CommentCount"] = pd.to_numeric(AnswerLibs["CommentCount"])

QuestionLibs["AnswerCount"] = QuestionLibs["AnswerCount"].replace(0.0, np.nan)
AnswerLibs["CommentCount"] = AnswerLibs["CommentCount"].replace(0.0, np.nan)

In [13]:
########################################################
# STEP 2 - Form similarity matrix of libraries that 
#          appear > 20 times in both Q's and A's
########################################################

Q_CommonLibs = QuestionLibs.groupby('Language')['Library'].value_counts().loc[lambda x : x>19].rename_axis(['Language','Library']).to_frame('counts')
A_CommonLibs = AnswerLibs.groupby('Language')['Library'].value_counts().loc[lambda x : x>19].rename_axis(['Language','Library']).to_frame('counts')
QA_CommonLibs = pd.merge(A_CommonLibs, Q_CommonLibs, on=["Language","Library"])
QA_CommonLibsList = QA_CommonLibs.index.to_frame(index=False)['Library'].unique().tolist()

data = []
for lib in QA_CommonLibsList:
    row = []
    for l in QA_CommonLibsList:
        calc = cos_sim(mod.wv.get_vector(lib), mod.wv.get_vector(l))
        row.append(calc)
    data.append(row)

SimMatrix = pd.DataFrame(data, index=QA_CommonLibsList, columns=QA_CommonLibsList)
SimMatrixNPArray = SimMatrix.to_numpy()
SimMatrixNPMatrix = np.matrix(SimMatrixNPArray)

In [14]:
########################################################
# STEP 3 - Perform clustering for different numbers of 
#          clusters and graph the average std. dev. 
########################################################

res = []
sd = []
title = []
for i in range(1, 50):
    clustering = SpectralClustering(i).fit_predict(SimMatrixNPMatrix)
    res.append(clustering)


k_val_tests = []
max_num = 40

for m in range(max_num):
    # produce a set of clusters
    res_array = array_clusters(res[m], QA_CommonLibsList)
    clus_av = []

    # iterate through each cluster
    for j in range(len(res_array)):

        clus = res_array[j]
        clus_sd = []

        # iterate through each element in the cluster being the bench vector
        for k in range(len(clus)):
            bench = mod.wv.get_vector(clus[k])
            temp = []

            # calc cos sim of bench vec to all other vectors  
            for i in range(len(clus)):
                temp.append(cos_sim(bench, mod.wv.get_vector(clus[i])))
            try:  
                sd = stats.stdev(temp)
            except stats.StatisticsError:  
                sd = 0.0
            clus_sd.append(sd)

        try:  
            clus_av.append(stats.mean(clus_sd))
        except stats.StatisticsError:  
            clus_av.append(0.0)
    
    k_val_tests.append(stats.mean(clus_av))

k_index = list(range(1, max_num+1))
k_val_tests
plt.figure(figsize=(20,10))
plt.plot(k_index, k_val_tests, label = "Line", marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Average SD of nodes in each cluster')
plt.title('Maximising cluster numbers')
plt.xticks(np.arange(min(k_index), max(k_index)+1, 1.0))
plt.show()



ValueError: n_samples=10 should be >= n_clusters=11.