In [1]:
# Make IPython display the value of every top-level expression in a cell,
# not only the last one (bare strings and frames in later cells are echoed).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os, pandas as pd, numpy as np, pprint as pp, matplotlib.pyplot as plt, json
# Raise the pandas display limit to 500 rows.
pd.options.display.max_rows = 500

# Project-local helpers (clustering, phonology, preprocessing, semantics, utilities).
from clustering.clustering_funcs import cluster_glove, merge_cluster_data
from phonology.funcs import vectorize_phonology, find_phonology_cosine_similarity_perPhonType, stitch_parts
from preprocess.funcs import remove_phonology_duplicate_videos, clear_phonology_df, clear_semantics_df
from semantics.funcs import find_semantics_cosine_similarity_pairwise
from utility.util_funcs import pandas_pair_signs_alphabetically

# Third-party / stdlib tools used by the analysis cells.
from itertools import product, combinations
from scipy.stats import pearsonr
from sklearn.cluster import AgglomerativeClustering


# Marker printed at the end of every cell to confirm it ran to completion.
print("\n\nCell executed.")



Cell executed.


# 1. Data pre-processing

In this part, we transform the datasets that we have to a format that allows comparisons:

- We use pretrained English GloVe vectors (trained on the Wikipedia corpus) to extract semantic similarity between signs
- For phonological similarity, we use annotated signs from the Gallaudet Dictionary for ASL, and the SignBank for BSL
- The method that we follow does not allow a *direct* comparison between ASL and BSL. We use the available phonological annotations as the basis of our semantic spaces, and these annotations have different entries across the two languages; therefore, the two semantic spaces, while they do share ~500 signs, are not identical.

## 1.1. Filtering the data

In this section, we identify the ASL and the BSL signs that we will be working with.

Each sign:
- Has to have a phonological transcription
- Has to have a semantic vector representation in the GloVe vectors
- Must not have duplicates

In [None]:
# Raw phonology spreadsheets: every non-hidden file ending in "xlsx".
phonRoot_raw = "data/raw/phonologyData/"
phon_paths_raw = [
    phonRoot_raw + file_name
    for file_name in os.listdir(phonRoot_raw)
    if file_name.endswith("xlsx") and not file_name.startswith(".")
]

# The duplicate-video removal is disabled by default (takes about 5 minutes).
# Uncomment the call inside the loop to run it; as-is the loop only lists the files.
for raw_path in phon_paths_raw:
    print(raw_path)
    # remove_phonology_duplicate_videos(raw_path)

print("\n\nCell executed.")

In [None]:
# Unique-sign phonology files: every non-hidden file ending in "gz".
phonRoot_unique = "data/transforming/phonologyData/unique_signs/"
phon_paths_uniqueSigns = [
    phonRoot_unique + file_name
    for file_name in os.listdir(phonRoot_unique)
    if file_name.endswith("gz") and not file_name.startswith(".")
]

# Replace gloveRoot with the path to your local GloVe txt files.
gloveRoot = "../../../Downloads/glove/"
glovePaths = [
    gloveRoot + file_name
    for file_name in os.listdir(gloveRoot)
    if not file_name.startswith(".")
]

print("\n\nCell executed.")

In [None]:
# clear_phonology_df is disabled by default (takes about 5 minutes).
# Uncomment the call inside the loop to run it; as-is the loop only lists the files.
for unique_path in phon_paths_uniqueSigns:
    print(unique_path)
    # clear_phonology_df(unique_path)

print("\n\nCell executed.")

In [None]:
# clear_semantics_df is disabled by default (takes about 5 minutes).
# Uncomment the call to run it for every (phonology file, GloVe file) combination;
# as-is the loops only list the combinations.
for phon_path in phon_paths_uniqueSigns:
    for glove_path in glovePaths:
        print("\n", phon_path, glove_path)
        # clear_semantics_df(phon_path, glove_path)

print("\n\nCell executed.")

## 1.2. Transforming and Vectorizing the Phonology Data

In this section, we transform the phonology dataframes of ASL and BSL with the following goals in mind:

- All signs are lower-cased.
- Duplicate signs are removed. Only the first occurrence of a sign is kept. This is a necessary step to avoid skewing the data. For instance, the BSL phonology dataframe has more than 10 entries for the sign MAUVE. We only keep the first occurrence of MAUVE in the order of the rows in the dataframe.

In [None]:
# Cleaned phonology files: every non-hidden entry of the output directory.
phonRoot_clean = "data/output/phonologyData/"
phonPaths_clean = [
    phonRoot_clean + file_name
    for file_name in os.listdir(phonRoot_clean)
    if not file_name.startswith(".")
]

# vectorize_phonology is disabled by default (takes a couple of minutes).
# Uncomment the call inside the loop to run it; as-is the loop only lists the files.
for clean_path in phonPaths_clean:
    print(clean_path)
    # vectorize_phonology(clean_path)

print("\n\nCell executed.")

## 2. Finding Phonological Similarity (Pairwise Cosine Similarity)

In [None]:
"""FINDING PHONOLOGICAL SIMILARITY FROM VECTORIZED DFs"""
# Vectorized phonology frames: every non-hidden entry of the output directory.
phonRoot_vectorized = "data/output/vectorizedPhonDFs/"
phonPaths_vectorized = [
    phonRoot_vectorized + file_name
    for file_name in os.listdir(phonRoot_vectorized)
    if not file_name.startswith(".")
]

# The similarity computation is disabled by default (takes over an hour).
# Uncomment the call inside the loop to run it; as-is the loop only lists the files.
for vectorized_path in phonPaths_vectorized:
    print(vectorized_path)
    # find_phonology_cosine_similarity_perPhonType(vectorized_path)

print("\n\nCell executed.")

## 3. Finding Semantic Similarity (Pairwise Cosine Similarity)

In [None]:
# Cleaned semantics files: every non-hidden entry of the output directory.
semPath_root = "data/output/semanticsData/"
semPaths = [
    semPath_root + file_name
    for file_name in os.listdir(semPath_root)
    if not file_name.startswith(".")
]

# The similarity computation is disabled by default (takes over an hour).
# Uncomment the call inside the loop to run it; as-is the loop only lists the files.
for sem_path in semPaths:
    print(sem_path)
    # find_semantics_cosine_similarity_pairwise(sem_path)

print("\n\nCell executed.")

## 4. Analyses

## 4.1. Pairwise Analysis

In [None]:
"""STITCHING TOGETHER PHONOLOGICAL SIMILARITY CSVs"""

# Directory receiving one stitched file per (language, phonType) combination.
masterPath = "data/output/vectorizedPhonDFs_Stitched_DFs/"
if not os.path.exists(masterPath):
    os.makedirs(masterPath)

languages = ["ASL", "BSL"]
phonTypes = ["ENTIRE", "LOC", "MOV", "HS"]

# NOTE: this loop is live code and runs as-is (takes about 5 minutes).
for language in languages:
    for phonType in phonTypes:
        print(language, phonType)
        parts_dir = "data/output/vectorized_PhonSim/" + language + "/" + phonType + "/"
        stitched_df = stitch_parts(parts_dir)
        stitched_out_path = masterPath + language + "_" + phonType + ".csv.gz"
        stitched_df.to_csv(stitched_out_path, index=False, compression="gzip")

print("\n\nCell executed.")

In [None]:
"""STITCHING TOGETHER SEMANTIC SIMILARITY CSVs"""

# Directory receiving one stitched file per (language, GloVe dimension) combination.
masterPath = "data/output/SemSim_StitchedDFs/"
if not os.path.exists(masterPath):
    os.makedirs(masterPath)

languages = ["ASL", "BSL"]
dims = ["50d", "100d", "200d", "300d"]

# NOTE: this loop is live code and runs as-is (takes about 5 minutes).
for language in languages:
    for dim in dims:
        print(language, dim)
        parts_dir = "data/output/SemSim/" + language + "/" + dim + "/"
        stitched_df = stitch_parts(parts_dir)
        stitched_out_path = masterPath + language + "_" + dim + ".csv.gz"
        stitched_df.to_csv(stitched_out_path, index=False, compression="gzip")

print("\n\nCell executed.")

In [None]:
"""PAIRWISE CORRELATIONS ANALYSIS"""

masterPhonPath = "data/output/vectorizedPhonDFs_Stitched_DFs/"
masterSemPath = "data/output/SemSim_StitchedDFs/"
semPaths = [masterSemPath + p for p in os.listdir(masterSemPath) if not p.startswith(".")]
phonPaths = [masterPhonPath + p for p in os.listdir(masterPhonPath) if not p.startswith(".")]

# Create the output directory up front so the final to_csv cannot fail after
# the (long) loop has already done all the work.
pairwiseResultsDir = "results/pairwise/"
os.makedirs(pairwiseResultsDir, exist_ok=True)

# One record per same-language (semantic file, phonology file) combination.
# Rows are collected in a list and turned into a frame once at the end:
# the previous `frame.iloc[i]["col"] = value` pattern is chained assignment,
# which pandas does not guarantee to write through (and never does under
# Copy-on-Write), and it required hard-coding the number of rows (32).
pairwise_records = []

# Set df_Limit to an int to process only the first N combinations while testing.
df_Limit = None
for s, p in list(product(semPaths, phonPaths))[:df_Limit]:
    # File names look like "<LANGUAGE>_<dim or phonType>.csv.gz".
    language_sem = s.split("/")[-1].split("_")[0]
    language_phon = p.split("/")[-1].split("_")[0]

    # Only compare phonology and semantics within the same language.
    if language_sem != language_phon:
        continue

    i = len(pairwise_records)
    language = language_sem
    dim = s.split("/")[-1].split("_")[1].split(".")[0]
    phonType = p.split("/")[-1].split("_")[1].split(".")[0]

    print(i, language, dim, phonType)

    #Loading CSVs
    phonDF = pd.read_csv(p)
    semDF = pd.read_csv(s)

    # Key every row by its sign pair (via the project helper) so the two frames
    # can be put in the same row order before correlating.
    phonDF["paired"] = phonDF.apply(pandas_pair_signs_alphabetically, axis=1)
    semDF["paired"] = semDF.apply(pandas_pair_signs_alphabetically, axis=1)

    phonDF = phonDF.drop(["s1", "s2"], axis=1).set_index("paired").sort_index()
    semDF = semDF.drop(["s1", "s2"], axis=1).set_index("paired").sort_index()

    # Index.equals also copes with frames of different length, where an
    # element-wise `==` comparison would raise.
    indices_match = phonDF.index.equals(semDF.index)
    if indices_match:
        print("All indices match.\n\n")
        #Calculate correlations
        pearson_r, p_value = pearsonr(phonDF[phonType+"_cosineSim"], semDF["sem_cosineSim"])
    else:
        # pearsonr pairs values by position, so a correlation over misaligned
        # rows would be meaningless; record NaN instead of a bogus number.
        print("INDICES DO NOT MATCH.\n\n")
        pearson_r, p_value = float("nan"), float("nan")

    pairwise_records.append({
        "phonType": phonType,
        "semDimension": dim,
        "language": language,
        "pearson_r": pearson_r,
        "p-value": p_value,
    })

    """ADD VISUALIZATIONS HERE"""

    # Free the large pairwise frames before loading the next combination.
    del phonDF
    del semDF

df_pairwise_calculations = pd.DataFrame(
    pairwise_records,
    columns=["phonType", "semDimension", "language", "pearson_r", "p-value"],
)
df_pairwise_calculations.to_csv(pairwiseResultsDir + "pairwise_results.csv", index=False)
df_pairwise_calculations

print("\n\nCell executed.")

### Results of the Pairwise Analysis:

There is no apparent linear relationship between phonological similarity (measured as the cosine similarity between the vectorized phonological representations of two signs) and semantic similarity (measured with the same cosine method, except that the vectors are GloVe vectors pretrained on the Wikipedia corpus).

Phonology is arbitrary when the lexicon of an SL is taken as a whole.

This brings us to our next analysis: Hierarchical Clustering.

## 4.2. Hierarchical Clustering Analysis

In this section, we ask the following question: if there is no linear relationship between the phonology and semantics of pairs of signs in a semantically ***unorganized*** lexicon of an SL, can we find relationships between pairs of signs within clusters of semantically related signs?

1. We first cluster signs in a semantic vector space using agglomerative hierarchical clustering
2. We then look for pairwise relations between pairs of signs within individual clusters.
3. This dramatically reduces the number of sign pairs that we study, as the pairing process does not cross cluster boundaries in a given semantic vector space.

In [None]:
semPath_root = "data/output/semanticsData/"
semPaths = [semPath_root+p for p in os.listdir(semPath_root) if not p.startswith(".")]

languages = ["ASL", "BSL"]
dims = ["50d", "100d", "200d", "300d"]

# Pruning heights 0..99; one column of cluster counts per (language, dim).
heightRange = range(0,100)
tuples = product(languages, dims)
columns = pd.MultiIndex.from_tuples(tuples, names=["language", "dim"])
clusterN_df = pd.DataFrame(index=heightRange, columns=columns)

"""CLUSTERING -- pruning heights 0% through 100%"""
for language in languages:
    for dim in dims:
        masterPath = "results/clustering/clusterIDs/"+language+'/'+dim+'/'

        if not os.path.exists(masterPath):
            os.makedirs(masterPath)

        p = semPath_root+language+"_Semantics_"+dim+"_clean.csv.gz"
        print(p)

        clusterLabels_dict = {}
        for heit in heightRange:

            # Lightweight progress indicator.
            if heit%25 == 0:
                print(heit)

            signs, silhouette, clusterLabels, clusters_len = cluster_glove(p, height=heit)

            # Keep only non-degenerate clusterings: more than one cluster and
            # fewer clusters than signs.
            if 1 < clusters_len < len(signs):
                # Single .loc call: the former `frame[col].loc[row] = value`
                # is chained assignment and may write to a temporary copy.
                clusterN_df.loc[heit, (language, dim)] = clusters_len

                # Group signs by cluster label in one pass. Going through a
                # sign -> label dict first keeps the previous behavior for a
                # repeated sign (its last label wins); cluster keys appear in
                # order of first occurrence.
                temp_dict = {sign:cluster for sign, cluster in zip(signs, clusterLabels)}
                members_by_cluster = {}
                for sign, cluster in temp_dict.items():
                    members_by_cluster.setdefault("C"+str(cluster).zfill(4), []).append(sign)

                height_key = "height_"+str(heit).zfill(3)
                clusterLabels_dict[height_key] = {
                    "clusters": members_by_cluster,
                    "silhouette_score": silhouette,
                }

        with open(masterPath+language+"_"+dim+"_"+"clusterIDs.json", "w") as outfile:
            json.dump(clusterLabels_dict, outfile)


clusterN_df = clusterN_df.reset_index()
clusterN_df = clusterN_df.rename(columns={"index":"height"})
clusterN_df.to_csv("results/clustering/clusterN_by_height.csv", index=True)
clusterN_df

print("\n\nCell executed.")


In [None]:
languages = ["ASL", "BSL"]
dims = ["50d", "100d", "200d", "300d"]

masterPath = "results/clustering/clusterIDs/"

# One record per (language, dim, pruning height) found in the cluster-ID JSON
# files. Records are collected in a list and converted to a frame once: the
# previous `frame.iloc[i]["col"] = value` pattern is chained assignment (not
# guaranteed to write through) and needed a pre-sized frame of 100 rows per VSM.
silhouette_records = []
for language in languages:
    for dim in dims:
        read_json_path = masterPath+language+"/"+dim+"/"+language+"_"+dim+"_clusterIDs.json"
        print(read_json_path)
        with open(read_json_path, "r") as read_file:
            clusterLabels = json.load(read_file)

        # Keys look like "height_007"; the numeric suffix is the pruning height.
        for heit, height_entry in clusterLabels.items():
            silhouette_records.append({
                "language": language,
                "dim": dim,
                "prune_height": int(heit.split("_")[1]),
                "cluster_N": len(height_entry["clusters"]),
                "silhouette_score": height_entry["silhouette_score"],
            })

silhous = pd.DataFrame(
    silhouette_records,
    columns=["language", "dim", "prune_height", "cluster_N", "silhouette_score"],
)
# Drop any row with a missing value (e.g. a NaN silhouette score).
silhous = silhous.dropna()
silhous.head()
silhous.to_csv("results/clustering/clustering_silhouette_scores.csv", index=False)

print("\n\nCell executed.")


In [None]:
# For every (language, dim) vector space, report the index label of the row in
# `silhous` with the highest silhouette score (higher = better clustering quality).
silhouette_by_vsm = silhous.groupby(["language", "dim"])["silhouette_score"]
silhouette_by_vsm.idxmax()

In [None]:
languages = ["ASL", "BSL"]
dims = ["50d", "100d", "200d", "300d"]

masterPath = "results/clustering/clusterIDs/"

for language, dim in product(languages, dims):
    read_json_path = masterPath+language+"/"+dim+"/"+language+"_"+dim+"_clusterIDs.json"
    print(read_json_path)
    with open(read_json_path, "r") as read_file:
        clusterLabels = json.load(read_file)

    masterOutPath = "results/clustering/signPairs_byCluster/"+language+"/"+dim+"/"
    if not os.path.exists(masterOutPath):
        os.makedirs(masterOutPath)

    for heit, height_entry in clusterLabels.items():
        # Keys look like "height_007"; the numeric suffix is the pruning height.
        height = int(heit.split("_")[1])

        # Only heights 3 through 10 are processed. The window comes from the
        # silhouette scores: all VSMs have their max silhouette score between
        # heights 4 and 7, so we only keep heights where cluster validity is better.
        if not (2 < height < 11):
            continue

        # Every unordered pair of signs inside each cluster with 2+ members;
        # the two signs of a pair are stored as a sorted list.
        signPairs = {}
        for cluster, members in height_entry["clusters"].items():
            if len(members) > 1:
                signPairs[cluster] = [sorted(pair) for pair in combinations(members, 2)]

        out_json_path = masterOutPath+language+"_"+dim+"_height"+heit+"_signPairs_byCluster.json"
        with open(out_json_path, 'w') as outfile:
            json.dump(signPairs, outfile)

print("\n\nCell executed.")


In [2]:
# Path of the merged cluster CSV. This cell only defines the path: the
# (commented-out) merge_cluster_data call in the next cell takes it as its
# argument, and a later cell reads `allClusters` back from it.
outPath = "results/clustering/allClusters.csv"

In [None]:
# The merge is disabled by default (takes about 30 minutes). Uncomment the
# line below to run it; as-is this cell only prints the completion marker.
# allClusters = merge_cluster_data(outPath)

print("\n\nCell executed.")

In [3]:
allClusters = pd.read_csv(outPath)
allClusters.head()
print(len(allClusters))

languages = ["ASL",
             "BSL"
            ]
dims = ["50d",
        "100d", "200d", "300d"
       ]

phonTypes = ["ENTIRE", "HS", "MOV", "LOC"]

# Pruning heights lowerBound..upperBound-1 (3 through 10).
lowerBound = 3
upperBound = 11
heights = [i for i in range(lowerBound, upperBound)]


phon_masterPath = "data/output/vectorizedPhonDFs_Stitched_DFs/"
sem_masterPath = "data/output/SemSim_StitchedDFs/"

# Per-(language, dim, height) frames are collected in a list and concatenated
# once: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop copies all accumulated rows on every iteration.
cluster_frames = []
for language in languages:
    # The phonology similarity files depend only on the language, so load the
    # four of them once per language instead of once per (dim, height).
    # This trades memory (four frames held at a time) for far fewer CSV reads.
    phonDFs = {}
    for phonType in phonTypes:
        phonDF = pd.read_csv(phon_masterPath+language+"_"+phonType+".csv.gz")
        # NOTE(review): this key assumes s1/s2 are stored in the same order as
        # the pairs in allClusters; pairs ordered differently are silently
        # dropped by the inner merges below -- confirm against the inputs.
        phonDF["signPair"] = phonDF["s1"] + " + " + phonDF["s2"]
        phonDFs[phonType] = phonDF

    for dim in dims:
        # The semantic similarity file depends only on (language, dim).
        semDF = pd.read_csv(sem_masterPath+language+"_"+dim+".csv.gz")
        semDF["signPair"] = semDF["s1"] + " + " + semDF["s2"]

        for heit in heights:
            df = allClusters[(allClusters.language == language) & (allClusters.dim == dim) & (allClusters.height==heit)].set_index("signPair")
            # Attach the semantic similarity, then drop the columns brought in
            # by the merge (s1, s2) together with the `semSim` column.
            df = df.merge(semDF, on='signPair')
            df = df.drop(["s1", "s2", "semSim"], axis=1)

            for phonType in phonTypes:
                print(language, dim, heit, phonType)
                df = df.merge(phonDFs[phonType], on='signPair')
                df = df.drop(["s1", "s2", phonType+"_sim"], axis=1)

            cluster_frames.append(df)

    # Release this language's phonology frames before loading the next ones.
    del phonDFs

finalDF = pd.concat(cluster_frames, ignore_index=True)

finalDF.to_csv("results/clustering/allClusters_withValues.csv.gz", compression = "gzip", index=False)
    

Unnamed: 0,language,dim,height,clusterID,signPair,semSim,HS_sim,LOC_sim,MOV_sim,ENTIRE_sim
0,ASL,50d,6,C0000,a + another,,,,,
1,ASL,50d,6,C0000,a + home,,,,,
2,ASL,50d,6,C0000,a + key,,,,,
3,ASL,50d,6,C0000,a + main,,,,,
4,ASL,50d,6,C0000,a + run,,,,,


305442
ASL 50d 3 ENTIRE
ASL 50d 3 HS
ASL 50d 3 MOV
ASL 50d 3 LOC
ASL 50d 4 ENTIRE
ASL 50d 4 HS
ASL 50d 4 MOV
ASL 50d 4 LOC
ASL 50d 5 ENTIRE
ASL 50d 5 HS
ASL 50d 5 MOV
ASL 50d 5 LOC
ASL 50d 6 ENTIRE
ASL 50d 6 HS
ASL 50d 6 MOV
ASL 50d 6 LOC
ASL 50d 7 ENTIRE
ASL 50d 7 HS
ASL 50d 7 MOV
ASL 50d 7 LOC
ASL 50d 8 ENTIRE
ASL 50d 8 HS
ASL 50d 8 MOV
ASL 50d 8 LOC
ASL 50d 9 ENTIRE
ASL 50d 9 HS
ASL 50d 9 MOV
ASL 50d 9 LOC
ASL 50d 10 ENTIRE
ASL 50d 10 HS
ASL 50d 10 MOV
ASL 50d 10 LOC
ASL 100d 3 ENTIRE
ASL 100d 3 HS
ASL 100d 3 MOV
ASL 100d 3 LOC
ASL 100d 4 ENTIRE
ASL 100d 4 HS
ASL 100d 4 MOV
ASL 100d 4 LOC
ASL 100d 5 ENTIRE
ASL 100d 5 HS
ASL 100d 5 MOV
ASL 100d 5 LOC
ASL 100d 6 ENTIRE
ASL 100d 6 HS
ASL 100d 6 MOV
ASL 100d 6 LOC
ASL 100d 7 ENTIRE
ASL 100d 7 HS
ASL 100d 7 MOV
ASL 100d 7 LOC
ASL 100d 8 ENTIRE
ASL 100d 8 HS
ASL 100d 8 MOV
ASL 100d 8 LOC
ASL 100d 9 ENTIRE
ASL 100d 9 HS
ASL 100d 9 MOV
ASL 100d 9 LOC
ASL 100d 10 ENTIRE
ASL 100d 10 HS
ASL 100d 10 MOV
ASL 100d 10 LOC
ASL 200d 3 ENTIRE
ASL 200

Unnamed: 0,signPair,language,dim,height,clusterID,sem_cosineSim,ENTIRE_cosineSim,HS_cosineSim,MOV_cosineSim,LOC_cosineSim
0,a + another,ASL,50d,3,C0000,0.9506,0.6909,0.5736,0.2236,1.0000
1,abandon + promise,ASL,50d,3,C0001,0.7741,0.3673,0.4251,0.2236,0.3677
2,above + below,ASL,50d,3,C0004,0.9472,0.8412,0.7504,1.0000,0.9023
3,accept + consider,ASL,50d,3,C0006,0.8926,0.5569,0.5436,0.2500,0.6702
4,accident + crash,ASL,50d,3,C0007,0.8906,0.6626,0.6948,0.7500,0.6010
...,...,...,...,...,...,...,...,...,...,...
305437,portugal + spain,BSL,300d,10,C0216,0.6838,0.7030,0.8401,0.4082,0.6782
305438,pyramid + vase,BSL,300d,10,C0218,0.2403,0.6680,0.7503,0.6667,0.5484
305439,ski + sled,BSL,300d,10,C0219,0.3274,0.5407,0.5003,0.2357,0.7304
305440,ski + wheelchair,BSL,300d,10,C0219,0.2812,0.5916,0.7697,0.5164,0.3349


In [33]:
# Per-cluster mean of every numeric similarity column. This equals the former
# sum()/count() (both skip missing values) but does not try to sum and divide
# the string `signPair` column, which raises in current pandas.
# The name `summed` is kept for continuity even though it holds means.
summed = finalDF.groupby(["language", "dim", "height", "clusterID"]).mean(numeric_only=True).reset_index()


languages = ["ASL",
             "BSL"
            ]

dims = ["50d",
        "100d", "200d", "300d"
       ]

phonTypes = ["ENTIRE", "HS", "MOV", "LOC"]

# Pruning heights lowerBound..upperBound-1 (3 through 10).
lowerBound = 3
upperBound = 11
heights = [i for i in range(lowerBound, upperBound)]

# One record per (language, dim, height, phonType). Records are collected in a
# list and converted once: `frame.iloc[i]["col"] = value` is chained assignment
# and is not guaranteed to write into the frame.
corr_records = []
for (language, dim, heit) in product(languages, dims, heights):
    # Cluster-level means for this vector space and pruning height.
    df = summed[(summed.language == language) & (summed.dim == dim) & (summed.height==heit)]

    for phonType in phonTypes:
        # Correlate mean semantic similarity with mean phonological similarity
        # across the clusters of this (language, dim, height).
        r, p = pearsonr(df["sem_cosineSim"], df[phonType+"_cosineSim"])

        corr_records.append({
            "language": language,
            "dim": dim,
            "prune_height": heit,
            "phonType": phonType,
            "pearsonR_with_semSim": r,
            "p-value": p,
        })

corrs = pd.DataFrame(
    corr_records,
    columns=["language", "dim", "prune_height", "phonType", "pearsonR_with_semSim", "p-value"],
)
corrs.to_csv("results/clustering/clustering_corr_coefs.csv", index=False)
corrs

Unnamed: 0,language,dim,prune_height,phonType,pearsonR_with_semSim,p-value
0,ASL,50d,3,ENTIRE,0.174608,0.00554
1,ASL,50d,3,HS,0.124424,0.048945
2,ASL,50d,3,MOV,0.071957,0.256039
3,ASL,50d,3,LOC,0.181724,0.003868
4,ASL,50d,4,ENTIRE,0.162923,8e-05
5,ASL,50d,4,HS,0.09711,0.019221
6,ASL,50d,4,MOV,0.116853,0.004799
7,ASL,50d,4,LOC,0.159268,0.000116
8,ASL,50d,5,ENTIRE,0.188604,1.7e-05
9,ASL,50d,5,HS,0.132613,0.002616


In [None]:
# allClusters.groupby(["language", "dim", "height"])["signPair"].count()

# ***IGNORE AFTER HERE***

In [None]:
# plotOutputPath = "plots/heights_elbow/"

# if not os.path.exists(plotOutputPath):
#     os.makedirs(plotOutputPath)
    
# y_ticks = np.arange(0, 2000, 400)



# for p in semPaths[:limit]:
#     plt.figure(figsize=(12,12))
#     print(p, "\n")
    
#     language = p.split("/")[-1].split("_")[0]
#     dim = p.split("/")[-1].split("_")[2]
    
#     heights = []
#     for heit in range(100):
#         clusters = cluster_glove(p, height=heit)
#         clustersN = len(list(set([x[0] for x in clusters])))
#         heights += [(heit, clustersN)]
        
#     plt.scatter([x[0] for x in heights], [x[1] for x in heights])
#     _=plt.yticks(y_ticks)
#     _=plt.axes().set_ylim(-100, 2000)
#     _=plt.axes().set_xlim(-5,100)
#     plt.savefig(plotOutputPath+language+"_"+dim+".png", dpi=300)
#     plt.show()

    

In [None]:
# import pandas as pd, numpy as np, os
# import matplotlib.pyplot as plt
# from sklearn.cluster import AgglomerativeClustering
# from scipy.cluster.hierarchy import dendrogram
# from scipy.cluster import hierarchy

# distance_Tresholds = [
#     6,
# #                         7,
# #                       8,9,10,
# #                       11, 12,
#     13, 14, 15
    
#                      ]

# inputPath = semPaths
# outputPath = "../../04.Analyses/hierarchicalClustering/gloVe_VSMs_ClusteredHierarchical/"
# gloves = [x for x in os.listdir(inputPath) if not x.startswith(".")]

# method = "ward"
# distanceMethod = "euclidean"

# Limit = None
# for glove in gloves[:Limit]:
#     datasetName = glove
#     print("Now working on ", glove)
#     data = pd.read_csv(inputPath+datasetName).set_index("label")[:]
#     print("length of data:", len(data))
#     data.head()
#     dataOutput = data.reset_index()
#     X = data
#     signs = [x for x in data.index]
#     # print(signs)

#     for threshold in distance_Tresholds:
#         model = AgglomerativeClustering(linkage= method,
#                                         affinity= distanceMethod,
#                                         distance_threshold=threshold,
#                                         n_clusters=None,
#                                         compute_distances=True
#                                        )
#         model_fit = model.fit(X)
#         clusters = ["C"+str(c) for c in list(model.fit_predict(X))]
#         dataOutput["clusters"] = clusters
# #             dataOutput["labels"] = signs

#         dataOutput.to_csv(outputPath+glove[:-4]+"_height"+str(threshold)+".csv", index=False)


#         print("N of clusters: ", model_fit.n_clusters_)


# print("this cell executed.")


In [None]:
import pandas as pd

# Scratch inspection: load and display the vectorized ASL handshape frame.
asl_hs_vectorized_path = "data/output/vectorizedPhonDFs/ASL_HS_vectorizedDF.csv.gz"
pd.read_csv(asl_hs_vectorized_path)


