<a href="https://colab.research.google.com/github/DevashishX/AbstractClustering/blob/master/templates/GaussianMixtureGlovePmeans5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Set parameters***

In [15]:
__algo__ = "Gauss"              #Name of the Clustering algorithm
__emb__ = "glove"                #Name of the Word Embeddings used (glove, w2v, ftt), MUST set directory below
__sentemb__ = "pmeans5"          #Name of Sentence Embedding algorithm used
recnum = 3000                    #Number of records to be read from files
k = 50                           #Number of Clusters
usesqrt = False                  #Set value of k to sqrt of recnum, overrides k
randomsample = False             #Random Sampling to be True/False for records which are read
embedDir = "../MegaSentEmbs/"    #Directory where embeddings are saved for that selected embedding
modelDir = "../models/"          #Directory where models are saved
megadfDir = "../MegaDfs/"        #Directory Where Megadf is to be saved
plotDir = "../plots/"            #Directory where plots are saved
dumpDir = "../dump/"             #Directory where test outcomes are saved

# Actual Code

### imports and time

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import cluster, datasets, mixture
import seaborn as sns
import os, subprocess
import datetime, time
import pickle

In [17]:
start_time = time.time()
start_datetime = datetime.datetime.now()
print("Start time: ", start_time)
print("Start datetime: ", start_datetime)
# print(datetime.datetime.now()-start_datetime)

Start time:  1586869488.5868793
Start datetime:  2020-04-14 18:34:48.586950


### File Settings

In [18]:
oldlist = os.listdir(embedDir)
filelist = sorted([embedDir+f for f in oldlist if f[-3:]=="pkl"])
filenum = len(filelist)
smalllist = filelist[:filenum]
print("Length of Smalllist: ", len(smalllist))

Length of Smalllist:  116


### Number of Records
It is Recommended to Set this at the top parameters

In [19]:
recnum = recnum

### Read all the pandas dataframes

In [20]:
%%time
megadf = pd.DataFrame()

if randomsample == True:
    print("randomsample: ", randomsample)
    for f in smalllist:
        tempdf = pd.read_pickle(f)
        megadf = megadf.append(tempdf, ignore_index = True)
    megadf = megadf.sample(recnum, random_state=42)
else:
    print("randomsample: ", randomsample)
    for f in smalllist:
        tempdf = pd.read_pickle(f)
        megadf = megadf.append(tempdf, ignore_index = True)
        if megadf.shape[0] >= recnum:
            megadf = megadf[:recnum]
            break

print("megadf.shape: ", megadf.shape)

randomsample:  False
megadf.shape:  (3000, 4)
CPU times: user 101 ms, sys: 27.1 ms, total: 128 ms
Wall time: 129 ms


In [21]:
predata = megadf["embedding"]
data = np.matrix(predata.to_list())
print(data.shape)

(3000, 250)


### Number of Clusters
It is Recommended to Set this at the top parameters

In [22]:
if usesqrt == True:
    print("usesqrt: ", usesqrt)
    sqrt_k = int(np.sqrt(data.shape[0]))
    k = int(sqrt_k)
else:
    print("usesqrt: ", usesqrt)
    k = k
print("k: ", k)

usesqrt:  False
k:  50


## Clustering
Please modify the functions here to change algorithm

In [29]:
%%time
print("Starting Clustering Process")
model = mixture.GaussianMixture(n_components=k, covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, verbose=1)
model.fit(data)

print("Gaussian.fit(data) Done!")

Starting Clustering Process
Initialization 0
Initialization converged: True
Gaussian.fit(data) Done!
CPU times: user 6.95 s, sys: 238 ms, total: 7.19 s
Wall time: 3.71 s


### Saving the output data into vars

In [31]:
centroids = model.means_
labels = model.labels_
megadf["clusterlabel"]=labels
centroidDF = pd.DataFrame(centroids)


AttributeError: 'GaussianMixture' object has no attribute 'labels_'

## Plotting

In [None]:
plt.figure(figsize=(16,16))
titlestring = "{} with k={} records={} features={} using {}".format(__algo__, k, data.shape[0], data.shape[1], __emb__)
snsplot = sns.countplot("clusterlabel", data=megadf)
snsplot.xaxis.label.set_size(20)
snsplot.yaxis.label.set_size(20)
plt.title(
        titlestring,
        fontdict = {'fontsize' : 30}
    )

### *Name given to saved files*

In [None]:
name = "{}_{}_{}_K{}_R{}_F{}".format(__algo__, __emb__, __sentemb__, k, data.shape[0], data.shape[1])
name

### Saving Data

Save model

In [32]:
modelname = "{}_model.pkl".format(name)
pickle.dump(model, open(modelDir + modelname, 'wb'))

NameError: name 'name' is not defined

Save Plot

In [None]:
snspltname = "{}_plt.png".format(name)
snsplot.figure.savefig(plotDir + snspltname)

Save Megadf

In [None]:
clusterdfname = "{}_clustered_megadf.pkl".format(name)
megadf.to_pickle(megadfDir + clusterdfname)

Save Centroids

In [None]:
centroidDF = pd.DataFrame(centroids)
centroidDFname = "{}_centroids.pkl".format(name)
centroidDF.to_pickle(megadfDir + centroidDFname)
print(centroidDF.shape)
print(centroidDF.columns)

#### Total Time to Exec

In [None]:
end_time = time.time()
end_datetime = datetime.datetime.now()
# print("end_time:",  end_time)
print("end_datetime: ", end_datetime)
print("done! {}".format(round(end_time-start_time, 2)))
print("Time taken: ", str(end_datetime-start_datetime))

#### Open dataframe to test

In [None]:
sub = megadf.loc[:, ["id", "title", "abstract", "clusterlabel"]]
sub.tail()

In [None]:
megadf.columns