In [6]:
import numpy as np
import pandas as pd
from models import InferSent
import torch
import torch.nn as nn
import faiss
import nltk
from sentence_transformers import util

## Load Embeddings

In [2]:
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [3]:
use_cuda = False
infersent = infersent.cuda() if use_cuda else infersent
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [4]:
infersent.build_vocab_k_words(100000)

Vocab size : 100000


## Load Data

In [7]:
chunksize = 100000
chunks = []
chunk_no=0
for chunk in pd.read_csv('./actor_action_desc.csv', chunksize=chunksize):
    print(chunk_no)
    chunks.append(chunk)
    chunk_no+=1
    if chunk_no==2:
        break

0
1


In [8]:
df_case_action = pd.concat(chunks,ignore_index=True)
del chunks
df_case_action.columns = ['Actor','Action','Description']
df_case_action.head()
print(len(df_case_action))

200000


In [None]:
df_case_action_null = df_case_action[df_case_action['Action'].isnull()]
print(len(df_case_action_null))
df_case_action_null['Description'] = df_case_action_null['Description'].astype('str')

## Encode to Embeddings

In [9]:
embeddings = infersent.encode(df_case_action['Description'].values, bsize=128, tokenize=False, verbose=True)
embeddings.shape

AttributeError: 'float' object has no attribute 'split'

## Clustering

In [None]:
class FaissKMeans:
    def __init__(self, n_clusters=8, n_init=10, max_iter=300):
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter
        self.kmeans = None
        self.cluster_centers_ = None
        self.inertia_ = None

    def fit(self, X, y):
        self.kmeans = faiss.Kmeans(d=X.shape[1],
                                   k=self.n_clusters,
                                   niter=self.max_iter,
                                   nredo=self.n_init)
        self.kmeans.train(X.astype(np.float32))
        self.cluster_centers_ = self.kmeans.centroids
        self.inertia_ = self.kmeans.obj[-1]

    def predict(self, X):
        return self.kmeans.index.search(X.astype(np.float32), 1)[1]

In [None]:
f_kmeans = FaissKMeans(n_clusters=100)
f_kmeans.fit(embeddings,None)

In [None]:
preds = f_kmeans.predict(embeddings)

In [None]:
temp_df = df_case_action_null.iloc[:1000]
temp_df['label'] = preds
temp_df.head()

In [None]:
temp_df['label'].value_counts()

In [None]:
temp_df[temp_df['label']==2]['Description'].tolist()

In [None]:
temp_df.to_csv('./clustered_1000.csv',index=False)