<h1>The new db-index: using clustering and semantic homogenity and principle component analysis</h1>

In [269]:
from datasets import load_dataset
import random
import spacy as sp
import pandas as pd
import numpy as np
import en_core_web_lg
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [270]:
nlp = en_core_web_lg.load()

In [271]:
testdataset = load_dataset("DevamMondal/dbindexdemo")
testdataset.set_format(type="pandas")
testdataset = pd.DataFrame(testdataset['train'][:])

Vectorized text using word emebddings through SpaCy

In [308]:
testdataset["vectors"] = testdataset['report'].apply(nlp).apply(lambda x : x.vector)
testdataset

Unnamed: 0,report,vectors
0,"""This political candidate is clearly the best ...","[-0.9651882, 2.2312863, -0.6606929, -0.2265092..."
1,"""This company always produces top-quality prod...","[-1.1817936, 0.8873633, 0.098376654, 1.8417091..."
2,"""The experiment results indicate a 20% increas...","[-3.2654548, -0.25928292, -0.5695937, 2.879466..."
3,"""According to the survey, 70% of respondents p...","[-4.600674, -0.12548637, 0.01597282, 2.4468849..."
4,"""The research findings suggest a correlation b...","[-0.80799526, 0.95509624, -0.2872335, 0.461630..."


In [309]:
vectors = pd.DataFrame(testdataset['vectors'])
for index, row in vectors.iterrows():
    row[0].astype(np.float16)

In [310]:
vectors['vectors']

0    [-0.9651882, 2.2312863, -0.6606929, -0.2265092...
1    [-1.1817936, 0.8873633, 0.098376654, 1.8417091...
2    [-3.2654548, -0.25928292, -0.5695937, 2.879466...
3    [-4.600674, -0.12548637, 0.01597282, 2.4468849...
4    [-0.80799526, 0.95509624, -0.2872335, 0.461630...
Name: vectors, dtype: object

In [311]:
v = testdataset['vectors'].values.tolist()
v

[array([-9.65188205e-01,  2.23128629e+00, -6.60692871e-01, -2.26509258e-01,
         6.58662844e+00, -4.10575628e-01,  1.22489417e-02,  7.73086488e-01,
        -2.22249722e+00, -1.69124091e+00,  7.93587112e+00,  3.10871422e-01,
        -3.70033789e+00,  8.17997873e-01, -6.79709971e-01,  2.79195738e+00,
         1.57658458e+00,  5.25943816e-01,  8.86741519e-01, -4.35301512e-01,
         1.72036603e-01, -1.41921866e+00, -1.24459422e+00, -1.48129380e+00,
        -1.86029506e+00,  1.59133524e-01, -7.47122884e-01, -1.41932797e+00,
        -1.43779719e+00, -3.31869841e-01,  3.99545819e-01,  9.14585769e-01,
        -1.90411580e+00, -1.72962594e+00, -1.35380864e+00,  5.28402746e-01,
        -7.28304803e-01,  1.02615714e+00,  3.06142807e-01,  1.17703426e+00,
         4.01145697e-01,  1.37362123e+00, -4.89159316e-01,  3.48138183e-01,
        -2.61659431e+00,  2.19487405e+00,  9.00371552e-01, -1.84549677e+00,
        -8.95624280e-01,  1.79176009e+00, -2.38556361e+00,  1.59424698e+00,
         8.0

In [192]:
km = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(v)

Clustering algorithm complete

In [198]:
#find labels for each element, add it to dataframe
testdataset['clusters'] = km.labels_

In [242]:
testdataset

Unnamed: 0,report,vectors,clusters
0,"""This political candidate is clearly the best ...","[-0.9651882, 2.2312863, -0.6606929, -0.2265092...",1
1,"""This company always produces top-quality prod...","[-1.1817936, 0.8873633, 0.098376654, 1.8417091...",1
2,"""The experiment results indicate a 20% increas...","[-3.2654548, -0.25928292, -0.5695937, 2.879466...",0
3,"""According to the survey, 70% of respondents p...","[-4.600674, -0.12548637, 0.01597282, 2.4468849...",2
4,"""The research findings suggest a correlation b...","[-0.80799526, 0.95509624, -0.2872335, 0.461630...",3


create clusters and add text

In [228]:
lists = [[] for _ in range(testdataset['clusters'].nunique())]
for index, row in testdataset.iterrows():
    lists[row[2]].append(row[0])

In [229]:
lists

[['"The experiment results indicate a 20% increase in productivity." '],
 ['"This political candidate is clearly the best choice for our country."',
  '"This company always produces top-quality products, unlike its competitors." '],
 ['"According to the survey, 70% of respondents prefer option A." '],
 ['"The research findings suggest a correlation between diet and heart health."']]

then use mb-index on all five

but what if we need to figure out how many clusters to use, we can use GridSearchCV

In [243]:
param_grid = {"n_clusters": range(1,5)}

grid = GridSearchCV(km, param_grid, scoring = 'adjusted_mutual_info_score')
grid.fit(v,testdataset['clusters'])

print (grid.best_score_)
print (grid.best_params_)
print (grid.best_estimator_)
grid.best_params_['n_clusters']

1.0
{'n_clusters': 1}
KMeans(n_clusters=1, n_init='auto', random_state=0)


1

therefore:

In [264]:
#assuming that both datasets passed in are already dataframes 
def dbindex(target, comparison):
    #load in spaCy pipeline
    nlp = en_core_web_lg.load()
    
    #Pick a random entry from the comparison dataset.
    index = random.randrange(0, len(comparison), 1)

    #Find that entry.
    comparisonEntry = comparison['sentence'][index]
    
    #Vectoritze entry.
    vectorizedComparisonEntry = nlp(comparisonEntry)
    
    #vectorize all entries in target dataset using above pipeline and add to the dataset
    target["vectors"] = target[target.columns[0]].apply(nlp).apply(lambda x : x.vector)
    
    #get list of vectors
    v = target['vectors'].values.tolist()
    
    #run initial k-means
    km = KMeans(n_clusters=4, random_state=0, n_init="auto").fit(v)
    
    #use grid search to optimize number of clusters
    param_grid = {"n_clusters": range(1,5)}
    grid = GridSearchCV(km, param_grid, scoring = 'adjusted_mutual_info_score')
    grid.fit(v,target['clusters'])

    #rerun k-means clustering
    km = KMeans(n_clusters=grid.best_params_['n_clusters'], random_state=0, n_init="auto").fit(v)
    
    #find labels for each element, add it to dataframe
    target['clusters'] = km.labels_
    
    #create list for each cluster
    clusters = [[] for _ in range(target['clusters'].nunique())]

    #add sentence to respective cluster
    for index, row in target.iterrows():
        clusters[row[2]].append(row[0])
        
    #initialize list of cosine similarities
    tcossim = []
    
    for cluster in clusters:
        
        #initialize cluster total cosine simliarity
        ccossim = 0
        
        #iterate through each sentence in a cluster (list) 
        for sentence in cluster: 
            vecSentence = nlp(sentence)
            ccossim += vecSentence.similarity(vectorizedComparisonEntry)
        
        tcossim.append(ccossim/len(cluster))
        
    return (sum(tcossim)/len(clusters))

In [265]:
offensivedataset = load_dataset("henryscheible/implicit_bias")
offensivedataset.set_format(type='pandas')
compdataset = offensivedataset['train'][1:]
compdataset = compdataset.drop(['category', 'label'], axis = 1)

In [266]:
dbindex(testdataset, compdataset)

0.5758231597494824

In [257]:
sentence = 'The brown dog jumps over the lazy frog'
vecSentence = nlp(sentence)
vecSentence[1]

brown

testing out pca