# Setting Up Evaluation Criteria:
-

In [None]:
!pip install -q octis

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.6/170.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO, WordEmbeddingsInvertedRBO, WordEmbeddingsInvertedRBOCentroid, LogOddsRatio, KLDivergence
from octis.evaluation_metrics.coherence_metrics import Coherence, WECoherencePairwise, WECoherenceCentroid

In [None]:
# Define dataset: start from an empty OCTIS Dataset, then fetch the
# "20NewsGroup" corpus into it by name.
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

In [None]:
# Write the fetched dataset to the 'hello_dataset' folder (relative to the
# current working directory) so its files can be inspected.
dataset.save('hello_dataset')

In [None]:
!head /content/hello_dataset/corpus.tsv

fax modem card sell mail	train	misc.forsale
run server server install run add	train	comp.windows.x
live part lead wait important remember judge judge guess close situation listen statement sense regard passage remember letter church people body talk work translation lack concern make sick point throw faith faith catch meaning offer explanation fire cold make aware child eternal	train	soc.religion.christian
doesn pain deserve die lie rape	train	talk.religion.misc
sale mile good condition good condition player component speaker mount door car maintain clean good car solid body spot surface spot touch make car problem firm car average cost interested call email	train	rec.autos
post real disease disease question case active culture reduce hear work mechanism common minor common major evidence	train	sci.med
execute future criminal activity compare rate black white commit crime black commit crime perfectly fair system black represent note black white crime rate thing economic thing poor peop

In [None]:
# Create Model: an OCTIS LDA topic model configured for 20 topics.
# NOTE(review): alpha=0.1 is presumably the document-topic prior — confirm
# against the OCTIS LDA documentation.
model = LDA(num_topics=20, alpha=0.1)

In [None]:
# Fit the model on the dataset, relying on the default partitioning choice.
output = model.train_model(dataset)

# Show which result identifiers the training run produced, one per line.
# Unpacking the mapping directly iterates over its keys.
print(*output, sep="\n")



topic-word-matrix
topics
topic-document-matrix
test-topic-document-matrix


In [None]:
# Display the first five topics, each as a space-separated list of its words.
for topic_words in output['topics'][:5]:
    print(" ".join(topic_words))

water current internet problem class circuit make channel function case
year doctor patient time study work disease system low effect
image color program bit file format graphic datum software source
drive problem scsi card disk work monitor bus hard system
game team year play win season good goal player period


In [None]:
# Initialize metric: NPMI coherence ('c_npmi') computed against the dataset's
# corpus, with topk=10 (the number of top words per topic that is evaluated,
# per the OCTIS metric documentation).
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

In [None]:
# Initialize metric: topic diversity with topk=10 (the number of top words per
# topic that is evaluated, per the OCTIS metric documentation).
topic_diversity = TopicDiversity(topk=10)

In [None]:
# Score the trained model's output with both metrics and report the results.
topic_diversity_score = topic_diversity.score(output)
print(f"Topic diversity: {topic_diversity_score}")

npmi_score = npmi.score(output)
print(f"Coherence: {npmi_score}")

Topic diversity: 0.705
Coherence: 0.05326432178262006


In [None]:
# Build one coherence metric per supported measure. The corpus is fetched once
# and shared by all four text-based metrics instead of calling
# dataset.get_corpus() for each of them.
corpus_texts = dataset.get_corpus()

UMass_Coherence = Coherence(texts=corpus_texts, topk=10, measure='u_mass')
C_V_Coherence = Coherence(texts=corpus_texts, topk=10, measure='c_v')
UCI_Coherence = Coherence(texts=corpus_texts, topk=10, measure='c_uci')
NPMI_Coherence = Coherence(texts=corpus_texts, topk=10, measure='c_npmi')

# The word-embedding based variants take no corpus argument here.
Word_Embedding_based_Coherence_Pairwise = WECoherencePairwise(topk=10)
Word_Embedding_based_Coherence_Centroid = WECoherenceCentroid(topk=10)

In [None]:
# Score the trained model's output with every coherence metric. Each score is
# printed right after it is computed, so earlier results are still shown if a
# later metric fails.
print("Coherence Metrics :")

u_mass = UMass_Coherence.score(output)
print(f"UMass_Coherence : {u_mass}")

c_v = C_V_Coherence.score(output)
print(f"C_V_Coherence : {c_v}")

c_uci = UCI_Coherence.score(output)
print(f"UCI_Coherence : {c_uci}")

c_npmi = NPMI_Coherence.score(output)
print(f"NPMI_Coherence : {c_npmi}")

wemb_pairwise = Word_Embedding_based_Coherence_Pairwise.score(output)
print(f"Word_Embedding_based_Coherence_Pairwise : {wemb_pairwise}")

wemb_centroid = Word_Embedding_based_Coherence_Centroid.score(output)
print(f"Word_Embedding_based_Coherence_Centroid : {wemb_centroid}")

Coherence Metrics :
UMass_Coherence : -2.053830083072772
C_V_Coherence : 0.5475953170885669
UCI_Coherence : 0.09543590206576266
NPMI_Coherence : 0.05326432178262006
Word_Embedding_based_Coherence_Pairwise : 0.04038473970360226
Word_Embedding_based_Coherence_Centroid : 0.7592717415915142


In [None]:
# Score the trained model's output with every diversity metric and print each
# result under its label.
#
# The metric instance is bound to its own name (`metric`) rather than to the
# name of its class. Rebinding and then deleting `InvertedRBO` would remove the
# imported class from the namespace, so re-running this cell would fail with a
# NameError.
print("Diversity Metrics :")

# (printed label, metric class, constructor keyword arguments)
diversity_metric_specs = [
    ("Topic_Diversity", TopicDiversity, {"topk": 10}),
    ("InvertedRBO", InvertedRBO, {"topk": 10}),
    ("Word_Embedding_based_InvertedRBO_Matches", WordEmbeddingsInvertedRBO, {"topk": 10}),
    ("Word_Embedding_based_InvertedRBO_Centroid", WordEmbeddingsInvertedRBOCentroid, {"topk": 10}),
    ("Log_odds_ratio", LogOddsRatio, {}),
    ("Kullback_Liebler_Divergence", KLDivergence, {}),
]

for label, metric_class, metric_kwargs in diversity_metric_specs:
    # Build each metric only when it is needed, and score it immediately.
    metric = metric_class(**metric_kwargs)
    topic_diversity_score = metric.score(output)
    print(label + ": " + str(topic_diversity_score))
    # Drop the instance before the next one is built, so at most one metric
    # object is alive at a time (presumably to limit memory use — the
    # embedding-based metrics may hold large models; confirm).
    del metric

Diversity Metrics :
Topic_Diversity: 0.69
InvertedRBO: 0.9369660122553007
Word_Embedding_based_InvertedRBO_Matches: 0.42210528668987835
Word_Embedding_based_InvertedRBO_Centroid: 0.8439932910467731
Log_odds_ratio: 1.8694633969107615
Kullback_Liebler_Divergence: 1.6907708578988125
