In [None]:
import re
import json
import time
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

# Enable logging
import logging
logging.basicConfig(level= logging.INFO)
wordnet_lemmatizer= WordNetLemmatizer()

from tqdm import tqdm

import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary
from gensim.models import Phrases
from collections import Counter
from gensim.models import Word2Vec

# Load Data

In [None]:
# `top_applications_data_file_path` must hold the path to a file with one JSON
# object per line describing the most popular applications (package name,
# title, description and category).
# NOTE(review): the variable is not assigned anywhere in the visible notebook;
# define it in a configuration cell so Restart & Run All works.

top_applications_data = []

# JSON text is UTF-8 by specification; being explicit keeps decoding
# independent of the platform's default locale encoding.
with open(top_applications_data_file_path, encoding="utf-8") as inf:
    for line in inf:
        line = line.strip()
        if not line:
            # Tolerate blank/trailing lines instead of failing in json.loads("").
            continue
        top_applications_data.append(json.loads(line))

In [None]:
# Lower-cased category names that are collapsed into the single "Game" label
# (the membership test is done on `category.lower()`).
# NOTE(review): "game sports" is presumably spelled this way to keep it distinct
# from the non-game "Sports" category — confirm against the raw `ct` values.
game_categs = ["action", "adventure", "arcade", "board", "card", "casino", "casual",
               "educational", "music", "puzzle", "racing", "role playing", "simulation",
               "game sports", "strategy", "trivia", "word"]

In [None]:
def sentence_clean_up(description):
    """Normalise raw text into a space-separated string of alphabetic tokens.

    Steps, in order: lower-case, strip http(s) URLs, replace every character
    that is neither a word character nor whitespace with a space, tokenize
    with NLTK and keep only tokens for which ``str.isalpha()`` is true.

    Args:
        description: Raw text (str).

    Returns:
        The cleaned text as a single string with tokens joined by one space.
    """
    url_extract_pattern = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)"
    lowered = description.lower()
    without_urls = re.sub(url_extract_pattern, "", lowered, flags=re.MULTILINE)
    without_punctuation = re.sub(r'[^\w\s]', ' ', without_urls)
    return " ".join(
        token for token in word_tokenize(without_punctuation) if token.isalpha()
    )

In [None]:
# Flatten every application record into one row: category, cleaned text and
# package name. Only the first entry of the record's "i" list is used.
# NOTE(review): "ed" and "ean" are presumably the description and the title —
# confirm against the data schema.
data_for_dataframe = []
for top_app in top_applications_data:
    app_info = top_app["i"][0]
    category = app_info["ct"]
    # The package name contributes its dot-separated segments as extra words.
    pkn = ' '.join(top_app["p"].split("."))
    descriptive_fields = app_info["ed"] + " " + app_info["ean"] + " " + pkn

    # All game sub-categories are merged into one "Game" label.
    if category.lower() in game_categs:
        category = "Game"

    data_for_dataframe.append({
        "category": category,
        "description": sentence_clean_up(descriptive_fields),
        "package_name": top_app["p"],
    })

In [None]:
descrtiptions_df = pd.DataFrame(data_for_dataframe)

In [None]:
descrtiptions_df.head()

## Prepare descriptions for LDA

In [None]:
# these are to be excluded from the descriptions of the applications
# Word lists that are added to NLTK's English stop words and removed from the
# application descriptions. Duplicates are harmless: the merged list is
# de-duplicated afterwards. Multi-word entries ("each other", "no one", ...)
# can never equal a single token and are kept only for completeness.
english_articles = ["a", "an", "the"]
english_conjunctions = ["for", "and", "nor", "but", "or", "yet", "so"]
english_pronouns = ["i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
                    "what", "who", "me", "him", "her", "it", "us", "you", "them", "whom", "mine", 
                    "yours", "his", "hers", "ours", "theirs", "this", "that", "these", "those",
                    "who", "whom", "which", "what", "whose", "whoever", "whatever", "whichever",
                    "whomever", "who", "whom", "whose", "which", "that", "what", "whatever", 
                    "whoever", "whomever", "whichever", "myself", "yourself", "himself", "herself",
                    "itself", "ourselves", "themselves", "myself", "yourself", "himself", "herself",
                    "itself", "ourselves", "themselves", "each other", "one another", "anything", 
                    "everybody", "another", "each", "few", "many", "none", "some", "all", "any", "anybody",
                    "anyone", "everyone", "everything", "no one", "nobody", "nothing", "none", "other",
                    "others", "several", "somebody", "someone", "something", "most", "enough", "little",
                    "more", "both", "either", "neither", "one", "much", "such"]
english_prepositions = ["aboard", "about", "above", "across", "after", "against", "along", "amid", "among",
                        "anti", "around", "as", "at", "before", "behind", "below", "beneath", "beside", 
                        "besides", "between", "beyond", "but", "by", "concerning", "considering", "despite", 
                        "down", "during", "except", "excepting", "excluding", "following", "for", "from", "in",
                        "inside", "into", "like", "minus", "near", "of", "off", "on", "onto", "opposite",
                        "outside", "over", "past", "per", "plus", "regarding", "round", "save", "since",
                        "than", "through", "to", "toward", "towards", "under", "underneath", "unlike", "until",
                        "up", "upon", "versus", "via", "with", "within", "without"]
# The commas after "do" and "have" were missing, which made Python concatenate
# the adjacent literals into "dodid" and "havehad".
english_auxiliary_verbs = ["be", "am", "are", "is", "was", "were", "being", "can", "could", "do", "did", "does",
                           "doing", "have", "had", "has", "having", "may", "might", "must", "shall", "should", 
                           "will", "would"]
english_modal_verbs = ["can", "could", "may", "might", "shall", "should", "will", "would", "must"]
# Corpus-specific noise: platform/brand names, years and marketing filler.
noise_words = ["http", "https", "also", "online", "android", "app", "apps", "application", "applications",
                "google", "offline", "policy", "always", "facebook", "current", "premium", "latest", "unlimited",
                "term", "anywhere", "various", "exclusive", "code", "whether", "add", "instagram", "go", "apply",
                "second", "whatsapp", "twitter", "youtube", "2022", "already", "christmas", "tiktok", "2021", "2020",
                "le", "santa", "from", "free", "retweet"]


In [None]:
# Start from NLTK's English stop words and append every custom list, in the
# same order as before.
stop_words = stopwords.words('english')
for extra_words in (
    english_articles,
    english_pronouns,
    english_prepositions,
    english_conjunctions,
    english_auxiliary_verbs,
    english_modal_verbs,
    noise_words,
):
    stop_words.extend(extra_words)

In [None]:
# De-duplicate and keep the result as a set: the stop words are only ever used
# for membership tests (`word not in stop_words`), which is O(1) on a set but a
# linear scan on a list — and that test runs for every token of every document.
stop_words = set(stop_words)

In [None]:
def get_pos_tag(tag):
    """Translate a POS tag (as returned by ``nltk.pos_tag``) to a WordNet POS.

    Tags starting with ``V`` map to ``wordnet.VERB``, tags starting with ``R``
    to ``wordnet.ADV``; every other tag — including noun (``N*``) and
    adjective (``J*``) tags — maps to ``wordnet.NOUN``.

    NOTE(review): adjectives are lemmatized as nouns rather than with
    ``wordnet.ADJ``; this looks deliberate — confirm before changing, since
    the saved model depends on this preprocessing.
    """
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

In [None]:
def preprocess(text):
    """Tokenize, lemmatize and stop-word-filter one cleaned description.

    Tokens of three characters or fewer are dropped *before* POS tagging;
    the remaining tokens are lemmatized with the POS chosen by
    ``get_pos_tag`` and lemmas found in ``stop_words`` are removed.

    Args:
        text: Cleaned description string.

    Returns:
        List of lemma strings in their original order.
    """
    long_tokens = [token for token in nltk.word_tokenize(text) if len(token) > 3]
    lemmas = (
        wordnet_lemmatizer.lemmatize(token, pos=get_pos_tag(tag))
        for token, tag in nltk.pos_tag(long_tokens)
    )
    return [lemma for lemma in lemmas if lemma not in stop_words]

In [None]:
df_clean = descrtiptions_df['description'].apply(preprocess)

In [None]:
df_clean.to_frame().head()

### Keep all PoS / Keep Only Nouns & Adjectives

In [None]:
def get_specific_pos(series):
    """POS-tag a token list and return the tokens to keep.

    In its current form no tag filter is applied, so every token is
    returned ("keep all PoS"); the tags are computed but not used.

    Args:
        series: List of tokens for one document.

    Returns:
        List with the same tokens in the same order.
    """
    tagged_tokens = nltk.pos_tag(series)
    kept_words = []
    for word, _tag in tagged_tokens:
        kept_words.append(word)
    return kept_words

In [None]:
pos_df = df_clean.apply(get_specific_pos)

In [None]:
pos_df.to_frame()

# Create Bigrams


In [None]:
# One token list per document, in DataFrame order.
docs = pos_df.tolist()
# Collocation statistics: a pair must occur at least 10 times and score above
# 20 to be merged into a single "a_b" token.
phrases = Phrases(docs, min_count=10, threshold=20)
# Frozen, lighter-weight version of the phrase model used for transformation.
bigram_model = gensim.models.phrases.Phraser(phrases)

In [None]:
def make_bigrams(texts):
    """Apply the fitted ``bigram_model`` to every tokenized document.

    Args:
        texts: Iterable of token lists.

    Returns:
        List with one transformed token list per input document.
    """
    bigrammed_docs = []
    for doc in texts:
        bigrammed_docs.append(bigram_model[doc])
    return bigrammed_docs

In [None]:
data_words_bigrams = make_bigrams(docs)

In [None]:
data_words_bigrams[0]

In [None]:
# Collect the raw counts of all vocabulary entries that contain the "_"
# delimiter (i.e. candidate bigrams) and show the 20 most frequent ones.
bigram_counter = Counter()
for key, count in phrases.vocab.items():
    if key in stop_words:
        continue
    if len(str(key).split('_')) > 1:
        bigram_counter[key] += count

for key, counts in bigram_counter.most_common(20):
    print(key,">>>>", counts)

# Create a dictionary and corpus for input to our LDA model. Filter out the most common and uncommon words.


In [None]:
dictionary = Dictionary(data_words_bigrams)

# Drop tokens that occur in fewer than 100 documents or in more than 50% of
# all documents, then encode every document as a bag of words.
dictionary.filter_extremes(no_below=100, no_above=0.5)
corpus = list(map(dictionary.doc2bow, data_words_bigrams))

print(f'Number of unique tokens: {len(dictionary)}')
print(f'Number of documents: {len(corpus)}')

## Load the Best Model from Disk / Train

In [None]:
from gensim.models.ldamulticore import LdaModel

In [None]:
# Location of the saved LDA model, relative to the working directory.
dir_path = "best_ldamodel"
model_name = "/best_ldamodel.gensim"
full_path = "".join((dir_path, model_name))

In [None]:
best_ldamodel = LdaModel.load(full_path)

In [None]:
# Hyper-parameters of the training run. Only `num_topics` is live; the
# commented-out values are needed again if the training cell is re-enabled.
# NOTE(review): `num_topics` must equal the topic count of the model loaded
# from disk — confirm against `best_ldamodel.num_topics`.
# workers = 250
num_topics = 33
# alpha = 50/33
# beta = 0.1
# # 200/len(dictionary)
# passes = 150

In [None]:
# np.random.seed(1) # use this seed to obtain the same results

# best_ldamodel = LdaMulticore(
#                                 corpus, 
#                                 id2word=dictionary, 
#                                 num_topics=num_topics, 
#                                 chunksize=3000, 
#                                 batch=True,
#                                 iterations=350,
#                                 passes=passes,
#                                 workers=workers,
#                                 alpha=alpha,
#                                 eta=beta
#                             ) 

In [None]:
best_ldamodel.log_perplexity(corpus)

In [None]:
from gensim.models import CoherenceModel

# c_v topic coherence of the loaded model, measured on the bigrammed documents.
coherence_model_lda = CoherenceModel(
    model=best_ldamodel,
    texts=data_words_bigrams,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

In [None]:
best_ldamodel.show_topics(num_words=25, formatted=True, num_topics=num_topics)

In [None]:
lda_corpus = best_ldamodel[corpus]

In [None]:
# Sparse per-document topic distributions; topics below gensim's
# minimum-probability cut-off are omitted from each document.
all_topics = best_ldamodel.get_document_topics(corpus)
num_docs = len(all_topics)

# Fix the topic axis to the model's topic count. Without `num_terms`,
# corpus2csc infers it from the highest topic id that happens to appear, so
# the matrix width would depend on the data instead of on the model.
all_topics_csr = gensim.matutils.corpus2csc(all_topics, num_terms=best_ldamodel.num_topics)
# Shape after transpose: (documents, topics).
all_topics_numpy = all_topics_csr.T.toarray()

# Dominant topic = column with the highest probability in each row.
major_topic = all_topics_numpy.argmax(axis=1).tolist()
descrtiptions_df['major_lda_topic'] = major_topic

In [None]:
# Bar chart: number of applications per dominant topic.
sns.set(rc={'figure.figsize': (10, 5)})
sns.set_style('darkgrid')

descrtiptions_df['major_lda_topic'].value_counts().plot(kind='bar')

In [None]:
descrtiptions_df.head()

## Compute topic proportion for each class 


In [None]:
# Store the tokenized documents next to the raw descriptions.
# NOTE(review): `docs` holds the tokens from *before* the bigram model was
# applied, whereas `corpus` was built from `data_words_bigrams`; confirm that
# the un-bigrammed tokens are intended for the per-category inference below.
descrtiptions_df["clean_description"] = docs

In [None]:
descrtiptions_df.head()

In [None]:
# For every category collect, per topic id, the list of that topic's
# probability in each of the category's documents:
#   {category: {"nr_of_docs": int, "topics_distr": {topic_id: [p_doc1, ...]}}}
topics_distributions_per_category = {}
# Take the topic count from the model instead of hard-coding 33.
model_topic_ids = range(best_ldamodel.num_topics)

for index, row in descrtiptions_df.iterrows():
    category = row["category"]
    if category.lower() in game_categs:
        category = "Game"

    descr = best_ldamodel.id2word.doc2bow(row["clean_description"])
    # Look probabilities up by topic id rather than by list position: gensim
    # may still omit topics with negligible probability even when
    # minimum_probability=0 is requested; those are recorded as 0.0.
    topic_probs = dict(best_ldamodel.get_document_topics(descr, minimum_probability=0))

    if category not in topics_distributions_per_category:
        topics_distributions_per_category[category] = {
            "nr_of_docs": 0,
            "topics_distr": {topic_id: [] for topic_id in model_topic_ids},
        }

    category_info = topics_distributions_per_category[category]
    category_info["nr_of_docs"] += 1
    for topic_id in model_topic_ids:
        category_info["topics_distr"][topic_id].append(topic_probs.get(topic_id, 0.0))

In [None]:
import statistics

# Harmonic mean of each topic's per-document probabilities within a category:
#   {category: {topic_id: harmonic_mean}}
# Iterating over the stored topic ids removes the hard-coded topic count and
# the unused document counter.
harmonic_mean_per_category_per_topic = {
    category: {
        topic_id: statistics.harmonic_mean(probabilities)
        for topic_id, probabilities in info["topics_distr"].items()
    }
    for category, info in topics_distributions_per_category.items()
}

## Compute clusters quality
### Categories Definitions based Approach

In [None]:
# Short textual definition of every category label used in this notebook,
# keyed by category name. The definitions are run through the same clean-up
# and preprocessing as the application descriptions and then fed to the LDA
# model to find each category's dominant topic.
# NOTE(review): the wording presumably comes from the app store's category
# documentation — record the source and retrieval date.
categories_definitions = {
    "Art & Design": "Sketchbooks, painter tools, art and design tools, coloring books",
    "Auto & Vehicles": "Auto shopping, auto insurance, auto price comparison, road safety, auto reviews and news",
    "Beauty": "Makeup tutorials, makeover tools, hair styling, beauty shopping, makeup simulators",
    "Books & Reference": "Book readers, reference books, textbooks, dictionaries, thesaurus, wikis",
    "Business": "Document editor/reader, package tracking, remote desktop, email management, job search",
    "Comics": "Comic players, comic titles",
    "Communication": "Messaging, chat/IM, dialers, address books, browsers, call management",
    "Dating": "Matchmaking, courtship, relationship building, meeting new people, finding love",
    "Education": "Exam preparations, study-aids, vocabulary, educational games, language learning",
    "Entertainment": "Streaming video, movies, TV, interactive entertainment",
    "Events": "Concert tickets, sporting event tickets, ticket resales, movie tickets",
    "Finance": "Banking, payment, ATM finders, financial news, insurance, taxes, portfolio/trading, tip calculators",
    "Food & Drink": "Recipes, restaurants, food guides, wine tasting and discovery, beverage recipes",
    "Health & Fitness": "Personal fitness, workout tracking, diet and nutritional tips, health and safety",
    "House & Home": "House and apartment search, home improvement, interior decoration, mortgages, real estate",
    "Libraries & Demo": "Software libraries, technical demos",
    "Lifestyle": "Style guides, wedding and party planning, how-to guides",
    "Maps & Navigation": "Navigation tools, GPS, mapping, transit tools, public transportation",
    "Medical": "Drug and clinical references, calculators, handbooks for healthcare providers, medical journals and news",
    "Music & Audio": "Music services, radios, music players",
    "News & Magazines": "Newspapers, news aggregators, magazines, blogging",
    "Parenting": "Pregnancy, infant care and monitoring, childcare",
    "Personalization": "Wallpapers, live wallpapers, home screen, lock screen, ringtones",
    "Photography": "Cameras, photo editing tools, photo management, and sharing",
    "Productivity": "Notepad, to-do list, keyboard, printing, calendar, backup, calculator, conversion",
    "Shopping": "Online shopping, auctions, coupons, price comparison, grocery lists, product reviews",
    "Social": "Social networking, check-in",
    "Sports": "Sports news and commentary, score tracking, fantasy team management, game coverage",
    "Tools": "Tools for Android devices",
    "Travel & Local": "Trip booking tools, ride-sharing, taxis, city guides, local business information, trip management tools, tour booking",
    "Video Players & Editors": "Video players, video editors, media storage",
    "Weather": "Weather reports",
    "Game": "Action, adventure, arcade, board, card, casino, casual, educational, music, puzzle, racing, role playing, simulation, sports, strategy, trivia, word"
}

In [None]:
# One row per category: its name and its cleaned-up definition text.
categories_data_for_dataframe = [
    {"category": category, "description": sentence_clean_up(categ_def)}
    for category, categ_def in categories_definitions.items()
]

categ_defs_df = pd.DataFrame(categories_data_for_dataframe)
categ_defs_df.head()

In [None]:
categs_df_clean = categ_defs_df['description'].apply(preprocess)

In [None]:
categs_df_clean.to_frame().head()

In [None]:
# Token lists of the category definitions, in `categ_defs_df` order.
categs_docs = list(categs_df_clean)
# NOTE(review): `categs_phrases` is not used anywhere later in the visible
# notebook, and `categs_docs` is never passed through `bigram_model`, so the
# definitions are scored without the bigram tokens the model was trained on —
# confirm whether that is intended.
categs_phrases = gensim.models.Phrases(categs_docs, min_count=10, threshold=20)

In [None]:
# Encode the category definitions with the dictionary built from the
# application descriptions (tokens unknown to it are ignored by doc2bow).
categs_corpus = list(map(dictionary.doc2bow, categs_docs))

print(f'Number of unique tokens: {len(dictionary)}')
print(f'Number of documents: {len(categs_corpus)}')

In [None]:
best_ldamodel.log_perplexity(categs_corpus)

In [None]:
categs_lda_corpus = best_ldamodel[categs_corpus]

In [None]:
categs_all_topics = best_ldamodel.get_document_topics(categs_corpus)
categs_num_docs = len(categs_all_topics)

In [None]:
# Dense (categories x topics) probability matrix. `num_terms` pins the topic
# axis to the model's topic count; otherwise corpus2csc infers it from the
# highest topic id present, which is fragile with so few documents.
categs_all_topics_csr = gensim.matutils.corpus2csc(
    categs_all_topics, num_terms=best_ldamodel.num_topics
)
categs_all_topics_numpy = categs_all_topics_csr.T.toarray()

In [None]:
# Dominant topic of every category definition (index of the largest value in
# each row), stored alongside the definitions.
categs_major_topic = list(map(np.argmax, categs_all_topics_numpy))
categ_defs_df['major_lda_topic'] = categs_major_topic

In [None]:
# Print, for every category definition, its second-highest topic probability
# and the topic index/indices holding that value.
# NOTE(review): `second_categs_major_topic` is created but never filled — the
# results are only printed.
second_categs_major_topic = []
for category_no, arr in enumerate(categs_all_topics_numpy):
    second = np.sort(arr)[-2] 
    # np.where returns a tuple of index arrays; ties yield several indices.
    i = np.where(arr == second)
    print(category_no, second, i)

In [None]:
sns.set(rc={'figure.figsize': (10, 5)})
sns.set_style('darkgrid')

# Plot how many category definitions fall into each dominant topic.
categories_per_topic = categ_defs_df.major_lda_topic.value_counts()
ax = categories_per_topic.plot(
    kind='bar',
    title='S1.1: Distribution of Categories Against the Topics',
)
ax.set_xlabel("Topic ID Number")
ax.set_ylabel("Number of Categories")

In [None]:
# Topics that are not the dominant topic of any category definition.
# Uses `num_topics` rather than a hard-coded 33 so it follows the model
# configuration, like the similarity loop further down.
topics_ids = set(range(num_topics))
missing_ids = topics_ids - set(categs_major_topic)

In [None]:
missing_ids

In [None]:
len(missing_ids)

In [None]:
# Invert the category -> dominant-topic assignment:
#   {topic_id: [categories whose definition is dominated by that topic]}
topic_to_categ = {}
for index, row in categ_defs_df.iterrows():
    topic_to_categ.setdefault(row['major_lda_topic'], []).append(row['category'])

In [None]:
# Rebuild the mapping with ascending topic ids for readable display.
keys = sorted(topic_to_categ)
topic_to_categ = {topic_id: topic_to_categ[topic_id] for topic_id in keys}
topic_to_categ

In [None]:
# Number of topics that map to exactly one category.
nr = sum(1 for categs in topic_to_categ.values() if len(categs) == 1)
nr

In [None]:
# Histogram: {number of categories sharing a topic: number of such topics}.
length_count = {}
for categs in topic_to_categ.values():
    group_size = len(categs)
    length_count[group_size] = length_count.get(group_size, 0) + 1

In [None]:
length_count

## Evaluate Classification Abilities of LDA

### Label topics without a category by using Cosine Similarity

In [None]:
# Start again from the definition-based assignment:
#   {topic_id: [categories whose definition is dominated by that topic]}
topic_to_categ = {}
for index, row in categ_defs_df.iterrows():
    topic = row['major_lda_topic']
    topic_to_categ[topic] = topic_to_categ.get(topic, []) + [row['category']]

In [None]:
# Order the mapping by ascending topic id.
keys = sorted(topic_to_categ.keys())
topic_to_categ = {topic_id: topic_to_categ[topic_id] for topic_id in keys}
topic_to_categ

In [None]:
# For every topic without a category, find the labelled topic whose top-25
# term distribution is most cosine-similar. Result:
#   {unlabelled_topic: {"max_sim": similarity, "topic": closest_labelled_topic}}
# If no labelled topic exists, "topic" stays -1.
cosine_sim_preds = dict()
labelled_topic_ids = [
    topic_id for topic_id in range(num_topics) if topic_id not in missing_ids
]
for compute_similarity_for in missing_ids:
    to_get = best_ldamodel.get_topic_terms(compute_similarity_for, topn=25)
    max_sim, max_sim_topic = -1, -1
    for candidate in labelled_topic_ids:
        compared_with = best_ldamodel.get_topic_terms(candidate, topn=25)
        similarity = gensim.matutils.cossim(to_get, compared_with)
        # Strict ">" keeps the first (lowest-id) candidate on ties.
        if similarity > max_sim:
            max_sim, max_sim_topic = similarity, candidate
    cosine_sim_preds[compute_similarity_for] = {
        "max_sim": max_sim,
        "topic": max_sim_topic,
    }

In [None]:
cosine_sim_preds

In [None]:
# Give every unlabelled topic the category list of its most similar labelled
# topic (the list object is shared, not copied).
for topic_id in missing_ids:
    closest_topic = cosine_sim_preds[topic_id]["topic"]
    topic_to_categ[topic_id] = topic_to_categ[closest_topic]

In [None]:
# Re-sort by topic id now that the missing topics were added.
keys = sorted(topic_to_categ)
topic_to_categ = {topic_id: topic_to_categ[topic_id] for topic_id in keys}
topic_to_categ

In [None]:
descrtiptions_df.head()

In [None]:
# Per package: "c" = its category label, "t" = the categories attached to its
# dominant topic. Only the first row of a repeated package name is kept.
package_names_topics_and_category = {}
for index, row in descrtiptions_df.iterrows():
    package_name = row["package_name"]
    if package_name in package_names_topics_and_category:
        continue
    package_names_topics_and_category[package_name] = {
        "c": row['category'],
        "t": topic_to_categ[row["major_lda_topic"]],
    }

In [None]:
topic_to_categ

In [None]:
# Category names in alphabetical order (dict keys are already unique).
sorted_categs = sorted(categories_definitions)

In [None]:
# Integer label for every category, following the alphabetical order.
numeric_labels_for_categories = {
    category: label for label, category in enumerate(sorted_categs)
}

In [None]:
numeric_labels_for_categories

In [None]:
# Build parallel lists of numeric true labels (`initial`) and predicted labels
# (`predicted_topic`), one entry per package.
# NOTE(review): when a topic carries several categories, the true category is
# used to pick among them, which favours the prediction — confirm this is the
# intended evaluation protocol.
initial = []
predicted_topic = []

for pkn, info in package_names_topics_and_category.items():
    true_category = "Game" if info["c"].lower() in game_categs else info["c"]
    # Predict the true category if the topic allows it, else the topic's first.
    guessed_category = info["c"] if info["c"] in info["t"] else info["t"][0]

    decided_category_label = numeric_labels_for_categories[true_category]
    predicted_topic_label = numeric_labels_for_categories[guessed_category]

    initial.append(decided_category_label)
    predicted_topic.append(predicted_topic_label)

In [None]:
from sklearn import metrics

In [None]:
# Overall quality of the cosine-similarity labelling (support-weighted averages).
cossim_accuracy = metrics.accuracy_score(initial, predicted_topic)
cossim_precision = metrics.precision_score(initial, predicted_topic, average='weighted')
cossim_recall = metrics.recall_score(initial, predicted_topic, average='weighted')
cossim_f1_score = metrics.f1_score(initial, predicted_topic, average='weighted')
# Per-class breakdown: average=None returns one precision/recall/F1/support
# value per label. With average="weighted" this variable only repeated the
# aggregate numbers above and carried no per-class information.
cossim_metrics_for_each_class = metrics.precision_recall_fscore_support(initial, predicted_topic, average=None)

In [None]:
cossim_accuracy

In [None]:
cossim_precision

In [None]:
cossim_recall

In [None]:
cossim_f1_score

In [None]:
cossim_metrics_for_each_class

In [None]:
# Same pairing as the numeric version, but keeping the category *names* so the
# correct predictions can be counted per category.
initial = []
predicted_topic = []

for pkn, info in package_names_topics_and_category.items():
    decided_category_label = "Game" if info["c"].lower() in game_categs else info["c"]
    # Predict the true category if the topic allows it, else the topic's first.
    predicted_topic_label = info["c"] if info["c"] in info["t"] else info["t"][0]

    initial.append(decided_category_label)
    predicted_topic.append(predicted_topic_label)

In [None]:
# Category names of all correctly predicted packages, counted per category.
correct = [
    truth for truth, guess in zip(initial, predicted_topic) if truth == guess
]

pd.Series(correct).to_frame().value_counts()

### Label topics without a category based on human interpretation

In [None]:
# Reset to the definition-based assignment before applying manual labels:
#   {topic_id: [categories whose definition is dominated by that topic]}
topic_to_categ = {}
for index, row in categ_defs_df.iterrows():
    categories_for_topic = topic_to_categ.setdefault(row['major_lda_topic'], [])
    categories_for_topic.append(row['category'])

In [None]:
# sort topic_to_categ
keys = sorted(topic_to_categ)
topic_to_categ = {topic_id: topic_to_categ[topic_id] for topic_id in keys}
topic_to_categ

In [None]:
missing_ids

In [None]:
# Manually assigned categories for topics that no category definition maps to.
# NOTE(review): these topic ids are presumably the `missing_ids` of the
# specific saved model, labelled by reading its top words; they must be
# revisited whenever the model is retrained — confirm they match `missing_ids`.
topic_to_categ[3] = ["Tools"]
topic_to_categ[7] = ["Art & Design"]
topic_to_categ[8] = ["Game"] 
topic_to_categ[9] = ["Education"] 

topic_to_categ[15] = ["Food & Drink"]
topic_to_categ[17] = ["Game"]
topic_to_categ[19] = ["Entertainment"] 

topic_to_categ[20] = ["Tools"] 
topic_to_categ[23] = ["Weather"]
topic_to_categ[24] = ["Lifestyle"]
topic_to_categ[25] = ["Game"]

topic_to_categ[30] = ["Lifestyle"]

In [None]:
# sort topic_to_categ
keys = sorted(topic_to_categ.keys())
topic_to_categ = {topic_id: topic_to_categ[topic_id] for topic_id in keys}
topic_to_categ

In [None]:
# Per package: "c" = its category label, "t" = the categories attached to its
# dominant topic (now including the manual labels). First occurrence wins.
package_names_topics_and_category = {}
for index, row in descrtiptions_df.iterrows():
    package_name = row["package_name"]
    if package_name in package_names_topics_and_category:
        continue
    package_names_topics_and_category[package_name] = {
        "c": row['category'],
        "t": topic_to_categ[row["major_lda_topic"]],
    }

In [None]:
# Category names in alphabetical order (dict keys are already unique).
sorted_categs = sorted(categories_definitions.keys())

In [None]:
# Integer label for every category, following the alphabetical order.
numeric_labels_for_categories = dict(
    (category, label) for label, category in enumerate(sorted_categs)
)

In [None]:
# Numeric true labels (`initial`) and predicted labels (`predicted_topic`) for
# the human-interpretation labelling, one entry per package.
# NOTE(review): the true category is used to choose among a topic's candidate
# categories, which favours the prediction — confirm this is intended.
initial = []
predicted_topic = []

for pkn, info in package_names_topics_and_category.items():
    true_category = "Game" if info["c"].lower() in game_categs else info["c"]
    # Predict the true category if the topic allows it, else the topic's first.
    guessed_category = info["c"] if info["c"] in info["t"] else info["t"][0]

    decided_category_label = numeric_labels_for_categories[true_category]
    predicted_topic_label = numeric_labels_for_categories[guessed_category]

    initial.append(decided_category_label)
    predicted_topic.append(predicted_topic_label)

In [None]:
from sklearn import metrics

In [None]:
# Overall quality of the human-interpretation labelling (support-weighted averages).
hi_accuracy = metrics.accuracy_score(initial, predicted_topic)
hi_precision = metrics.precision_score(initial, predicted_topic, average='weighted')
hi_recall = metrics.recall_score(initial, predicted_topic, average='weighted')
hi_f1_score = metrics.f1_score(initial, predicted_topic, average='weighted')
# Per-class breakdown: average=None returns one precision/recall/F1/support
# value per label. With average="weighted" this variable only repeated the
# aggregate numbers above and carried no per-class information.
hi_metrics_for_each_class = metrics.precision_recall_fscore_support(initial, predicted_topic, average=None)

In [None]:
hi_accuracy

In [None]:
hi_precision

In [None]:
hi_recall

In [None]:
hi_f1_score

In [None]:
hi_metrics_for_each_class

In [None]:
# Second most probable topic of every application, plus its probability.
# If several topics share that probability the topic is recorded as -1.
second_apps_major_topic = []
second_apps_major_topic_values = []

for arr in all_topics_numpy:
    second = np.sort(arr)[-2]  # second-highest value in the row
    (matching_topic_ids,) = np.where(arr == second)
    second_maj_topic = -1 if len(matching_topic_ids) > 1 else matching_topic_ids[0]

    second_apps_major_topic.append(second_maj_topic)
    second_apps_major_topic_values.append(second)

In [None]:
# Third most probable topic of every application, plus its probability.
# If several topics share that probability the topic is recorded as -1.
third_apps_major_topic = []
third_apps_major_topic_values = []

for arr in all_topics_numpy:
    third = np.sort(arr)[-3]  # third-highest value in the row
    (matching_topic_ids,) = np.where(arr == third)
    third_maj_topic = -1 if len(matching_topic_ids) > 1 else matching_topic_ids[0]

    third_apps_major_topic.append(third_maj_topic)
    third_apps_major_topic_values.append(third)

In [None]:
descrtiptions_df['second_major_lda_topic'] = second_apps_major_topic
descrtiptions_df['third_major_lda_topic'] = third_apps_major_topic

In [None]:
# Count the applications whose `categs_label` equals their second / third
# most probable topic.
# NOTE(review): the "categs_label" column is not created anywhere in the
# visible notebook, so this cell raises KeyError on a fresh kernel; it is
# presumably the topic id assigned to the app's category — confirm and add the
# cell that builds it.
with_correct_second_topic = 0
with_correct_third_topic = 0
for index, row in descrtiptions_df.iterrows():
    if row["categs_label"] == row["second_major_lda_topic"]:
        with_correct_second_topic += 1
    if row["categs_label"] == row["third_major_lda_topic"]:
        with_correct_third_topic += 1

In [None]:
with_correct_second_topic

In [None]:
with_correct_third_topic

In [None]:
# Per category, count the applications whose `categs_label` equals their
# second / third most probable topic. Categories without a hit get no entry.
# NOTE(review): the "categs_label" column is not created anywhere in the
# visible notebook — confirm where it comes from.
with_correct_second_topic_per_categ = {}
with_correct_third_topic_per_categ = {}

for index, row in descrtiptions_df.iterrows():

    categ = row["category"]
    # `game_categs` holds lower-cased names; compare case-insensitively like
    # every other game check in this notebook.
    if categ.lower() in game_categs:
        categ = "Game"

    if row["categs_label"] == row["second_major_lda_topic"]:
        with_correct_second_topic_per_categ[categ] = (
            with_correct_second_topic_per_categ.get(categ, 0) + 1
        )

    if row["categs_label"] == row["third_major_lda_topic"]:
        with_correct_third_topic_per_categ[categ] = (
            with_correct_third_topic_per_categ.get(categ, 0) + 1
        )
        

# Topics clusters visualization

## pyLDAVis

In [None]:
# pyLDAvis renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models` (3.3.0); the two modules never exist in the same
# release, so importing both unconditionally always fails. Import whichever
# is available under one alias.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:  # older pyLDAvis releases
    import pyLDAvis.gensim as gensimvis
import gensim

In [None]:
# Use whichever gensim adapter the installed pyLDAvis provides: it is
# `gensim_models` from 3.3.0 on and `gensim` before that.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:  # older pyLDAvis releases
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
# sort_topics=False keeps pyLDAvis' topic numbering aligned with the model's ids.
vis = gensimvis.prepare(best_ldamodel, corpus, dictionary=best_ldamodel.id2word, sort_topics=False)
vis
vis