# Set up

In [None]:
!pip install gitpython

In [None]:
import git

from git import Repo

import os

# Remote repository to fetch and the local directory it is cloned into.
git_url = 'https://github.com/AdhyaSuman/GINopic'
repo_dir = 'GINopic_local'

# Clone only when no checkout is present yet, so re-running this cell does not
# fail because the destination directory already contains the repository.
if not os.path.isdir(os.path.join(repo_dir, '.git')):
    Repo.clone_from(git_url, repo_dir)

In [None]:
# Change into the root directory of the cloned repo
cd GINopic_local

In [None]:
!pip install -e.

In [None]:
#Install DGL
!pip install  dgl -f https://data.dgl.ai/wheels/cu121/repo.html

In [None]:
!pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html

# Imports

In [None]:
from octis.dataset.dataset import Dataset

#Import models:
from octis.models.GINOPIC import GINOPIC

#Import coherence metrics:
from octis.evaluation_metrics.coherence_metrics import *

#Import topic diversity (TD) metrics:
from octis.evaluation_metrics.diversity_metrics import *

#Import classification metrics:
from octis.evaluation_metrics.classification_metrics import *

import random, torch

# Utils

In [None]:
# Folder that holds the locally stored (custom) datasets.
data_dir = './preprocessed_datasets'

def get_dataset(dataset_name):
    """Return a ``Dataset`` populated for one of the notebook's short names.

    Args:
        dataset_name: ``'20NG'`` or ``'BBC'`` (obtained by name through
            ``Dataset.fetch_dataset``), or ``'SO'``, ``'Bio'`` or
            ``'SearchSnippets'`` (read from ``data_dir + '/' + dataset_name``
            through ``Dataset.load_custom_dataset_from_folder``).

    Returns:
        The populated ``Dataset`` instance.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
            ``ValueError`` derives from ``Exception``, so callers that catch
            ``Exception`` keep working.
    """
    # Short name -> name passed to ``fetch_dataset``.
    fetched = {'20NG': '20NewsGroup', 'BBC': 'BBC_News'}
    # Short names that double as the sub-folder name inside ``data_dir``.
    local = ('SO', 'Bio', 'SearchSnippets')
    supported = tuple(fetched) + local

    # Validate first so a typo fails fast, before any dataset object is built.
    if dataset_name not in supported:
        raise ValueError(
            'Missing Dataset name...!!! Got {!r}; expected one of: {}.'.format(
                dataset_name, ', '.join(supported)))

    data = Dataset()
    if dataset_name in local:
        data.load_custom_dataset_from_folder(data_dir + '/' + dataset_name)
    else:
        data.fetch_dataset(fetched[dataset_name])
    return data

# Hyperparameter Tuning

**Skip this section if you do not want to tune the hyperparameters. The cells below are commented out; uncomment them to run the search.**

In [None]:
# #Define some parameters
# dataset_name = '20NG'
# K=20
# use_partitions=True
# use_validation=False

# #Get the dataset
# data = get_dataset(dataset_name)

# #Define the Model
# model = GINOPIC(num_topics=K,
#                 use_partitions=use_partitions,
#                 use_validation=use_validation,
#                 num_epochs=50,
#                 w2v_path='./w2v/{}_part{}_valid{}/'.format(dataset_name, use_partitions, use_validation),
#                 graph_path= './doc_graphs/{}_part{}_valid{}/'.format(dataset_name, use_partitions, use_validation))
        

# #Coherence Score:
# npmi = Coherence(texts=data.get_corpus(), topk=10, measure='c_npmi')

In [None]:
# #Define the Search Space
# from skopt.space.space import Real, Categorical, Integer

# search_space = {
#     "g_feat_size":    Categorical({64, 128, 256, 512, 768, 1024, 2048}), 
#     "num_gin_layers": Categorical({2, 3}), 
#     "num_mlp_layers": Categorical({1, 2, 3, 4, 5}),
#     "gin_hidden_dim": Categorical({50, 100, 200, 300}),
#     "gin_output_dim": Categorical({64, 128, 256, 512, 768, 1024, 2048}),
#     "eps_simGraph":   Categorical({.0, .05, .1, .2, .3, .4, .5}),
#                }

In [None]:
# #Define the Optimizer
# from octis.optimization.optimizer import Optimizer

# optimizer=Optimizer()
# optimization_result = optimizer.optimize(
#                         model,
#                         data,
#                         npmi,
#                         search_space,
#                         number_of_call=50, 
#                         model_runs=5,
#                         save_models=False, 
#                         early_stop=False,
#                         early_step=10,
#                         plot_best_seen=False,
#                         plot_model=False,
#                         save_path='./H_optimization/{}/K_{}/'.format(dataset_name, K))

In [None]:
# optimization_result.save_to_csv("./H_optimization/{}/K_{}/results.csv".format(dataset_name, K))

# Run Models

In [None]:
import os
from random import randint
from IPython.display import clear_output

# Random seed(s) for the runs below, drawn from [0, 2000]. The upper bound is
# written as an int because random.randint rejects float arguments such as
# 2e3 on Python 3.12+ (and warns about them on 3.10/3.11).
seeds = [randint(0, 2000) for _ in range(1)]

# Label stored in the 'Model' column of the results.
m = 'GINopic'

# Datasets to run, in execution order.
datasets = ['20NG', 'BBC', 'Bio', 'SO', 'SearchSnippets']

# Topic counts (K) evaluated for each dataset, aligned with ``datasets``.
n_topics = dict(zip(datasets, (
    [20, 50, 100],
    [5, 20, 50, 100],
    [20, 50, 100],
    [20, 50, 100],
    [8, 20, 50, 100],
)))

# Per-dataset model hyperparameters, written as one row per dataset.
# Column order of every row follows ``_param_names``.
_param_names = (
    'num_gin_layers',
    'g_feat_size',
    'num_mlp_layers',
    'gin_hidden_dim',
    'gin_output_dim',
    'eps_simGraph',
)
_param_rows = {
    '20NG':           (2, 2048, 1, 200, 768, 0.4),
    'BBC':            (3, 256, 1, 50, 512, 0.3),
    'Bio':            (2, 1024, 1, 200, 256, 0.05),
    'SO':             (2, 64, 1, 300, 512, 0.1),
    'SearchSnippets': (2, 1024, 1, 50, 256, 0.2),
}
params = {
    name: dict(zip(_param_names, row))
    for name, row in _param_rows.items()
}

# Column-oriented accumulator: one list per reported column, one entry per run.
results = {
    column: []
    for column in ('Dataset', 'K', 'Seed', 'Model', 'NPMI', 'CV', 'Accuracy')
}

# NOTE(review): ``irbo`` is created here but is not used by the loop below.
irbo = InvertedRBO(topk=10, weight=.95)

# Passed to the model as ``use_partitions`` / ``use_validation`` and embedded
# in the w2v / graph cache paths.
partition = True
validation = False

# Train and evaluate one model per (seed, dataset, K) combination, adding one
# entry to every column of ``results`` for each run.
for seed in seeds:
    for d in datasets:
        for k in n_topics[d]:
            data = get_dataset(d)

            # Show what has been collected so far; the cell output is cleared
            # at the end of every iteration.
            print('Results:-\n', results)

            print("-"*100)
            print('Dataset:{},\t Model:{},\t K={},\t Seed={}'.format(d, m, k, seed))
            print("-"*100)

            # Seed Python's and torch's random number generators for this run.
            random.seed(seed)
            torch.random.manual_seed(seed)

            model = GINOPIC(num_topics=k,
                 use_partitions=partition,
                 use_validation=validation,
                 num_epochs=50,
                 w2v_path='./w2v/{}_part{}_valid{}/'.format(d, partition, validation),
                 graph_path='./doc_graphs/{}_part{}_valid{}/'.format(d, partition, validation),
                 num_gin_layers=params[d]['num_gin_layers'],
                 g_feat_size=params[d]['g_feat_size'],
                 num_mlp_layers=params[d]['num_mlp_layers'],
                 gin_hidden_dim=params[d]['gin_hidden_dim'],
                 gin_output_dim=params[d]['gin_output_dim'],
                 eps_simGraph=params[d]['eps_simGraph']
                )

            output = model.train_model(dataset=data)

            # The trained model is no longer needed once ``output`` exists;
            # drop it and release cached GPU memory before scoring.
            del model
            torch.cuda.empty_cache()

            # Collect the whole row first and append it in one go, so every
            # column of ``results`` keeps the same length even if a metric
            # raises part-way through.
            row = {'Dataset': d, 'Model': m, 'K': k, 'Seed': seed}

            #Coherence Scores:
            for column, measure in (('NPMI', 'c_npmi'), ('CV', 'c_v')):
                coherence = Coherence(texts=data.get_corpus(), topk=10, measure=measure)
                row[column] = coherence.score(output)
                del coherence

            #Classification accuracy (only attempted when partitions are used):
            row['Accuracy'] = 0.0
            if partition:
                try:
                    accuracy = AccuracyScore(data)
                    row['Accuracy'] = accuracy.score(output)
                except Exception:
                    # Best effort: keep the 0.0 placeholder when scoring fails.
                    # ``Exception`` (not a bare ``except``) lets
                    # KeyboardInterrupt / SystemExit still stop the run.
                    pass

            for column, value in row.items():
                results[column].append(value)

            clear_output(wait=False)