# Download the repo from https://github.com/AdhyaSuman/CTMKD and run the setup file to install OCTIS

In [None]:
cd ..

Install OCTIS in editable mode by running its setup.py through pip

In [None]:
!pip install -e.

In [None]:
from octis.dataset.dataset import Dataset

#import models
from octis.models.LDA import LDA
from octis.models.ETM import ETM
from octis.models.ProdLDA import ProdLDA
from octis.models.NeuralLDA import NeuralLDA
from octis.models.CTM import CTM
from octis.models.CTMKD import CTMKD

#Import coherence metrics:
from octis.evaluation_metrics.coherence_metrics import *

import random, torch

# Utils

In [None]:
# Root folder that holds the locally preprocessed (custom) datasets.
data_dir = './preprocessed_datasets'


def get_dataset(dataset_name):
    """Return a populated ``Dataset`` for the given short dataset name.

    '20NG' is obtained through ``Dataset.fetch_dataset("20NewsGroup")`` and
    'M10' is read from the ``M10`` sub-folder of ``data_dir``. Any other
    name raises ``Exception``.
    """
    corpus = Dataset()

    if dataset_name == '20NG':
        corpus.fetch_dataset("20NewsGroup")
        return corpus

    if dataset_name == 'M10':
        corpus.load_custom_dataset_from_folder(data_dir + "/M10")
        return corpus

    raise Exception('Missing Dataset name...!!!')



def get_model(model_name, num_topics, dataset_name, ctm_T=None, num_epochs=100,
              KD_epochs=100, num_layers=1, alpha=None, temp=None, seed=None):
    """Construct the (untrained) topic model identified by ``model_name``.

    Args:
        model_name: One of 'LDA', 'ETM', 'NeuralLDA', 'ProdLDA', 'Student',
            'CTM_KD_2wd_woPartition' or 'CTM_Teacher'.
        num_topics: Number of topics; forwarded to every model.
        dataset_name: Used only to build the './bert/<bert model>_<dataset>'
            paths handed to the CTM-based models.
        ctm_T: Forwarded as ``teacher`` to the two CTMKD configurations.
        num_epochs: Forwarded to 'Student', 'CTM_KD_2wd_woPartition' and
            'CTM_Teacher'.
        KD_epochs: Forwarded to 'CTM_KD_2wd_woPartition' only.
        num_layers: Forwarded to 'CTM_Teacher' only.
        alpha: Forwarded to 'CTM_KD_2wd_woPartition' only.
        temp: Forwarded to 'CTM_KD_2wd_woPartition' only.
        seed: Forwarded to LDA as ``random_state``; no other model receives it.

    Returns:
        The constructed model instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
            (An unknown name used to fall through every branch and fail on
            ``return model`` with an UnboundLocalError.)
    """
    T_BERT = "paraphrase-distilroberta-base-v2"
    S_BERT = "all-MiniLM-L6-v2"

    # Both CTM-based families read their BERT data from the same per-dataset
    # locations, so the paths are built once here.
    student_bert_path = './bert/{}_{}'.format(S_BERT, dataset_name)
    teacher_bert_path = './bert/{}_{}'.format(T_BERT, dataset_name)

    if model_name == 'LDA':
        model = LDA(num_topics=num_topics, random_state=seed, passes=5, use_partitions=False)

    elif model_name == 'ETM':
        model = ETM(num_topics=num_topics, train_embeddings=True, use_partitions=False)

    elif model_name == 'NeuralLDA':
        model = NeuralLDA(num_topics=num_topics, use_partitions=False)

    elif model_name == 'ProdLDA':
        model = ProdLDA(num_topics=num_topics, use_partitions=False)

    elif model_name == 'Student':
        model = CTM(num_topics=num_topics,
                    hidden_sizes=None,
                    dropout=0.2,
                    bert_path=student_bert_path,
                    bert_model=S_BERT,
                    num_epochs=num_epochs,
                    inference_type="zeroshot",
                    use_partitions=False, use_validation=False
                    )

    elif model_name == 'CTM_KD_2wd_woPartition':
        model = CTMKD(num_topics=num_topics,
                      inference_type="zeroshot",
                      dropout=0.2,
                      num_epochs=num_epochs,
                      pre_processing_type='KD',
                      hidden_sizes=None,
                      student_bert_path=student_bert_path,
                      teacher_bert_path=teacher_bert_path,
                      student_bert_model=S_BERT,
                      teacher_bert_model=T_BERT,
                      teacher=ctm_T,
                      use_partitions=False, use_validation=False,
                      KD_epochs=KD_epochs,
                      alpha=alpha,
                      temp=temp,
                      KD_loss_type='2wd',
                      use_mean_logvar_recon_kd=True
                      )

    elif model_name == 'CTM_Teacher':
        model = CTMKD(num_topics=num_topics,
                      inference_type="combined",
                      num_epochs=num_epochs,
                      dropout=0.2,
                      pre_processing_type='Teacher_only',
                      num_neurons=100,
                      num_layers=num_layers,
                      student_bert_path=student_bert_path,
                      teacher_bert_path=teacher_bert_path,
                      student_bert_model=S_BERT,
                      teacher_bert_model=T_BERT,
                      teacher=ctm_T,
                      use_partitions=False, use_validation=False
                      )

    else:
        supported = ('LDA', 'ETM', 'NeuralLDA', 'ProdLDA', 'Student',
                     'CTM_KD_2wd_woPartition', 'CTM_Teacher')
        raise ValueError('Unknown model name {!r}; expected one of: {}'.format(
            model_name, ', '.join(supported)))

    return model

# Run Teacher

In [None]:
from os import listdir

# Experiment grid for the teacher run: every (dataset, model, topic count)
# combination below is trained once.
datasets = ['20NG']
models = ['CTM_Teacher']
n_topics = [20, 50, 100]

# Per-dataset, per-topic-count value passed as `num_layers` to get_model.
T_Layer = {
    '20NG': {20: 1, 50: 1, 100: 5},
    'M10': {10: 4, 20: 5, 50: 2, 100: 3},
}

# Column-oriented result table: one list per column, one entry per run.
results = {column: [] for column in ('Dataset', 'K', 'Model', 'NPMI', 'CV')}


# Train one teacher per (dataset, model, topic count), saving it under
# ./Teacher_models/<dataset>/, and record NPMI and C_V coherence of the output.
banner = "-"*100
footer = '#'*100

for d in datasets:
    data = get_dataset(d)

    # One scorer per coherence measure, each built from the dataset corpus.
    npmi = Coherence(texts=data.get_corpus(), measure='c_npmi')
    cv = Coherence(texts=data.get_corpus(), measure='c_v')

    teacher_dir = './Teacher_models/{}/'.format(d)

    for m in models:
        for k in n_topics:
            print(banner)
            print('Dataset:{},\t Model:{},\t K={}'.format(d, m, k))
            print(banner)

            layers = T_Layer[d][k]
            model = get_model(model_name=m, num_topics=k, dataset_name=d, num_layers=layers)
            output = model.train_model(dataset=data, save_dir=teacher_dir)

            # Only `output` is needed from here on; drop the model and ask
            # torch to release its cached CUDA memory before the next run.
            del model
            torch.cuda.empty_cache()

            # Run parameters
            results['Dataset'].append(d)
            results['Model'].append(m)
            results['K'].append(k)

            # Coherence scores
            npmi_score = npmi.score(output)
            results['NPMI'].append(npmi_score)
            cv_score = cv.score(output)
            results['CV'].append(cv_score)

            print('Results:-\n', results)
            print(footer)

# Run baselines, Student (S), and knowledge-distilled Student (SKD)

In [None]:
from octis.models.contextualized_topic_models_KD.models import ctmkd
from os import listdir
from random import seed, randint
from IPython.display import clear_output

# Five random seeds in [0, 2000]; every (model, topic count) pair is trained
# once per seed. The upper bound must be an int: a float such as 2e3 is
# rejected by random.randint on Python 3.12+ (and deprecated on 3.10/3.11).
seeds = [randint(0, 2000) for _ in range(5)]
n_topics = [20, 50, 100]
models = ['LDA', 'ETM', 'NeuralLDA', 'ProdLDA', 'Student', 'CTM_KD_2wd_woPartition']
datasets = ['20NG']


# Distillation hyper-parameters per dataset and topic count; `alpha` and
# `temp` are passed to get_model for the 'CTM_KD_2wd_woPartition' model.
SKD_hyp = {
    '20NG': {
        20: {'alpha': 0.1, 'temp': 3.0},
        50: {'alpha': 0.9, 'temp': 3.0},
        100: {'alpha': 0.9, 'temp': 5.0},
    },
    'M10': {
        10: {'alpha': 0.6, 'temp': 2.0},
        20: {'alpha': 0.8, 'temp': 4.0},
        50: {'alpha': 0.8, 'temp': 1.0},
        100: {'alpha': 0.9, 'temp': 4.0},
    },
}

# Per dataset and topic count, a string that is interpolated into the name of
# the saved teacher's checkpoint directory.
# NOTE: this rebinds `T_Layer`, which the teacher-training cell defines with
# integer layer counts; re-run that cell's definitions before training
# teachers again.
T_Layer = {
    '20NG': {
        20: '(100,)',
        50: '(100,)',
        100: '(100, 100, 100, 100, 100)',
    },
    'M10': {
        10: '(100, 100, 100, 100)',
        20: '(100, 100, 100, 100, 100)',
        50: '(100, 100)',
        100: '(100, 100, 100)',
    },
}

# Column-oriented result table: one list per column, one entry per run.
results = {
    'Dataset': [],
    'Seed': [],
    'K': [],
    'Model': [],
    'NPMI': [],
    'CV': [],
}

# Train every model for each topic count and seed on each dataset, and record
# the NPMI and C_V coherence (top-10 words) of each run in `results`.
for d in datasets:
    data = get_dataset(d)

    npmi = Coherence(texts=data.get_corpus(), measure='c_npmi', topk=10)
    cv = Coherence(texts=data.get_corpus(), measure='c_v', topk=10)

    for m in models:
        for k in n_topics:
            # Named `run_seed` rather than `seed` so the loop does not rebind
            # the `seed` function imported from `random` in this cell.
            for run_seed in seeds:
                print("-"*100)
                print('Dataset:{},\t Model:{},\t K={},\t seed={}'.format(d, m, k, run_seed))
                print("-"*100)

                random.seed(run_seed)
                torch.random.manual_seed(run_seed)

                # Only the distilled student needs a teacher and KD settings.
                ctm_T = None
                alpha = None
                temp = None
                if m in ['CTM_KD_2wd_woPartition']:
                    SAVE_DIR = './Teacher_models/{}/AVITM_nc_{}_tpm_0.0_tpv_{}_hs_prodLDA_ac_{}_do_softplus_lr_0.2_mo_0.002_rp_0.99'.format(d, k, 1 - (1./k), T_Layer[d][k])
                    # The epoch is recovered from the digits in the name of the
                    # first entry listdir returns.
                    # NOTE(review): listdir order is arbitrary, so this is only
                    # well-defined when the directory holds a single checkpoint.
                    epoch = ''.join(x for x in listdir(SAVE_DIR)[0] if x.isdigit())
                    ctm_T = ctmkd.CTMKD(input_size=len(data.get_vocabulary()), bert_input_size=768)
                    print('Loading Teacher Model for, model={}, #Topics={}'.format(m, k))
                    ctm_T.load(SAVE_DIR, epoch=epoch)
                    alpha = SKD_hyp[d][k]['alpha']
                    temp = SKD_hyp[d][k]['temp']

                model = get_model(model_name=m,
                                  num_topics=k,
                                  dataset_name=d,
                                  ctm_T=ctm_T,
                                  alpha=alpha,
                                  temp=temp,
                                  seed=run_seed)

                output = model.train_model(dataset=data)

                # Only `output` is needed from here on; drop the model and ask
                # torch to release its cached CUDA memory before the next run.
                del model
                torch.cuda.empty_cache()

                # Run parameters
                results['Dataset'].append(d)
                results['Model'].append(m)
                results['K'].append(k)
                results['Seed'].append(run_seed)

                # Coherence scores
                results['NPMI'].append(npmi.score(output))
                results['CV'].append(cv.score(output))

                print('Results:-\n', results)
                print('#'*100)