In [None]:
#Set random seed for reproducibility

#SEED has to be assigned before any of the seeding calls below read it
SEED = 85

#PYTHONHASHSEED is read at interpreter start-up, so setting it here only
#affects Python processes launched after this point, not the running kernel
os.environ['PYTHONHASHSEED']=str(SEED)
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
os.environ['TF_DETERMINISTIC_OPS'] = '1'

#Seed every random number generator once
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

#Keep cuDNN enabled, but restrict it to deterministic algorithms and turn off auto-tuning
torch.backends.cudnn.enabled=True
torch.backends.cudnn.deterministic=True
torch.backends.cudnn.benchmark = False


# Approach 1

# Run topic modeling using party manifesto data

In [None]:
#Create function for tokenizing Japanese party manifestos

#Note: Since there are only a few documents, each containing many sentences, it is better to split them into sentences and treat each sentence as one document.

from sklearn.feature_extraction.text import CountVectorizer
import fugashi

def tokenize_jp(text):
    """Split Japanese text into tokens with fugashi.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list
        The ``surface`` attribute of every node the tagger yields, in order.
    """
    #This function is passed to CountVectorizer as its tokenizer, so it is
    #called many times; build the tagger on first use and reuse it afterwards
    #instead of constructing a new one for every call.
    tagger = getattr(tokenize_jp, "_tagger", None)
    if tagger is None:
        tagger = fugashi.Tagger()
        tokenize_jp._tagger = tagger
    return [word.surface for word in tagger(text)]

#Topic-wise document grouping so later we can easily retrieve all documents per topic
def groupdoc(topics,text):
    """Group documents by the topic id assigned to them.

    Parameters
    ----------
    topics : sequence
        One topic id per document, in the same order as ``text``.
    text : sequence
        The documents.

    Returns
    -------
    dict
        Maps every topic id to the list of its documents (input order kept).
        The group with id ``-1`` is removed when present.
    """
    topic_docs = {topic: [] for topic in set(topics)}
    for topic, doc in zip(topics, text):
        topic_docs[topic].append(doc)
    #Not every run produces a -1 group (BERTopic's outlier label), so drop it
    #without raising KeyError when it is absent
    topic_docs.pop(-1, None)
    return topic_docs

vectorizer = CountVectorizer(tokenizer=tokenize_jp)

#Run topic modeling for party manifesto data

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
topicwiseGropedDocforAllModels = dict()
topicwordsAllBertModelsPartyManifestos = dict()

#Numbers of topics to try (20, 25, ..., 55). The labels in nt are derived from
#these counts so that the keys of the two result dicts always agree with the
#number embedded in the corresponding model name.
topic_counts = list(range(20, 60, 5))
nt = ['{}topics'.format(count) for count in topic_counts]

#Name to add in the saved model
name = 'PartyManifestosNew'

#Set different parameter for HDBSCAN

hdbscanmodel = hdbscan.HDBSCAN(min_cluster_size=10, cluster_selection_epsilon = 0.15, metric='euclidean', cluster_selection_method='eom',prediction_data=True)

#The embedding model does not depend on the number of topics, so load it once
japanese = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

#Run topic modeling: one model per topic count, results stored under that count
#NOTE(review): the corpus is YouTUbeData although this section is about party manifestos - confirm this is the intended input
for r, n in enumerate(topic_counts):
    topic_el = BERTopic(embedding_model=japanese, top_n_words=10, hdbscan_model=hdbscanmodel, nr_topics=n, vectorizer_model=vectorizer,calculate_probabilities=False, low_memory = True)
    topics, _ = topic_el.fit_transform(YouTUbeData)
    #topic_el.save('Bert' + name + nt[r])
    topicwiseGropedDocforAllModels[n] = groupdoc(topics,YouTUbeData)
    topicwordsAllBertModelsPartyManifestos[n] = extract_words(topic_el,n)

# Calculate coherence score (Cv) to select one topic model 
This requires the tokenized documents and a dictionary of the form {word: frequency}.


In [None]:
#Function for tokenization
words=[]
#One tagger instance shared by every tokenize_p call
tagger = fugashi.Tagger()
def tokenize_p(docs):
    """Return the surface form of every node the shared tagger yields for ``docs``."""
    surfaces = []
    for node in tagger(docs):
        surfaces.append(node.surface)
    return surfaces

In [None]:
#Function for tokenizing the set of documents of every topic (no stopword removal)
#NOTE(review): funcdoc is defined again in the following cell; when the cells run
#in order that later definition replaces this one - keep only one of the two.

def funcdoc(text):
    """Tokenize the documents of every topic of one model.

    Parameters
    ----------
    text : str
        Model name whose characters ``[-8:-6]`` hold the two-digit topic
        count, e.g. ``'BertPartyManifestosNew20topics'`` -> 20.

    Returns
    -------
    dict
        Maps each topic id to a list with one token list per document.
        Topic ids below the topic count that have no documents stored in
        ``topicwiseGropedDocforAllModels`` are skipped.
    """
    nr = int(text[-8:-6])
    docs_by_topic = topicwiseGropedDocforAllModels[nr]
    tok=dict()
    for i in range(nr):
        #A model can hold fewer topic groups than its nominal count; skip the gaps
        if i not in docs_by_topic:
            continue
        tok[i] = [tokenize_p(doc) for doc in docs_by_topic[i]]
    return tok

In [None]:
#Function for tokenizing the set of documents of every topic and removing stopwords

def funcdoc(text, stopwords_path="stopwords-jp.txt"):
    """Tokenize the documents of every topic of one model and drop stopwords.

    Parameters
    ----------
    text : str
        Model name whose characters ``[-8:-6]`` hold the two-digit topic
        count, e.g. ``'BertPartyManifestosNew20topics'`` -> 20.
    stopwords_path : str, optional
        UTF-8 text file with one stopword per line.

    Returns
    -------
    dict
        Maps each topic id to a list with one token list per document, with
        every stopword removed. Topic ids below the topic count that have no
        documents stored in ``topicwiseGropedDocforAllModels`` are skipped.
    """
    #Read the stopwords into a set and close the file right away
    with open(stopwords_path, encoding='utf-8') as stopword_file:
        stopwordsJp = {line.rstrip('\n') for line in stopword_file}
    nr = int(text[-8:-6])
    docs_by_topic = topicwiseGropedDocforAllModels[nr]
    tok=dict()
    for i in range(nr):
        #A model can hold fewer topic groups than its nominal count; skip the gaps
        if i not in docs_by_topic:
            continue
        #Build filtered lists instead of calling remove() while iterating,
        #which skips the element that follows every removed stopword
        tok[i] = [[token for token in tokenize_p(doc) if token not in stopwordsJp]
                  for doc in docs_by_topic[i]]
    return tok

In [None]:
#function to delete stopwords from dict
def cleandict(dictJP, stopwords_path="stopwords-jp.txt"):
    """Remove stopwords from every token list in ``dictJP``, in place.

    Parameters
    ----------
    dictJP : list of lists (or a dict keyed 0..len-1 holding lists)
        Token lists to clean; each list is modified in place.
    stopwords_path : str, optional
        UTF-8 text file with one stopword per line.

    Returns
    -------
    The same ``dictJP`` object that was passed in.
    """
    #Read the stopwords into a set and close the file right away
    with open(stopwords_path, encoding='utf-8') as stopword_file:
        stopwordsJp = {line.rstrip('\n') for line in stopword_file}
    for i in range(len(dictJP)):
        #Slice assignment keeps the original list objects while avoiding
        #remove() during iteration, which skips consecutive stopwords
        dictJP[i][:] = [token for token in dictJP[i] if token not in stopwordsJp]
    return dictJP

In [None]:
#Names of all saved models: one per topic count from 20 to 55 in steps of 5
listofBertNames = ['BertPartyManifestosNew' + str(topic_count) + 'topics'
                   for topic_count in range(20, 56, 5)]

In [None]:
#Function to calculate coherence score
def calculate_C_S():
    """Compute the average c_v coherence of every model in ``listofBertNames``.

    For each model name the documents of every topic are tokenized with
    ``funcdoc`` and cleaned with ``cleandict``; a gensim ``Dictionary`` is
    built from them and the coherence of that topic's words is computed on
    them. The per-topic values are averaged over the topics that were scored.

    Returns
    -------
    list
        One average coherence value per entry of ``listofBertNames``, in the
        same order (``nan`` for a model without any topic documents).
    """
    from gensim.models.coherencemodel import CoherenceModel
    from gensim.corpora import Dictionary
    resList = list()
    for i in listofBertNames:
        #The two digits before 'topics' in the model name are the topic count
        nr = int(i[-8:-6])
        #Tokenize the documents of every topic of this model
        tok = funcdoc(i)
        total_coherence = 0

        for b in sorted(tok):
            #cleandict strips the stopwords from tok[b] in place
            a = cleandict(tok[b])
            JpDict = Dictionary(a)

            #Topic-wise coherence score, accumulated for the model average
            topics = [topicwordsAllBertModelsPartyManifestos[nr][b]]
            cm = CoherenceModel(topics=topics, texts=tok[b], dictionary=JpDict, coherence='c_v')
            total_coherence += cm.get_coherence()

        #Average over the topics actually scored
        AverC_V = total_coherence / len(tok) if tok else float('nan')
        #Report the model name next to its score
        print(i, AverC_V)
        resList.append(AverC_V)

    return resList


resList = calculate_C_S()

# Calculate similarity score between party manifestos and VAA statements

In [None]:
#Extract the topic-wise documents of the 45-topic model
selected_topic_count = 45
topicwise_documents = topicwiseGropedDocforAllModels[selected_topic_count]
len(topicwise_documents)

In [None]:
#Output the topic-documents pairs with their score
def calculate_cossim(sentences1,sentences2):
    """Score every sentence of ``sentences1`` against every document group.

    For each group ``sentences2[i]`` (``i`` in ``range(len(sentences2))``) the
    cosine similarity between each sentence and each document of the group is
    computed, and the mean over the group's documents is stored in the global
    dict ``score`` under the key ``(sentence, i)`` as a plain Python float.
    Every pair is also printed together with its score.

    Parameters
    ----------
    sentences1 : list of str
        Sentences to score.
    sentences2 : list or dict indexable by 0..len-1
        Groups of documents; each group is a list of str.

    Side effects
    ------------
    Rebinds the globals ``score`` (result dict) and ``cosine_scores`` (the
    similarity matrix of the last group processed).
    """
    from sentence_transformers import SentenceTransformer, util
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
    global score
    global cosine_scores
    score={}
    #The sentence embeddings are the same for every group, so compute them once
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    for i in range(len(sentences2)):
        embeddings2 = model.encode(sentences2[i], convert_to_tensor=True)
        #Rows follow sentences1, columns follow the documents of group i
        cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
        #Row means as Python floats: later cells can then average and compare
        #the scores regardless of the device the tensors were computed on
        mean_scores = cosine_scores.mean(dim=1).tolist()
        for statement, mean_score in zip(sentences1, mean_scores):
            score[statement,i]=mean_score
            print('''{} \n {} \n\x1b[31m\"SCORE\"\x1b[0m: {:.4f}'''.format(statement, i, mean_score))

In [None]:
#Score every VAA statement against the documents of every topic
#NOTE(review): later cells look the scores up through VAAStat, so that list presumably holds the same statements as VAA_statements - confirm
sentences1, sentences2 = VAA_statements, topicwise_documents

calculate_cossim(sentences1,sentences2)

In [None]:
#Calculate average cossim score for every VAA statement
dictVaaScore={}

for statement in VAAStat:
    #float() accepts plain numbers as well as 0-dim tensors on any device,
    #whereas np.asarray() cannot convert a tensor that lives on the GPU
    topic_scores = [float(score[statement, i]) for i in range(len(topicwise_documents))]
    dictVaaScore[statement] = sum(topic_scores) / len(topicwise_documents)

dictVaaScore

In [None]:
#Leave only document-statement pairs which cosine similarity score is above 0.5
#NOTE(review): each statement keeps a single (score, topic) entry, so when several
#topics exceed the threshold only the highest-numbered one survives - confirm this is intended
dictwithresults={}
for k in dictVaaScore.keys():
    #Check every topic of the selected model rather than a fixed number of topics
    for i in range(len(topicwise_documents)):
        if score[k,i] > 0.5:
            dictwithresults[k]= score[k,i],i


dictwithresults
    

In [1]:
#Some VAA statements are very similar, so they need to be merged here and the cosine similarity recalculated against the selected documents

#Same approach should be used with YouTube comments

# Approach 2
In addition to approach 1, if the documents need to be summarized to save the VAA designers' time, or if the designers would like to consider only the most representative documents, the following additional feature can be added to the system.


In [None]:
#BERTopic provides a method to retrieve the most representative documents of each topic
#NOTE(review): topic_el is the model fitted last in the topic-modeling loop, which is not necessarily the model selected for the similarity analysis - confirm

#get_representative_docs is a method of the BERTopic instance itself
representative_documents = topic_el.get_representative_docs()


# Bert extractive summarizer

In [None]:
#Summarize documents
from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer
#For the method to work properly, "sentence_handler.py" from the Summarizer library had to be adjusted for the specific language

In [None]:
#Checkpoint shared by the config, the tokenizer and the model below
japanese_bert_checkpoint = "cl-tohoku/bert-base-japanese"

#Load the config first so that hidden states are switched on before the model is built
custom_config = AutoConfig.from_pretrained(japanese_bert_checkpoint)
custom_config.output_hidden_states=True
custom_tokenizer = AutoTokenizer.from_pretrained(japanese_bert_checkpoint)
custom_model = AutoModel.from_pretrained(japanese_bert_checkpoint, config=custom_config)

#Extractive summarizer built on the custom model and tokenizer
model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)