In [2]:
import logging
import argparse
import math
import os
import sys
from time import strftime, localtime
import random
import numpy as np
import subprocess

from pytorch_transformers import BertModel, BertConfig
# from data_utils import Tokenizer4Bert, ABSADataset
# from asa_tgcn_model import AsaTgcn

# !pip install scikit-learn
from sklearn import metrics
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split


CONFIG_NAME = 'config.json'
WEIGHTS_NAME = 'pytorch_model.bin'

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import copy
import numpy as np
import torch
from torch.utils.data import Dataset
# !pip install pytorch_transformers
from pytorch_transformers import BertTokenizer

In [4]:
def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
#is designed to adjust the length of a sequence (such as a list or array) to a specified maximum length
#by either padding or truncating the sequence as necessary
#padding='post' add words in the end of the sentence if necessary 
#trancating='post' trancate sentence in the end if the length of a sentence is longer than maxlen
    
    x = (np.ones(maxlen) * value).astype(dtype)
    
    if truncating == 'pre':
        trunc = sequence[-maxlen:] 
    else:
        trunc = sequence[:maxlen]
    
    trunc = np.asarray(trunc, dtype=dtype)

    if padding == 'post':
        x[:len(trunc)] = trunc
    else:
        x[-len(trunc):] = trunc
    return x

In [5]:
def get_args(model_type = 'tgcn', # tgcn, tgcn+sem, tri_gcn
             # Select which modules to use for hybrid model
             tgcn = True,
             semgcn = True, 
             lexgcn = True,
             tgcn_layers = 3,
             semgcn_layers = 2,
             lexgcn_layers = 2,
             path = None, 
             year='2015',
             val_file='val.txt',
             log = 'log',
             bert_model='bert-large-uncased', # ADD ANOTHER VALUE FOR THAT VARIABLE BECAUSE OF THE ERROR (change underscore to the dash)
             #bert_model='bert_large_uncased',
             cooc_path = 'cooc_matrix.csv', # Path to co-occurrence matrix file
             cooc = None, # Pandas DataFrame co-occurrence matrix. If not specified, it will be loaded from cooc_path
             learning_rate=2e-5,
             dropout=0.2,
             concat_dropout = 0.5,
             bert_dropout=0.2,
             l2reg=0.01,
             num_epoch=50,
             batch_size=6, # PREVIOUS NUMBER 16  CHANGED BECAUSE OF THE MEMORY ERROR
             log_step=5,
             max_seq_len=100,
             polarities_dim=3,
             device='cuda',
             seed=50,
             valset_ratio=0.1, # the percentage fo the validation set
             do_train=True,
             do_eval=True,
             eval_epoch_num=0,
             fusion_type = 'concat', # 'concat' or 'gate'
             use_ensemble = True, 
            save_models='last',
            print_sentences = False,
             optim = 'adam'
            ):
    assert model_type == 'tgcn' or model_type == 'tgcn+sem' or model_type == 'tri_gcn'
    opt = argparse.Namespace()
    opt.model_type = model_type
    opt.modules = {'tgcn': tgcn, 'semgcn': semgcn, 'lexgcn': lexgcn}
    opt.num_layers = {'tgcn': tgcn_layers, 'semgcn': semgcn_layers, 'lexgcn': lexgcn_layers}
    opt.year = year
    fusion = "" if model_type == 'tgcn' else "+" + fusion_type
    opt.train_file = f'data/train{year}restaurant.txt'
    opt.test_file = f'data/test{year}restaurant.txt'
    opt.model_path = f'test_models/{year}{model_type}{fusion}_seed{seed}_reg{l2reg}_drop{dropout}_cdrop{concat_dropout}_lr{learning_rate}_epochs{num_epoch}_{optim.lower()}'
#     if model_type == 'tgcn':
#         opt.model_path = f'models/rest_{year}/BERT.L_seed{seed}_reg{l2reg}_drop{dropout}_lr{learning_rate}_epochs{num_epoch}' 
#     elif model_type == 'tgcn+sem':
#         opt.model_path = f'models/rest_{year}/{model_type}/{model_type}_seed{seed}_reg{l2reg}_drop{dropout}_lr{learning_rate}_epochs{num_epoch}'
    if do_eval and not do_train:
        opt.model_path += f'/epoch_{eval_epoch_num}'
    if path:
        opt.model_path = path
    opt.val_file = val_file
    opt.log = log
    opt.bert_model = bert_model
    opt.cooc_path = cooc_path
    opt.cooc = cooc
    opt.learning_rate = learning_rate
    opt.dropout = dropout
    opt.concat_dropout = concat_dropout
    opt.bert_dropout = bert_dropout
    opt.l2reg = l2reg
    opt.num_epoch = num_epoch
    opt.batch_size = batch_size
    opt.log_step = log_step
    opt.max_seq_len = max_seq_len
    opt.polarities_dim = polarities_dim
    opt.device = device
    opt.seed = seed
    opt.valset_ratio = valset_ratio
    opt.do_train = do_train
    opt.do_eval = do_eval
    opt.eval_epoch_num = eval_epoch_num
    opt.fusion_type = fusion_type
    opt.use_ensemble = True
    opt.save_models = save_models
    opt.print_sent = print_sentences
    opt.optim = optim
    return opt

In [6]:
class Tokenizer4Bert:
#is designed to handle the tokenization of text for use with a BERT model
    def __init__(self, max_seq_len, pretrained_bert_name):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
        self.max_seq_len = max_seq_len

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        sequence = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 
        #tokenize the text and thenmap it with the corresponding ids
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)

    def id_to_sequence(self, sequence, reverse=False, padding='post', truncating='post'):
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)

In [7]:
class ABSADataset(Dataset):
    def __init__(self, datafile, tokenizer, opt, deptype2id=None, dep_order="first"):
        self.datafile = datafile
        self.depfile = "{}.dep".format(datafile)
        self.tokenizer = tokenizer
        self.opt = opt #IT IS THE ORIGINAL CODE, I COMMENTED BECAUSE I HAVE A MISTAKE WHEN CREATING COOC
        #self.opt = opt if opt is not None else get_default_config()
        self.deptype2id = deptype2id
        self.dep_order = dep_order
        self.textdata = ABSADataset.load_datafile(self.datafile)
        self.depinfo = ABSADataset.load_depfile(self.depfile)
        self.polarity2id = self.get_polarity2id()
        self.feature = []
        for sentence,depinfo in zip(self.textdata, self.depinfo):
            self.feature.append(self.create_feature(sentence, depinfo, opt.print_sent))
            
        #self.cooc_matrix = self.create_cooc_matrix() # NEW LINE TO CREATE COOC 
        print(self.feature[:1])

    def __getitem__(self, index):
        return self.feature[index]

    def __len__(self):
        return len(self.feature)

    def ws(self, text):
        tokens = []
        valid_ids = []
        for i, word in enumerate(text):
            if len(text) <= 0:
                continue
            token = self.tokenizer.tokenizer.tokenize(word)
            tokens.extend(token)
            for m in range(len(token)):
                if m == 0:
                    valid_ids.append(1)
                else:
                    valid_ids.append(0)
        token_ids = self.tokenizer.tokenizer.convert_tokens_to_ids(tokens)
        return tokens, token_ids, valid_ids

    def create_feature(self, sentence, depinfo, print_sent = False):
        text_left, text_right, aspect, polarity = sentence

        cls_id = self.tokenizer.tokenizer.vocab["[CLS]"]
        sep_id = self.tokenizer.tokenizer.vocab["[SEP]"]

        doc = text_left + " " + aspect + " " + text_right

        left_tokens, left_token_ids, left_valid_ids = self.ws(text_left.split(" "))
        right_tokens, right_token_ids, right_valid_ids = self.ws(text_right.split(" "))
        aspect_tokens, aspect_token_ids, aspect_valid_ids = self.ws(aspect.split(" "))
        tokens = left_tokens + aspect_tokens + right_tokens
        input_ids = [cls_id] + left_token_ids + aspect_token_ids + right_token_ids + [sep_id] + aspect_token_ids + [sep_id]
        valid_ids = [1] + left_valid_ids + aspect_valid_ids + right_valid_ids + [1] + aspect_valid_ids + [1]
        mem_valid_ids = [0] + [0] * len(left_tokens) + [1] * len(aspect_tokens) + [0] * len(right_tokens) # aspect terms mask
        segment_ids = [0] * (len(tokens) + 2) + [1] * (len(aspect_tokens)+1)
        
        
        dep_instance_parser = DepInstanceParser(basicDependencies=depinfo, tokens=[])
        if self.dep_order == "first":
            dep_adj_matrix, dep_type_matrix = dep_instance_parser.get_first_order()
        elif self.dep_order == "second":
            dep_adj_matrix, dep_type_matrix = dep_instance_parser.get_second_order()
        elif self.dep_order == "third":
            dep_adj_matrix, dep_type_matrix = dep_instance_parser.get_third_order()
        else:
            raise ValueError()

        token_head_list = []
        for input_id, valid_id in zip(input_ids, valid_ids):
            if input_id == cls_id:
                continue
            if input_id == sep_id:
                break
            if valid_id == 1:
                token_head_list.append(input_id)

        input_ids = self.tokenizer.id_to_sequence(input_ids)
        valid_ids = self.tokenizer.id_to_sequence(valid_ids)
        segment_ids = self.tokenizer.id_to_sequence(segment_ids)
        mem_valid_ids = self.tokenizer.id_to_sequence(mem_valid_ids)

        size = input_ids.shape[0]
        
        if print_sent:
            print(doc)
            print(len(dep_adj_matrix[0]))

        # final_dep_adj_matrix = [[0] * size for _ in range(self.tokenizer.max_seq_len)]
        # final_dep_value_matrix = [[0] * size for _ in range(self.tokenizer.max_seq_len)]
        final_dep_adj_matrix = [[0] * size for _ in range(size)]
        final_dep_value_matrix = [[0] * size for _ in range(size)]
        for i in range(len(token_head_list)):
            for j in range(len(dep_adj_matrix[i])):
                if j >= size:
                    break
                final_dep_adj_matrix[i+1][j] = dep_adj_matrix[i][j]
                final_dep_value_matrix[i+1][j] = self.deptype2id[dep_type_matrix[i][j]]

        return {
            "input_ids":torch.tensor(input_ids),
            "valid_ids":torch.tensor(valid_ids),
            "segment_ids":torch.tensor(segment_ids),
            "mem_valid_ids":torch.tensor(mem_valid_ids),
            "dep_adj_matrix":torch.tensor(final_dep_adj_matrix),
            "dep_value_matrix":torch.tensor(final_dep_value_matrix),
            "polarity": self.polarity2id[polarity],
            "raw_text": doc,
            "aspect": aspect
        }


    @staticmethod
    def load_depfile(filename):
        data = []
        with open(filename, 'r') as f:
            dep_info = []
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    items = line.split("\t")
                    dep_info.append({
                        "governor": int(items[0]),
                        "dependent": int(items[1]),
                        "dep": items[2],
                    })
                else:
                    if len(dep_info) > 0:
                        data.append(dep_info)
                        dep_info = []
            if len(dep_info) > 0:
                data.append(dep_info)
                dep_info = []
        return data

    @staticmethod
    def load_datafile(filename):
        data = []
        with open(filename, 'r') as f:
            lines = f.readlines()
            for i in range(0, len(lines), 3):
                text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
                aspect = lines[i + 1].lower().strip()
                text_right = text_right.replace("$T$", aspect)
                polarity = lines[i + 2].strip()
                data.append([text_left, text_right, aspect, polarity])

        return data

    @staticmethod
    def load_deptype_map(opt):
        deptype_set = set()
        for filename in [opt.train_file, opt.test_file, opt.val_file]:
            filename = "{}.dep".format(filename)
            if os.path.exists(filename) is False:
                continue
            data = ABSADataset.load_depfile(filename)
            for dep_info in data:
                for item in dep_info:
                    deptype_set.add(item['dep'])
        deptype_map = {"none": 0}
        for deptype in sorted(deptype_set, key=lambda x:x):
            deptype_map[deptype] = len(deptype_map)
        return deptype_map

    @staticmethod
    def get_polarity2id():
        polarity_label = ["-1","0","1"]
        return dict([(label, idx) for idx,label in enumerate(polarity_label)])

In [8]:
class DepInstanceParser():
    def __init__(self, basicDependencies, tokens):
        self.basicDependencies = basicDependencies
        self.tokens = tokens
        self.words = []
        self.dep_governed_info = []
        self.dep_parsing()


    def dep_parsing(self):
#         print('strat dep_parsing function...')
#         print(len(self.tokens)) #0
        if len(self.tokens) > 0:
            words = []
            for token in self.tokens:
                token['word'] = token
                words.append(self.change_word(token['word'])) #change_word check for the paranthesis
            dep_governed_info = [
                {"word": word}
                for i,word in enumerate(words)
            ]
            self.words = words
        else:
            dep_governed_info = [{}] * len(self.basicDependencies)
        for dep in self.basicDependencies:
            dependent_index = dep['dependent'] - 1
            governed_index = dep['governor'] - 1
            dep_governed_info[dependent_index] = {
                "governor": governed_index,
                "dep": dep['dep']
            }
        self.dep_governed_info = dep_governed_info #contains detailed information about the dependencies among these tokens.

    def change_word(self, word):
    #designed to handle specific formatting issues within the text data it processes, particularly dealing 
    #with tokens representing left and right parentheses.
        
        if "-RRB-" in word:
        #The method first checks if the string "-RRB-" is present in the word. This string is often used in 
        #linguistic data to represent a right parenthesis ) to prevent misinterpretation during parsing processes. 
        #If "-RRB-" is found, it is replaced with ")".
            return word.replace("-RRB-", ")")
        
        if "-LRB-" in word:
        #Next, the method checks for the presence of "-LRB-" in the word. Similarly, this string represents a left 
        #parenthesis ( and is replaced by "(".
            return word.replace("-LRB-", "(")
        return word

    def get_first_order(self, direct=False):
        #designed to generate matrices representing the adjacency and types of dependency relationships between 
        #tokens in a sentence based on their parsed dependencies.
        
        #indicate whether there is a direct dependency link between the tokens
        dep_adj_matrix  = [[0] * len(self.dep_governed_info) for _ in range(len(self.dep_governed_info))]
        
        #indicate the type of dependency (like "subj", "obj") between tokens instead of binary indicators as in dep_adj_matrix
        dep_type_matrix = [["none"] * len(self.dep_governed_info) for _ in range(len(self.dep_governed_info))]
        
        for i, dep_info in enumerate(self.dep_governed_info):
            governor = dep_info["governor"] #the index of the token that governs the current token
            dep_type = dep_info["dep"] #the type of the dependency
            
            #indicate the existance of the dependency between tokens
            dep_adj_matrix[i][governor] = 1
            dep_adj_matrix[governor][i] = 1
            
            #If direct is False, both [i][governor] and [governor][i] are set to the dependency type.
            #If direct is True, the entries are suffixed to indicate the direction (_in for incoming, _out for outgoing 
            #dependencies relative to each token).
            
            dep_type_matrix[i][governor] = dep_type if direct is False else "{}_in".format(dep_type)
            dep_type_matrix[governor][i] = dep_type if direct is False else "{}_out".format(dep_type)
        
        return dep_adj_matrix, dep_type_matrix

    def get_next_order(self, dep_adj_matrix, dep_type_matrix):
        new_dep_adj_matrix = copy.deepcopy(dep_adj_matrix)
        new_dep_type_matrix = copy.deepcopy(dep_type_matrix)
        for target_index in range(len(dep_adj_matrix)):
            for first_order_index in range(len(dep_adj_matrix[target_index])):
                if dep_adj_matrix[target_index][first_order_index] == 0:
                    continue
                for second_order_index in range(len(dep_adj_matrix[first_order_index])):
                    if dep_adj_matrix[first_order_index][second_order_index] == 0:
                        continue
                    if second_order_index == target_index:
                        continue
                    if new_dep_adj_matrix[target_index][second_order_index] == 1:
                        continue
                    new_dep_adj_matrix[target_index][second_order_index] = 1
                    new_dep_type_matrix[target_index][second_order_index] = dep_type_matrix[first_order_index][second_order_index]
        return new_dep_adj_matrix, new_dep_type_matrix

    def get_second_order(self, direct=False):
        dep_adj_matrix, dep_type_matrix = self.get_first_order(direct=direct)
        return self.get_next_order(dep_adj_matrix, dep_type_matrix)

    def get_third_order(self, direct=False):
        dep_adj_matrix, dep_type_matrix = self.get_second_order(direct=direct)
        return self.get_next_order(dep_adj_matrix, dep_type_matrix)

    def search_dep_path(self, start_idx, end_idx, adj_max, dep_path_arr):
        for next_id in range(len(adj_max[start_idx])):
            if next_id in dep_path_arr or adj_max[start_idx][next_id] in ["none"]:
                continue
            if next_id == end_idx:
                return 1, dep_path_arr + [next_id]
            stat, dep_arr = self.search_dep_path(next_id, end_idx, adj_max, dep_path_arr + [next_id])
            if stat == 1:
                return stat, dep_arr
        return 0, []

    def get_dep_path(self, start_index, end_index, direct=False):
        dep_adj_matrix, dep_type_matrix = self.get_first_order(direct=direct)
        _, dep_path = self.search_dep_path(start_index, end_index, dep_type_matrix, [start_index])
        return dep_path

In [9]:
#trainset for 2015
opt = get_args(year='2015')
deptype2id = ABSADataset.load_deptype_map(opt)
tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.bert_model)
trainset2015 = ABSADataset(opt.train_file, tokenizer, opt, deptype2id=deptype2id)

loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\bromi\.cache\torch\pytorch_transformers\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
[{'input_ids': tensor([  101, 13325,  2013,  3025,  8466,  2023,  2109,  2000,  2022,  1037,
         2204,  2173,  1010,  2021,  2025,  2151,  2936,  1012,   102,  2173,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,    

In [34]:
#trainset for 2016
opt = get_args(year='2016')
deptype2id = ABSADataset.load_deptype_map(opt)
tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.bert_model)
trainset2016 = ABSADataset(opt.train_file, tokenizer, opt, deptype2id=deptype2id)

loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\bromi\.cache\torch\pytorch_transformers\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
[{'input_ids': tensor([  101, 13325,  2013,  3025,  8466,  2023,  2109,  2000,  2022,  1037,
         2204,  2173,  1010,  2021,  2025,  2151,  2936,  1012,   102,  2173,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,    

In [39]:
#merge senteces from both datasets
cooc_text = []

# Collect text from the 2015 dataset
for feature in trainset2015.feature:
    cooc_text.append(feature['raw_text'])

# Collect text from the 2016 dataset
for feature in trainset2016.feature:
    cooc_text.append(feature['raw_text'])

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import webtext
from collections import defaultdict
import pandas as pd

In [14]:
# Downloading the webtext corpus
nltk.download('webtext')

[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\bromi\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

In [41]:
# Load webtext sentences
web_sentences = [text for text in webtext.sents()]

In [42]:
web_sentences = [' '.join(sent) for sent in webtext.sents()]

In [43]:
for i in web_sentences:
    cooc_text.append(i)

In [56]:
from collections import defaultdict
import pandas as pd

def build_cooccurrence_matrix(sentences, window_size=3):
    cooc_counts = defaultdict(int)
    word_counts = defaultdict(int)
    
    # Process each sentence
    for sentence in sentences:
        # Clean and tokenize the sentence
        words = tokenizer.tokenizer.tokenize(sentence)  # sentence.split()
        
        # Keep track of encountered word pairs in each sentence
        encountered_pairs = set()
        
        # Count word occurrences and co-occurrences within the window size
        for i in range(len(words)):
            # Count each word occurrence
            word_counts[words[i]] += 1 
            
            start = max(0, i - window_size)
            end = min(len(words), i + window_size + 1)
            
            for j in range(start, end):
                if i != j:
                    # Check if the word pair has already been encountered in this sentence
                    pair = tuple(sorted([words[i], words[j]]))
                    if pair not in encountered_pairs:
                        # Increment co-occurrence count for this word pair
                        cooc_counts[pair] += 1
                        encountered_pairs.add(pair)

    # Build DataFrame from co-occurrence dictionary
    words = list(word_counts.keys())
    cooc_matrix = pd.DataFrame(0.0, index=words, columns=words)  # Initialize DataFrame with float zeros
    
    for (w1, w2), count in cooc_counts.items():
        cooc_matrix.at[w1, w2] += count
        cooc_matrix.at[w2, w1] += count  # Assuming undirected co-occurrence

    # Scale by the product of word frequencies
    for (w1, w2), count in cooc_counts.items():
        cooc_matrix.at[w1, w2] /= (word_counts[w1] * word_counts[w2])
        cooc_matrix.at[w2, w1] /= (word_counts[w1] * word_counts[w2])

    return cooc_matrix


              judging          from  previous     posts          this  \
judging      0.000000  8.880995e-04  0.021739  0.200000  0.000000e+00   
from         0.000888  2.488321e-12  0.000097  0.000355  2.238906e-05   
previous     0.021739  9.653255e-05  0.000000  0.008696  3.425283e-05   
posts        0.200000  3.552398e-04  0.008696  0.000000  2.100840e-04   
this         0.000000  2.238906e-05  0.000034  0.000210  2.282724e-12   
...               ...           ...       ...       ...           ...   
persistence  0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00   
juniper      0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00   
bel          0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00   
##utz        0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00   
ideally      0.000000  0.000000e+00  0.000000  0.000000  0.000000e+00   

                 used        to        be         a      good  ...  gently  \
judging      0.000000  0.000000  0.000000  0.

In [None]:
sentences = cooc_text
cooc_matrix = build_cooccurrence_matrix(sentences, window_size=3)

In [58]:
cooc_matrix_test=cooc_matrix

In [59]:
id_to_index_map_2 = {tokenizer.tokenizer.convert_tokens_to_ids(tokenizer.tokenizer.tokenize(w))[0]: i for i, w in enumerate(cooc_matrix_test.columns)}
indices_to_keep = [index for token_id, index in id_to_index_map_2.items()]
filtered_cooc_matrix = cooc_matrix_test.iloc[indices_to_keep, indices_to_keep]

In [61]:
cooc_matrix_final=filtered_cooc_matrix

In [62]:
# Mapping token ids to indices of matrix
id_to_index_map = {tokenizer.tokenizer.convert_tokens_to_ids(tokenizer.tokenizer.tokenize(w))[0]: i for i, w in enumerate(filtered_cooc_matrix.columns)}

In [63]:
# THE CODE FOR COLUMNS AND INDEXES (COLUMNS - INDEX IN CO-OCCUR MATRIX, INDEX - INDEX IN FROM TOKENIZATION)
cooc_ids = [token_id for token_id, index in id_to_index_map.items()]
cooc_ids_array = np.array(cooc_ids)
cooc_matrix_final.index = cooc_ids_array.astype(int)
cooc_matrix_final.columns = cooc_ids_array.astype(int)

In [65]:
df = pd.DataFrame(cooc_matrix_final)  # Convert your matrix to a DataFrame

In [66]:
df.to_csv('cooc_matrix_final2.csv', index=True)

In [67]:
cooc = pd.read_csv('cooc_matrix_final2.csv',index_col=0)