In [1]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
%env TOKENIZERS_PARALLELISM=false
# =========================================================================================
# Data Loading
# =========================================================================================
import re
import string

def read_data(cfg):
    """Load the topics, content and correlations tables and build the text keys.

    Three text columns are derived on both `topics` and `content`:
    "Ti" (title), "TiDe" (title + description) and "TiDeTe"
    (title + description + text; topics have no text, so for topics it equals
    "TiDe"). Both frames are then sorted by the character length of the
    `cfg.uns_key` column, the raw/unused columns are dropped and the index is
    reset to 0..n-1.

    Args:
        cfg: configuration object. `cfg.uns_key` selects the column used for the
            length sort. An optional `cfg.data_dir` overrides the directory that
            holds `content.csv`, `topics.csv` and `correlations.csv`.

    Returns:
        Tuple `(topics, content, correlations)` of DataFrames.
    """
    # Fall back to the original hard-coded location when cfg has no data_dir.
    data_dir = getattr(cfg, "data_dir", "/mnt/hdd1/wangjingqi/dataset/lecr")
    content = pd.read_csv(os.path.join(data_dir, "content.csv"))
    topics = pd.read_csv(os.path.join(data_dir, "topics.csv"))
    correlations = pd.read_csv(os.path.join(data_dir, "correlations.csv"))

    # Assign the filled columns back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form works on a temporary under pandas Copy-on-Write
    # and would leave NaNs behind, which later break the string concatenation.
    topic_text_cols = ['title', 'description']
    content_text_cols = ['title', 'description', 'text']
    topics[topic_text_cols] = topics[topic_text_cols].fillna("")
    content[content_text_cols] = content[content_text_cols].fillna("")

    topics["Ti"] = topics["title"]
    content["Ti"] = content["title"]

    topics["TiDe"] = topics["title"] + " " + topics["description"]
    content["TiDe"] = content["title"] + " " + content["description"]

    topics["TiDeTe"] = topics["title"] + " " + topics["description"]
    content["TiDeTe"] = content["title"] + " " + content["description"] + " " + content["text"]

    # Sort by text length so that neighbouring rows have similar lengths.
    topics['length'] = topics[cfg.uns_key].apply(len)
    content['length'] = content[cfg.uns_key].apply(len)

    topics = (
        topics.sort_values('length')
        .drop(columns=['title', 'description', 'channel', 'category', 'level', 'has_content', 'length'])
        .reset_index(drop=True)
    )
    content = (
        content.sort_values('length')
        .drop(columns=['title', 'description', 'kind', 'text', 'copyright_holder', 'license', 'length'])
        .reset_index(drop=True)
    )

    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return topics, content, correlations

class LECRDataset(Dataset):
    """Map-style dataset that serves the raw values of one DataFrame column."""

    def __init__(self, df, key):
        # Only the underlying array of column `key` is kept.
        self.inputs = df[key].values

    def __len__(self):
        """Number of rows in the selected column."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the untokenized value stored at position `idx`."""
        return self.inputs[idx]

class collator():
    """Collate callable that tokenizes a batch of raw strings.

    The batch is padded to its longest member, truncated to `max_len`, and
    returned as PyTorch tensors without token type ids.
    """

    def __init__(self, pretrained_path, max_len=None) -> None:
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)

    def tokenize(self, texts):
        """Run the tokenizer on `texts` with the fixed batch options."""
        options = dict(
            padding='longest',
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
        )
        return self.tokenizer(texts, **options)

    def __call__(self, data):
        # The samples of a batch are materialized as a plain list before tokenizing.
        return self.tokenize(list(data))
# =========================================================================================
# Unsupervised model
# =========================================================================================
class MeanPooling(nn.Module):
    """Masked mean pooling: average `hidden_state` over dimension 1, ignoring masked positions."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, hidden_state, attention_mask):
        """Return the mask-weighted mean of `hidden_state` along dimension 1.

        Args:
            hidden_state: tensor to pool; presumably (batch, seq_len, hidden) - confirm with caller.
            attention_mask: tensor that, after a trailing unsqueeze, expands to
                `hidden_state.size()`; non-zero marks positions to keep.

        Returns:
            Float tensor with dimension 1 reduced. Rows whose mask is all zero
            yield zeros rather than NaN/inf.
        """
        # Cast the (usually integer) mask to float before the clamp: with an
        # integer tensor the 1e-9 floor can be truncated to 0 on torch builds
        # without clamp type promotion, so a fully masked row would divide by zero.
        mask = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        summed = torch.sum(hidden_state * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

class UNSModel(nn.Module):
    """Pretrained encoder followed by masked mean pooling, used to embed text."""

    def __init__(self, pretrained_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(pretrained_path)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Encode `inputs` and pool the last hidden state with the attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the pooled embedding; identical to `feature(inputs)`."""
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run `model` over every batch of `loader` and stack the outputs.

    The model is switched to eval mode, each batch's tensors are moved to
    `device`, inference runs without gradient tracking, and the per-batch
    outputs are concatenated into one NumPy array on the host.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in tqdm(loader):
            # Move every tensor of the batch onto the target device in place.
            for name in list(batch.keys()):
                batch[name] = batch[name].to(device)
            chunks.append(model(batch).to('cpu').numpy())
    return np.concatenate(chunks)

# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true, y_pred):
    """Mean fraction of ground-truth ids that appear among the predicted ids.

    Both arguments are Series of whitespace-separated id strings, paired row by
    row. The result is rounded to 5 decimals.
    """
    def as_id_set(joined_ids):
        return set(joined_ids.split())

    truth_sets = y_true.apply(as_id_set)
    pred_sets = y_pred.apply(as_id_set)
    recalls = np.array([
        len(truth & pred) / len(truth)
        for truth, pred in zip(truth_sets, pred_sets)
    ])
    return round(np.mean(recalls), 5)

# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg):
    """Build the (topic, content) pair table used for supervised training.

    For every topic, the candidate contents are its KNN predictions followed by
    any ground-truth contents that were not predicted. Each pair gets target 1
    when the content is in the topic's ground truth and 0 otherwise.

    Args:
        topics: DataFrame with columns 'id', 'language', 'predictions',
            'content_ids' and the `cfg.sup_key` text column. 'predictions' and
            'content_ids' are space-separated content ids.
        content: DataFrame whose index holds the content ids, with columns
            'language' and `cfg.sup_key`. Ids are assumed unique - TODO confirm.
        cfg: configuration object; `cfg.sup_key` names the text column.

    Returns:
        DataFrame with columns 'topics_ids', 'content_ids', 'input1', 'input2',
        'target', 'topic_language', 'content_language'.

    Raises:
        KeyError: if a candidate content id is missing from `content`'s index.
    """
    input_key = cfg.sup_key

    # Hoist the per-content lookups out of the hot loop: plain dict access
    # replaces two `content.loc[...]` scalar lookups per pair.
    content_language_by_id = content['language'].to_dict()
    content_input_by_id = content[input_key].to_dict()

    topics_ids = []
    content_ids = []
    input1 = []
    input2 = []
    targets = []
    topics_languages = []
    content_languages = []

    topic_rows = zip(
        topics['id'],
        topics[input_key],
        topics['language'],
        topics['predictions'],
        topics['content_ids'],
    )
    for topics_id, topics_input, topics_language, pred_str, truth_str in tqdm(topic_rows, total=len(topics)):
        predictions = pred_str.split(' ')
        ground_truth = truth_str.split(' ')
        positives = set(ground_truth)
        # Ordered de-duplication keeps the row order reproducible between runs;
        # iterating a set of strings depends on per-process hash randomization.
        candidates = list(dict.fromkeys(predictions + ground_truth))
        for pred in candidates:
            topics_ids.append(topics_id)
            content_ids.append(pred)
            input1.append(topics_input)
            input2.append(content_input_by_id[pred])
            topics_languages.append(topics_language)
            content_languages.append(content_language_by_id[pred])
            # 1 if the candidate is a true match for this topic, else 0.
            targets.append(1 if pred in positives else 0)

    train = pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'input1': input1,
         'input2': input2,
         'target': targets,
         'topic_language': topics_languages,
         'content_language': content_languages, }
    )
    # Release memory
    del topics_ids, content_ids, input1, input2, targets
    del topics_languages, content_languages, content_language_by_id, content_input_by_id
    gc.collect()
    return train
    
# =========================================================================================
# Get neighbors
# =========================================================================================
def get_neighbors(topics, content, cfg):
    """Embed topics and content, then attach the nearest content ids to each topic.

    Both frames are embedded with `UNSModel(cfg.model_name)` on the
    `cfg.uns_key` column. A cuML cosine `NearestNeighbors` index is fitted on
    the content embeddings, and for each topic the ids of its `cfg.top_n`
    nearest contents are written, space-separated, to `topics['predictions']`.

    Args:
        topics: DataFrame with the `cfg.uns_key` text column; gains a
            'predictions' column (modified in place).
        content: DataFrame with 'id' and the `cfg.uns_key` text column.
        cfg: configuration object providing uns_key, model_name, max_len, bs,
            nw, device and top_n.

    Returns:
        Tuple `(topics, content)`.
    """
    topics_dataset = LECRDataset(topics, cfg.uns_key)
    content_dataset = LECRDataset(content, cfg.uns_key)

    collate_fn = collator(cfg.model_name, cfg.max_len)
    # Both loaders keep the frame order (no shuffling, no dropped tail batch),
    # so embedding row i corresponds to frame row i.
    loader_kwargs = dict(
        batch_size=cfg.bs,
        shuffle=False,
        num_workers=cfg.nw,
        pin_memory=True,
        collate_fn=collate_fn,
        drop_last=False,
    )
    topics_loader = DataLoader(topics_dataset, **loader_kwargs)
    content_loader = DataLoader(content_dataset, **loader_kwargs)

    # Create unsupervised model to extract embeddings
    model = UNSModel(cfg.model_name)
    model.to(cfg.device)
    model.float()

    cp.cuda.Device(cfg.device).use()
    print(torch.cuda.current_device())

    topics_preds = get_embeddings(topics_loader, model, cfg.device)
    del topics_loader
    gc.collect()
    content_preds = get_embeddings(content_loader, model, cfg.device)

    # Drop the encoder before the KNN step and only then empty the CUDA cache;
    # calling empty_cache() while the model is still referenced cannot return
    # its memory, so the encoder would stay on the GPU during the KNN search.
    del model, topics_dataset, content_dataset, content_loader
    gc.collect()
    torch.cuda.empty_cache()

    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    del topics_preds, content_preds
    gc.collect()

    # KNN model
    print(' ')
    print('Training KNN model...')
    neighbors_model = NearestNeighbors(n_neighbors = cfg.top_n, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = False)

    # One device-to-host copy for the whole index matrix instead of one per topic.
    indices = cp.asnumpy(indices)
    # KNN indices are row positions of the content embedding matrix, so the ids
    # are looked up by position rather than by index label.
    content_id_array = content['id'].to_numpy()
    topics['predictions'] = [' '.join(content_id_array[row]) for row in indices]

    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, indices, content_id_array
    gc.collect()
    return topics, content

# Read data
# topics, content, correlations = read_data(CFG)
# # Run nearest neighbors
# topics, content = get_neighbors(topics, content, CFG)
# # Merge with target and compute max positive score
# topics = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
# pos_score = get_pos_score(topics['content_ids'], topics['predictions'])
# print(f'Our max positive score is {pos_score}')
# # We can delete correlations
# del correlations
# gc.collect()
# # Set id as index for content
# content.set_index('id', inplace = True)
# # Build training set
# train = build_training_set(topics, content, CFG)
# print(f'Our training set has {len(train)} rows')
# # Save train set to disk to train on another notebook
# train.to_csv('train.csv', index = False)
# train.head()
def test(CFG,models):
    """Evaluate the retrieval recall of several encoders.

    For each path in `models`, `CFG.model_name` is set to that path, the data
    is re-read, neighbours are retrieved and the max positive score against the
    correlations table is computed.

    Args:
        CFG: configuration object; its `model_name` attribute is overwritten
            and is left pointing at the last model after the call.
        models: iterable of model names or paths; the last path component is
            used as the key in the result.

    Returns:
        Dict mapping each model identifier to its positive score. The same dict
        is also printed.
    """
    scores = {}
    for model in models:
        # `model_id` rather than `id`, so the builtin is not shadowed; the
        # rstrip keeps a trailing slash from producing an empty identifier.
        model_id = model.rstrip('/').split('/')[-1]
        CFG.model_name = model
        print(f'Running model {model_id}')
        topics, content, correlations = read_data(CFG)
        topics, content = get_neighbors(topics, content, CFG)
        topics = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
        pos_score = get_pos_score(topics['content_ids'], topics['predictions'])
        print(f'{model_id} max positive score is {pos_score}')
        scores[model_id] = pos_score
    print(scores)
    return scores

env: TOKENIZERS_PARALLELISM=false


In [2]:
# Ad-hoc load of the three raw tables for inspection.
# NOTE(review): read_data(CFG) reads the same three files again and rebinds
# `topics`, `content` and `correlations` when it is called further down.
content = pd.read_csv("/mnt/hdd1/wangjingqi/dataset/lecr/content.csv")
topics = pd.read_csv("/mnt/hdd1/wangjingqi/dataset/lecr/topics.csv")
correlations = pd.read_csv("/mnt/hdd1/wangjingqi/dataset/lecr/correlations.csv")
    

In [4]:
# Display the raw content table as the cell output.
content

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


: 

In [2]:
class CFG:
    """Run configuration read by read_data, get_neighbors and build_training_set."""

    # --- encoder / tokenizer ---
    # Pretrained name or path handed to both the tokenizer and the encoder.
    model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
    # Tokenizer max_length; also embedded in the saved train CSV file name.
    max_len = 32

    # --- text columns ---
    # Column embedded for retrieval and used for the length sort.
    uns_key = "TiDeTe"
    # Column copied into input1/input2 of the training pairs.
    sup_key = "TiDeTe"

    # --- retrieval ---
    # n_neighbors of the KNN search, i.e. candidates retrieved per topic.
    top_n = 50

    # --- data loading / hardware ---
    # DataLoader batch_size.
    bs = 32
    # DataLoader num_workers.
    nw = 4
    # Device passed to model.to(...) and cp.cuda.Device(...).
    device = 7

    # Not referenced by the code visible in this notebook.
    seed = 42

In [None]:
# models = ["/mnt/hdd1/wangjingqi/ck/lecr/ft/pmmb2_TiDeTe/65610"]
# test(CFG,models)

In [3]:
# Load the tables with the derived text keys, sorted by the length of CFG.uns_key.
topics, content, correlations = read_data(CFG)

In [None]:
# Embed topics and content and attach the CFG.top_n nearest content ids to each topic
# (adds the 'predictions' column to `topics`).
topics, content = get_neighbors(topics, content, CFG)

In [None]:
# Preview the first topics rows.
topics.head()

In [None]:
# Attach the ground-truth 'content_ids' to each topic. The inner join keeps only
# topics that have a row in `correlations`.
# NOTE(review): this rebinds `topics`, so re-running the cell merges the
# already merged frame again.
topics = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
topics.head()

In [None]:
# Mean share of each topic's ground-truth contents found among the retrieved
# candidates; an upper bound for what a downstream re-ranker can recover.
# NOTE(review): scores apparently recorded from earlier runs, settings not noted:
# (0.92822, 0.84639, 0.92973, 0.84837) (, 0.87946)
pos_score = get_pos_score(topics['content_ids'], topics['predictions'])
print(f'Our max positive score is {pos_score}')

In [None]:
# Index content by its id so that build_training_set can look rows up by content id.
# Guarded so that re-running the cell is a no-op instead of raising KeyError once
# 'id' has already become the index.
if 'id' in content.columns:
    content = content.set_index('id')

In [None]:

gc.collect()
# `content` must already be indexed by 'id' at this point (done in the cell above).

# Build training set
train = build_training_set(topics, content, CFG)
print(f'Our training set has {len(train)} rows')
# The train set is written to disk in the next cell.


In [None]:

# Save the train set to disk; the file name carries CFG.max_len.
train.to_csv(f'/mnt/hdd1/wangjingqi/dataset/lecr/train_{CFG.max_len}.csv', index = False)
