In [2]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
%env TOKENIZERS_PARALLELISM=false
# =========================================================================================
# Data Loading
# =========================================================================================
import re
import string

def read_data(cfg):
    """Load the topics, content and correlations CSVs and build the text columns.

    Args:
        cfg: Settings object. ``cfg.uns_key`` names the text column whose string
            length is used to sort both frames. An optional ``cfg.data_dir``
            overrides the directory holding ``content.csv``, ``topics.csv`` and
            ``correlations.csv``; when absent the original hard-coded location
            is used.

    Returns:
        Tuple ``(topics, content, correlations)``. ``topics`` and ``content`` are
        sorted by the length of ``cfg.uns_key``, carry the derived ``Ti``,
        ``TiDe`` and ``TiDeTe`` columns, have the raw/metadata columns dropped
        and a fresh 0..n-1 index. ``correlations`` is returned as read.
    """
    # Default keeps the original location; set cfg.data_dir to read from elsewhere.
    data_dir = getattr(cfg, "data_dir", "/mnt/hdd1/wangjingqi/dataset/lecr")
    content = pd.read_csv(os.path.join(data_dir, "content.csv"))
    topics = pd.read_csv(os.path.join(data_dir, "topics.csv"))
    correlations = pd.read_csv(os.path.join(data_dir, "correlations.csv"))

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary selection and does not update
    # the parent frame once pandas Copy-on-Write is active.
    for col in ("title", "description"):
        topics[col] = topics[col].fillna("")
    for col in ("title", "description", "text"):
        content[col] = content[col].fillna("")

    topics["Ti"] = topics["title"]
    content["Ti"] = content["title"]

    topics["TiDe"] = topics["title"] + " " + topics["description"]
    content["TiDe"] = content["title"] + " " + content["description"]

    # No text column is used for topics, so their TiDeTe is built like TiDe.
    topics["TiDeTe"] = topics["title"] + " " + topics["description"]
    content["TiDeTe"] = content["title"] + " " + content["description"] + " " + content["text"]

    # Sort by character length of the selected text column.
    topics["length"] = topics[cfg.uns_key].apply(len)
    content["length"] = content[cfg.uns_key].apply(len)

    topics = (
        topics.sort_values("length")
        .drop(["title", "description", "channel", "category", "level", "has_content", "length"], axis=1)
        .reset_index(drop=True)
    )
    content = (
        content.sort_values("length")
        .drop(["title", "description", "kind", "text", "copyright_holder", "license", "length"], axis=1)
        .reset_index(drop=True)
    )

    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return topics, content, correlations

class LECRDataset(torch.utils.data.Dataset):
    """Dataset that serves the values of one DataFrame column, one item per row."""

    def __init__(self, df, key):
        # Only the raw values of the selected column are kept.
        self.inputs = df[key].values

    def __len__(self):
        """Number of rows in the selected column."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the stored value at position ``idx``."""
        return self.inputs[idx]

class collator():
    """Callable collate function that tokenizes a batch of raw strings."""

    def __init__(self, pretrained_path, max_len=None) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
        self.max_len = max_len

    def tokenize(self, texts):
        """Run the tokenizer on ``texts``: pad to the longest item, truncate to ``max_len``."""
        tokenizer_options = dict(
            padding='longest',
            max_length=self.max_len,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
        )
        return self.tokenizer(texts, **tokenizer_options)

    def __call__(self, data):
        """Tokenize one batch; ``data`` is materialized as a list first."""
        return self.tokenize(list(data))
# =========================================================================================
# Unsupervised model
# =========================================================================================
class MeanPooling(nn.Module):
    """Mask-weighted mean of hidden states over dimension 1."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, hidden_state, attention_mask):
        """Average ``hidden_state`` over dim 1, counting only positions where the mask is set.

        Args:
            hidden_state: Tensor to pool; presumably (batch, seq, hidden) — confirm with caller.
            attention_mask: Mask with one fewer dimension than ``hidden_state``; it
                is expanded along a new trailing dimension to ``hidden_state``'s shape.

        Returns:
            Tensor with dimension 1 reduced away. Rows whose mask is all zero come
            out as zeros rather than NaN because the divisor is clamped.
        """
        # Cast the (typically integer) mask to the hidden-state dtype so the clamp
        # below runs on a floating tensor and does not depend on how torch.clamp
        # promotes an integer tensor against a float bound.
        mask = attention_mask.unsqueeze(-1).expand(hidden_state.size()).to(hidden_state.dtype)
        summed = torch.sum(hidden_state * mask, 1)
        # Guard against division by zero for fully masked rows.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

class UNSModel(nn.Module):
    """Pretrained encoder followed by ``MeanPooling`` over its last hidden state."""

    def __init__(self, pretrained_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(pretrained_path)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Encode ``inputs`` and pool the last hidden state with its attention mask."""
        encoded = self.model(**inputs)
        return self.pool(encoded.last_hidden_state, inputs['attention_mask'])

    def forward(self, inputs):
        """Same as :meth:`feature`."""
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    Each batch is a mapping whose values are moved to ``device`` in place before
    the forward pass. Outputs are copied to the CPU, converted to NumPy and
    concatenated along the first axis.
    """
    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(loader):
            for name in list(batch.keys()):
                batch[name] = batch[name].to(device)
            batch_outputs.append(model(batch).to('cpu').numpy())
    return np.concatenate(batch_outputs)

# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true, y_pred):
    """Mean, over rows, of the fraction of true ids that also appear in the prediction.

    Both arguments are Series of whitespace-separated id strings. The result is
    rounded to 5 decimals.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    hit_fractions = np.array([
        len(truth & guess) / len(truth)
        for truth, guess in zip(true_sets, pred_sets)
    ])
    return round(np.mean(hit_fractions), 5)

# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg):
    """Expand every (topic, predicted content) pair into one labelled row.

    Args:
        topics: DataFrame with columns ``id``, ``language``, ``predictions``,
            ``content_ids`` and the column named by ``cfg.sup_key``.
            ``predictions`` and ``content_ids`` are space-separated content ids.
        content: DataFrame with ``language`` and ``cfg.sup_key`` columns, either
            already indexed by content id or still carrying an ``id`` column
            (it is then indexed locally; the caller's frame is left untouched).
        cfg: Settings object; ``cfg.sup_key`` selects the text column copied
            into ``input1`` / ``input2``.

    Returns:
        DataFrame with columns ``topics_ids``, ``content_ids``, ``input1``,
        ``input2``, ``target``, ``topic_language``, ``content_language``.
        ``target`` is 1 when the predicted id is listed in the topic's
        ``content_ids`` and 0 otherwise.
    """
    input_key = cfg.sup_key
    # Do not rely on an earlier notebook cell having called set_index('id'):
    # index a slim copy here when 'id' is still a regular column.
    if 'id' in content.columns:
        content = content[['id', 'language', input_key]].set_index('id')

    topics_ids = []
    content_ids = []
    input1 = []
    input2 = []
    targets = []
    topics_languages = []
    content_languages = []
    # Iterate over each topic
    for k in tqdm(range(len(topics))):
        row = topics.iloc[k]
        topics_id = row['id']
        topics_input = row[input_key]
        topics_language = row['language']
        predictions = row['predictions'].split(' ')
        # A set makes the per-prediction membership test constant time.
        ground_truth = set(row['content_ids'].split(' '))
        for pred in predictions:
            content_language = content.loc[pred, 'language']
            content_input = content.loc[pred, input_key]
            topics_ids.append(topics_id)
            content_ids.append(pred)
            input1.append(topics_input)
            input2.append(content_input)
            topics_languages.append(topics_language)
            content_languages.append(content_language)
            # 1 if the candidate is a true match for this topic, else 0
            targets.append(1 if pred in ground_truth else 0)
    # Build training dataset
    train = pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'input1': input1,
         'input2': input2,
         'target': targets,
         'topic_language': topics_languages,
         'content_language': content_languages, }
    )
    # Release the intermediate lists
    del topics_ids, content_ids, input1, input2, targets, topics_languages, content_languages
    gc.collect()
    return train
    
# =========================================================================================
# Get neighbors
# =========================================================================================
def get_neighbors(topics, content, cfg):
    """Embed topics and content, then attach each topic's nearest content ids.

    Args:
        topics: DataFrame holding the text column named by ``cfg.uns_key``.
            A ``predictions`` column is added to it in place.
        content: DataFrame holding ``cfg.uns_key`` and an ``id`` column.
        cfg: Settings object providing ``uns_key``, ``model_name``, ``max_len``,
            ``bs``, ``nw``, ``device`` and ``top_n``.

    Returns:
        ``(topics, content)``; ``topics['predictions']`` holds, per topic, the
        space-separated ids of its ``cfg.top_n`` nearest content rows by cosine
        distance between embeddings.
    """
    topics_dataset = LECRDataset(topics, cfg.uns_key)
    content_dataset = LECRDataset(content, cfg.uns_key)

    collate_fn = collator(cfg.model_name, cfg.max_len)
    # Both loaders share the same settings; order is preserved (shuffle=False).
    loader_kwargs = dict(batch_size=cfg.bs, shuffle=False, num_workers=cfg.nw,
                         pin_memory=True, collate_fn=collate_fn, drop_last=False)
    topics_loader = DataLoader(topics_dataset, **loader_kwargs)
    content_loader = DataLoader(content_dataset, **loader_kwargs)

    # Unsupervised encoder used only to extract embeddings
    model = UNSModel(cfg.model_name)
    model.to(cfg.device)
    model.float()

    topics_preds = get_embeddings(topics_loader, model, cfg.device)
    del topics_loader
    gc.collect()
    content_preds = get_embeddings(content_loader, model, cfg.device)

    # The encoder is not needed past this point. Drop it before empty_cache so
    # its memory is unoccupied and can be handed back ahead of the KNN step.
    del model, topics_dataset, content_dataset, content_loader
    gc.collect()
    torch.cuda.empty_cache()

    # Transfer embeddings to the GPU for cuML
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    del topics_preds, content_preds
    gc.collect()

    # KNN model
    print(' ')
    print('Training KNN model...')
    neighbors_model = NearestNeighbors(n_neighbors = cfg.top_n, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    distances, indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = True)
    distances = distances.get()
    indices = indices.get()

    # kneighbors returns row positions in the fitted matrix, so map them to ids
    # positionally instead of through label-based content.loc, which is only
    # equivalent when content has a default 0..n-1 index.
    content_id_by_pos = content['id'].to_numpy()
    # NOTE(review): cosine distance should not exceed 2, so this threshold looks
    # like it never filters anything; kept to preserve the original logic.
    max_distance = 1000
    predictions = []
    for neighbor_pos, neighbor_dist in zip(indices, distances):
        kept = neighbor_pos[neighbor_dist < max_distance]
        if len(kept) == 0:
            # Fall back to all neighbours when the threshold removed every one.
            kept = neighbor_pos
        predictions.append(' '.join(content_id_by_pos[kept]))
    topics['predictions'] = predictions

    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, predictions, indices
    gc.collect()
    return topics, content

# Read data
# topics, content, correlations = read_data(CFG)
# # Run nearest neighbors
# topics, content = get_neighbors(topics, content, CFG)
# # Merge with target and compute max positive score
# topics = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
# pos_score = get_pos_score(topics['content_ids'], topics['predictions'])
# print(f'Our max positive score is {pos_score}')
# # We can delete correlations
# del correlations
# gc.collect()
# # Set id as index for content
# content.set_index('id', inplace = True)
# # Build training set
# train = build_training_set(topics, content, CFG)
# print(f'Our training set has {len(train)} rows')
# # Save train set to disk to train on another notebook
# train.to_csv('train.csv', index = False)
# train.head()
def test(CFG,models):
    """Score several encoder checkpoints by retrieval coverage.

    For each path in ``models`` this sets ``CFG.model_name``, reloads the data,
    runs ``get_neighbors``, joins with the correlations and prints the positive
    score. The collected scores, keyed by the last path component, are printed
    at the end. ``CFG.model_name`` is left at the last model evaluated.
    """
    scores = {}
    for model_path in models:
        model_tag = model_path.split('/')[-1]
        CFG.model_name = model_path
        print(f'Running model {model_tag}')
        topics, content, correlations = read_data(CFG)
        topics, content = get_neighbors(topics, content, CFG)
        merged = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
        scores[model_tag] = get_pos_score(merged['content_ids'], merged['predictions'])
        print(f'{model_tag} max positive score is {scores[model_tag]}')
    print(scores)

env: TOKENIZERS_PARALLELISM=false


In [6]:
# Load the tokenizer stored with the fine-tuned checkpoint directory.
m=AutoTokenizer.from_pretrained('/mnt/hdd1/wangjingqi/ck/lecr/ft/pmmb2_TiDeTe/26250')

In [7]:
# Load the stock xlm-roberta-base tokenizer for comparison with `m`.
m1=AutoTokenizer.from_pretrained('xlm-roberta-base')

In [9]:
# Tokenize the same string with both tokenizers; the recorded output shows
# identical input_ids and attention_mask for the two.
m('hell fgd safawerawer o'),m1('hell fgd safawerawer o')

({'input_ids': [0, 33600, 1238, 177, 71, 57, 1021, 6488, 11, 6488, 36, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 33600, 1238, 177, 71, 57, 1021, 6488, 11, 6488, 36, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [5]:
# NOTE(review): this cell's execution count (In [5]) precedes the cells that now
# bind m and m1 to tokenizers; the recorded "All keys matched" output suggests
# m and m1 were model objects when it ran — confirm before re-running.
m1.load_state_dict(m.state_dict())

<All keys matched successfully>

In [3]:
# Load the training checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
ck = torch.load('/mnt/hdd1/wangjingqi/ck/lecr/xlm-roberta-base_train_50_pmmb2_TiDeTe_26250_TiDeTe_TiDeTe_split32_54_super/xlm-roberta-base_train_50_pmmb2_TiDeTe_26250_TiDeTe_TiDeTe_split32_54_super_0.ckpt',map_location='cpu')

In [8]:
# Inspect the checkpoint's top-level entries; the weights sit under 'state_dict'.
ck.keys()

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops'])

In [7]:
# NOTE(review): the recorded error lists ck's own top-level keys ("epoch",
# "global_step", ...) as unexpected, which matches passing `ck` itself rather
# than ck['state_dict'] — the output looks stale relative to this line.
# NOTE(review): `m` is bound to a tokenizer in the cells above; confirm it is a
# model before calling load_state_dict.
m.load_state_dict(ck['state_dict'])

RuntimeError: Error(s) in loading state_dict for XLMRobertaModel:
	Missing key(s) in state_dict: "embeddings.position_ids", "embeddings.word_embeddings.weight", "embeddings.position_embeddings.weight", "embeddings.token_type_embeddings.weight", "embeddings.LayerNorm.weight", "embeddings.LayerNorm.bias", "encoder.layer.0.attention.self.query.weight", "encoder.layer.0.attention.self.query.bias", "encoder.layer.0.attention.self.key.weight", "encoder.layer.0.attention.self.key.bias", "encoder.layer.0.attention.self.value.weight", "encoder.layer.0.attention.self.value.bias", "encoder.layer.0.attention.output.dense.weight", "encoder.layer.0.attention.output.dense.bias", "encoder.layer.0.attention.output.LayerNorm.weight", "encoder.layer.0.attention.output.LayerNorm.bias", "encoder.layer.0.intermediate.dense.weight", "encoder.layer.0.intermediate.dense.bias", "encoder.layer.0.output.dense.weight", "encoder.layer.0.output.dense.bias", "encoder.layer.0.output.LayerNorm.weight", "encoder.layer.0.output.LayerNorm.bias", "encoder.layer.1.attention.self.query.weight", "encoder.layer.1.attention.self.query.bias", "encoder.layer.1.attention.self.key.weight", "encoder.layer.1.attention.self.key.bias", "encoder.layer.1.attention.self.value.weight", "encoder.layer.1.attention.self.value.bias", "encoder.layer.1.attention.output.dense.weight", "encoder.layer.1.attention.output.dense.bias", "encoder.layer.1.attention.output.LayerNorm.weight", "encoder.layer.1.attention.output.LayerNorm.bias", "encoder.layer.1.intermediate.dense.weight", "encoder.layer.1.intermediate.dense.bias", "encoder.layer.1.output.dense.weight", "encoder.layer.1.output.dense.bias", "encoder.layer.1.output.LayerNorm.weight", "encoder.layer.1.output.LayerNorm.bias", "encoder.layer.2.attention.self.query.weight", "encoder.layer.2.attention.self.query.bias", "encoder.layer.2.attention.self.key.weight", "encoder.layer.2.attention.self.key.bias", "encoder.layer.2.attention.self.value.weight", "encoder.layer.2.attention.self.value.bias", "encoder.layer.2.attention.output.dense.weight", 
"encoder.layer.2.attention.output.dense.bias", "encoder.layer.2.attention.output.LayerNorm.weight", "encoder.layer.2.attention.output.LayerNorm.bias", "encoder.layer.2.intermediate.dense.weight", "encoder.layer.2.intermediate.dense.bias", "encoder.layer.2.output.dense.weight", "encoder.layer.2.output.dense.bias", "encoder.layer.2.output.LayerNorm.weight", "encoder.layer.2.output.LayerNorm.bias", "encoder.layer.3.attention.self.query.weight", "encoder.layer.3.attention.self.query.bias", "encoder.layer.3.attention.self.key.weight", "encoder.layer.3.attention.self.key.bias", "encoder.layer.3.attention.self.value.weight", "encoder.layer.3.attention.self.value.bias", "encoder.layer.3.attention.output.dense.weight", "encoder.layer.3.attention.output.dense.bias", "encoder.layer.3.attention.output.LayerNorm.weight", "encoder.layer.3.attention.output.LayerNorm.bias", "encoder.layer.3.intermediate.dense.weight", "encoder.layer.3.intermediate.dense.bias", "encoder.layer.3.output.dense.weight", "encoder.layer.3.output.dense.bias", "encoder.layer.3.output.LayerNorm.weight", "encoder.layer.3.output.LayerNorm.bias", "encoder.layer.4.attention.self.query.weight", "encoder.layer.4.attention.self.query.bias", "encoder.layer.4.attention.self.key.weight", "encoder.layer.4.attention.self.key.bias", "encoder.layer.4.attention.self.value.weight", "encoder.layer.4.attention.self.value.bias", "encoder.layer.4.attention.output.dense.weight", "encoder.layer.4.attention.output.dense.bias", "encoder.layer.4.attention.output.LayerNorm.weight", "encoder.layer.4.attention.output.LayerNorm.bias", "encoder.layer.4.intermediate.dense.weight", "encoder.layer.4.intermediate.dense.bias", "encoder.layer.4.output.dense.weight", "encoder.layer.4.output.dense.bias", "encoder.layer.4.output.LayerNorm.weight", "encoder.layer.4.output.LayerNorm.bias", "encoder.layer.5.attention.self.query.weight", "encoder.layer.5.attention.self.query.bias", "encoder.layer.5.attention.self.key.weight", 
"encoder.layer.5.attention.self.key.bias", "encoder.layer.5.attention.self.value.weight", "encoder.layer.5.attention.self.value.bias", "encoder.layer.5.attention.output.dense.weight", "encoder.layer.5.attention.output.dense.bias", "encoder.layer.5.attention.output.LayerNorm.weight", "encoder.layer.5.attention.output.LayerNorm.bias", "encoder.layer.5.intermediate.dense.weight", "encoder.layer.5.intermediate.dense.bias", "encoder.layer.5.output.dense.weight", "encoder.layer.5.output.dense.bias", "encoder.layer.5.output.LayerNorm.weight", "encoder.layer.5.output.LayerNorm.bias", "encoder.layer.6.attention.self.query.weight", "encoder.layer.6.attention.self.query.bias", "encoder.layer.6.attention.self.key.weight", "encoder.layer.6.attention.self.key.bias", "encoder.layer.6.attention.self.value.weight", "encoder.layer.6.attention.self.value.bias", "encoder.layer.6.attention.output.dense.weight", "encoder.layer.6.attention.output.dense.bias", "encoder.layer.6.attention.output.LayerNorm.weight", "encoder.layer.6.attention.output.LayerNorm.bias", "encoder.layer.6.intermediate.dense.weight", "encoder.layer.6.intermediate.dense.bias", "encoder.layer.6.output.dense.weight", "encoder.layer.6.output.dense.bias", "encoder.layer.6.output.LayerNorm.weight", "encoder.layer.6.output.LayerNorm.bias", "encoder.layer.7.attention.self.query.weight", "encoder.layer.7.attention.self.query.bias", "encoder.layer.7.attention.self.key.weight", "encoder.layer.7.attention.self.key.bias", "encoder.layer.7.attention.self.value.weight", "encoder.layer.7.attention.self.value.bias", "encoder.layer.7.attention.output.dense.weight", "encoder.layer.7.attention.output.dense.bias", "encoder.layer.7.attention.output.LayerNorm.weight", "encoder.layer.7.attention.output.LayerNorm.bias", "encoder.layer.7.intermediate.dense.weight", "encoder.layer.7.intermediate.dense.bias", "encoder.layer.7.output.dense.weight", "encoder.layer.7.output.dense.bias", "encoder.layer.7.output.LayerNorm.weight", 
"encoder.layer.7.output.LayerNorm.bias", "encoder.layer.8.attention.self.query.weight", "encoder.layer.8.attention.self.query.bias", "encoder.layer.8.attention.self.key.weight", "encoder.layer.8.attention.self.key.bias", "encoder.layer.8.attention.self.value.weight", "encoder.layer.8.attention.self.value.bias", "encoder.layer.8.attention.output.dense.weight", "encoder.layer.8.attention.output.dense.bias", "encoder.layer.8.attention.output.LayerNorm.weight", "encoder.layer.8.attention.output.LayerNorm.bias", "encoder.layer.8.intermediate.dense.weight", "encoder.layer.8.intermediate.dense.bias", "encoder.layer.8.output.dense.weight", "encoder.layer.8.output.dense.bias", "encoder.layer.8.output.LayerNorm.weight", "encoder.layer.8.output.LayerNorm.bias", "encoder.layer.9.attention.self.query.weight", "encoder.layer.9.attention.self.query.bias", "encoder.layer.9.attention.self.key.weight", "encoder.layer.9.attention.self.key.bias", "encoder.layer.9.attention.self.value.weight", "encoder.layer.9.attention.self.value.bias", "encoder.layer.9.attention.output.dense.weight", "encoder.layer.9.attention.output.dense.bias", "encoder.layer.9.attention.output.LayerNorm.weight", "encoder.layer.9.attention.output.LayerNorm.bias", "encoder.layer.9.intermediate.dense.weight", "encoder.layer.9.intermediate.dense.bias", "encoder.layer.9.output.dense.weight", "encoder.layer.9.output.dense.bias", "encoder.layer.9.output.LayerNorm.weight", "encoder.layer.9.output.LayerNorm.bias", "encoder.layer.10.attention.self.query.weight", "encoder.layer.10.attention.self.query.bias", "encoder.layer.10.attention.self.key.weight", "encoder.layer.10.attention.self.key.bias", "encoder.layer.10.attention.self.value.weight", "encoder.layer.10.attention.self.value.bias", "encoder.layer.10.attention.output.dense.weight", "encoder.layer.10.attention.output.dense.bias", "encoder.layer.10.attention.output.LayerNorm.weight", "encoder.layer.10.attention.output.LayerNorm.bias", 
"encoder.layer.10.intermediate.dense.weight", "encoder.layer.10.intermediate.dense.bias", "encoder.layer.10.output.dense.weight", "encoder.layer.10.output.dense.bias", "encoder.layer.10.output.LayerNorm.weight", "encoder.layer.10.output.LayerNorm.bias", "encoder.layer.11.attention.self.query.weight", "encoder.layer.11.attention.self.query.bias", "encoder.layer.11.attention.self.key.weight", "encoder.layer.11.attention.self.key.bias", "encoder.layer.11.attention.self.value.weight", "encoder.layer.11.attention.self.value.bias", "encoder.layer.11.attention.output.dense.weight", "encoder.layer.11.attention.output.dense.bias", "encoder.layer.11.attention.output.LayerNorm.weight", "encoder.layer.11.attention.output.LayerNorm.bias", "encoder.layer.11.intermediate.dense.weight", "encoder.layer.11.intermediate.dense.bias", "encoder.layer.11.output.dense.weight", "encoder.layer.11.output.dense.bias", "encoder.layer.11.output.LayerNorm.weight", "encoder.layer.11.output.LayerNorm.bias", "pooler.dense.weight", "pooler.dense.bias". 
	Unexpected key(s) in state_dict: "epoch", "global_step", "pytorch-lightning_version", "state_dict", "loops". 

In [5]:
class CFG:
    """Notebook settings, read as class attributes by the pipeline functions."""

    # --- text columns ---
    uns_key = "TiDeTe"  # column embedded for neighbour retrieval and used for length sorting
    sup_key = "TiDe"    # column copied into input1/input2 of the training set

    # --- encoder / tokenizer ---
    model_name = "/mnt/hdd1/wangjingqi/ck/lecr/ft/pmmb2_TiDeTe/26250"
    max_len = 50        # tokenizer max_length (inputs are truncated to it)

    # --- data loading / hardware ---
    bs = 256            # DataLoader batch size
    nw = 4              # DataLoader worker count
    device = 0          # passed to .to(...) for the model and batches

    # --- retrieval ---
    top_n = 50          # neighbours requested per topic

    seed = 42           # not read by the code visible in this notebook

In [6]:
# Load and preprocess the three CSVs; both frames come back sorted by text length.
topics, content, correlations = read_data(CFG)

 
--------------------------------------------------
topics.shape: (76972, 6)
content.shape: (154047, 5)
correlations.shape: (61517, 2)


In [7]:
# Embed topics and content with CFG.model_name and add a 'predictions' column
# (space-separated ids of the CFG.top_n nearest content rows) to topics.
topics, content = get_neighbors(topics, content, CFG)

100%|██████████| 301/301 [01:45<00:00,  2.85it/s]
100%|██████████| 602/602 [07:03<00:00,  1.42it/s]


 
Training KNN model...


In [8]:
# Preview the first rows, including the new 'predictions' column.
topics.head()

Unnamed: 0,id,language,parent,Ti,TiDe,TiDeTe,predictions
0,t_b908fd457c9b,es,t_5780107c8277,,,,c_a4ab048221a9 c_05b1c711b712 c_a3959a8d38c6 c...
1,t_975bc0c269f5,en,t_11e7bc1103df,9,9,9,c_808b3d694d7e c_f015cfe3c1c5 c_3a093d5f3553 c...
2,t_9d031273c9c4,zh,t_bd070b953e5c,税,税,税,c_3fd3364c7b0a c_a5f02f4f09a2 c_cef911d05090 c...
3,t_26915e343b70,zh,t_6fea62eb7a7e,简介,简介,简介,c_25f168ffb66a c_a3b815330028 c_f2c99f213884 c...
4,t_e105123ddb73,zh,t_4d762b5d1165,艺术,艺术,艺术,c_9af796d59f0a c_fa65796bfbb9 c_8fa78ce3962c c...


In [9]:
# Keep only topics that have ground-truth correlations and attach their content_ids.
# NOTE(review): this rebinds `topics`, so running the cell twice merges the
# already-merged frame again; re-run read_data/get_neighbors first if needed.
topics = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
topics.head()

Unnamed: 0,id,language,parent,Ti,TiDe,TiDeTe,predictions,topic_id,content_ids
0,t_26915e343b70,zh,t_6fea62eb7a7e,简介,简介,简介,c_25f168ffb66a c_a3b815330028 c_f2c99f213884 c...,t_26915e343b70,c_03697937b392 c_b380020ac642
1,t_909b58c6d293,zh,t_02402e8102b8,投篮,投篮,投篮,c_0c1899fb281f c_980187e51826 c_60295d0182cd c...,t_909b58c6d293,c_0c1899fb281f c_60295d0182cd c_683d0adcdb39 c...
2,t_ab4c1b92c735,zh,t_02402e8102b8,防守,防守,防守,c_9f66c1641b50 c_10785744108b c_417058c2b39e c...,t_ab4c1b92c735,c_10785744108b c_2d71d8a7adcc c_417058c2b39e c...
3,t_3f13d204a214,zh,t_2d2fafe51d9d,挥杆,挥杆,挥杆,c_8d537721a710 c_2b6f7b40b473 c_5b451d883626 c...,t_3f13d204a214,c_209cf4acb7f9 c_24c7ef246641 c_2b6f7b40b473 c...
4,t_527deeb1e651,zh,t_696a0f81e676,访谈,访谈,访谈,c_7a04cf6c64b8 c_3278e8557417 c_64ccf43280e0 c...,t_527deeb1e651,c_3278e8557417 c_7a04cf6c64b8


In [10]:
# Mean fraction of each topic's true content ids found among its retrieved
# candidates, i.e. the best recall a downstream model can reach on them.
pos_score = get_pos_score(topics['content_ids'], topics['predictions'])
print(f'Our max positive score is {pos_score}')#(0.94418)

Our max positive score is 0.94418


In [11]:
# Index content by id so build_training_set can look rows up by content id.
# Guarded so re-running the cell is a no-op instead of raising because 'id'
# is no longer a column after the first run.
if 'id' in content.columns:
    content.set_index('id', inplace = True)

In [12]:

gc.collect()
# content must already be indexed by id at this point (done in the previous cell).
# Build the (topic, candidate) training pairs and report the share of positive labels.
train = build_training_set(topics, content, CFG)
print(f'Our training set has {len(train)} rows',train.target.sum()/len(train))
# The training set is written to disk in the next cell.
# NOTE(review): the figures below are unlabeled scratch values; their meaning is not recorded here.
#0.82,0.310,0.92365,0.24

100%|██████████| 61517/61517 [00:53<00:00, 1146.58it/s]


Our training set has 3075850 rows 0.08459417721930523


In [13]:
# Tag built from the last two path components of the checkpoint path.
model_name = "_".join(CFG.model_name.split('/')[-2:])

# File name encodes top_n, model tag and both text keys, with a '_super' suffix.
train.to_csv(f'/mnt/hdd1/wangjingqi/dataset/lecr/train_{CFG.top_n}_{model_name}_{CFG.uns_key}_{CFG.sup_key}_super.csv', index = False)


In [11]:
# NOTE(review): pandas is already imported in the first cell; this cell's
# execution count (In [11]) is out of order with the cells above.
# NOTE(review): this reads the file WITHOUT the '_super' suffix, i.e. not the
# file written by the previous cell, and its row count differs from the
# training set built above — confirm which file is intended.
import pandas as pd
pd.read_csv("/mnt/hdd1/wangjingqi/dataset/lecr/train_50_pmmb2_TiDeTe_26250_TiDeTe_TiDe.csv").shape

(3095572, 7)