In [1]:
import os
import pandas as pd

import warnings
# Silence every Python warning for the rest of the session. This keeps the
# output compact but also hides deprecation warnings from the libraries used.
warnings.filterwarnings("ignore")

# List every file found under the Kaggle input directory so the available
# datasets and model artifacts show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# Directory of the `crab-age-fine-tuned-gpt2` input (config.json and model.pt).
# The trailing slash matters: a later cell builds the checkpoint path by plain
# string concatenation (model_path+'model.pt').
model_path = '/kaggle/input/crab-age-fine-tuned-gpt2/'

/kaggle/input/playground-series-s3e16/sample_submission.csv
/kaggle/input/playground-series-s3e16/train.csv
/kaggle/input/playground-series-s3e16/test.csv
/kaggle/input/crab-age-fine-tuned-gpt2/config.json
/kaggle/input/crab-age-fine-tuned-gpt2/model.pt


In [2]:
!pip install transformers==4.26.1 -q # avoiding a dependency issue

[0m

In [6]:
# Load the competition training set (playground-series-s3e16) into a DataFrame.
train_data = pd.read_csv('/kaggle/input/playground-series-s3e16/train.csv')

In [7]:
# Preview the first rows to check the column layout (features plus 'Age').
train_data.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [3]:
def row2str(row):
    """Render one table row as a comma-separated "<column> is <value>" sentence.

    Args:
        row: A mapping-like row (e.g. a pandas Series) exposing its column
            labels through ``row.index`` and its values through ``row[label]``.

    Returns:
        str: The "<column> is <value>" fragments joined by ", ", in the order
        of ``row.index``. The 'id' column is left out of the text.
    """
    fragments = []
    for column in row.index:
        # The row identifier is not a feature, so it is not serialized.
        if column == 'id':
            continue
        fragments.append(f"{column} is {row[column]}")
    return ", ".join(fragments)

In [4]:
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` (an `apply` with a
# progress bar) is available on the Series/DataFrames used below.
tqdm.pandas()

In [7]:
# Work on a copy so train_data stays untouched, then split off the target:
# pop() removes the 'Age' column from X_train and returns it as y_train.
X_train = train_data.copy()
y_train = X_train.pop('Age')

In [8]:
# Serialize each row into a text prompt that ends with ", Age is", so that the
# language model's next token after the prompt is its guess for the age.
X_train['prompt'] = X_train.progress_apply(lambda x: row2str(x)+", Age is", axis=1)

100%|██████████| 74051/74051 [00:05<00:00, 13644.80it/s]


In [9]:
# Keep the first row's prompt as an example and display it.
prompt = X_train.iloc[0].prompt
prompt

'Sex is I, Length is 1.525, Diameter is 1.175, Height is 0.375, Weight is 28.973189, Shucked Weight is 12.7289255, Viscera Weight is 6.64795775, Shell Weight is 8.34892775, Age is'

In [10]:
# Evaluate on a 2,000-row slice (positions 1000..2999) instead of the full
# training set; features and targets use the same slice so they stay aligned.
X_sample = X_train[1000:3000]
y_sample = y_train[1000:3000]

In [5]:
def calc_mae(y_true, y_predict):
    """Mean absolute error between two equally long sequences.

    Args:
        y_true: Iterable of true values (must support ``len``).
        y_predict: Iterable of predicted values, same length as ``y_true``.

    Returns:
        The sum of ``|true - predicted|`` over paired elements divided by the
        number of elements.

    Raises:
        AssertionError: If the two inputs differ in length.
        ZeroDivisionError: If the inputs are empty.
    """
    n_items = len(y_true)
    assert n_items==len(y_predict)

    total_error = 0
    for actual, predicted in zip(y_true, y_predict):
        total_error += abs(actual-predicted)
    return total_error/n_items

In [16]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch.nn.functional as F

# Load the stock GPT-2 tokenizer (only the tokenizer is loaded in this cell;
# the model weights are loaded separately).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [18]:
# Start from the stock GPT-2 architecture, then overwrite its weights with the
# fine-tuned checkpoint stored in model_path.
model = GPT2LMHeadModel.from_pretrained('gpt2')
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only kernel; age_probs feeds this model CPU tensors, so CPU is the
# device it is used on.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
# Kept as the cell's last expression so the key-matching report is displayed.
model.load_state_dict(torch.load(model_path+'model.pt', map_location='cpu'))

<All keys matched successfully>

In [6]:
def age_probs(prompt, temperature=0.7, k=29):
    """Return the model's next-token probabilities for integer ages.

    The prompt is run through the global ``model`` (tokenized with the global
    ``tokenizer``); the logits of the token following the prompt are scaled by
    ``temperature`` and soft-maxed, and the ``k`` most likely tokens are kept.
    Tokens whose text is not a plain decimal integer are discarded.

    Args:
        prompt (str): Text ending right before the age, e.g. "..., Age is".
        temperature (float): Divisor applied to the logits before the softmax.
        k (int): Number of most likely next tokens to inspect.

    Returns:
        dict: ``{age (int): probability (float)}`` ordered by ascending age.
        The probabilities generally sum to less than 1 because non-numeric
        tokens and tokens outside the top ``k`` are dropped. If several tokens
        decode to the same integer, their probabilities are added together.
    """
    # Encode the prompt using the tokenizer
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')

    # Inference only: disable autograd so no graph is built for each prompt.
    with torch.no_grad():
        # Generate the next token probabilities using the model
        output = model(encoded_prompt)
        next_token_logits = output[0][:, -1, :] / temperature
        next_token_probabilities = F.softmax(next_token_logits, dim=-1)

        # Get the top-k token possibilities and their probabilities
        top_next_token_probabilities, top_next_token_indices = torch.topk(next_token_probabilities, k, dim=-1)

    probs_by_age = {}
    for token_id, token_prob in zip(top_next_token_indices[0], top_next_token_probabilities[0]):
        token_text = tokenizer.decode(token_id).strip()
        # isdecimal() rather than isnumeric(): isnumeric() also accepts
        # characters such as '½' or '²', for which int() raises ValueError.
        if not token_text.isdecimal():
            continue
        age = int(token_text)
        # Different tokens can spell the same number (e.g. with and without a
        # leading space); accumulate their mass instead of keeping only one.
        probs_by_age[age] = probs_by_age.get(age, 0.0) + float(token_prob)
    return dict(sorted(probs_by_age.items()))


In [40]:
# Score every sampled prompt: one {age: probability} dict per row. This is the
# slow step, since each prompt is a separate forward pass through the model.
sample_probs = X_sample['prompt'].progress_apply(age_probs)

100%|██████████| 2000/2000 [08:21<00:00,  3.99it/s]


In [7]:
import numpy as np

def calculate_median(prob_dict):
    """
    Calculate the median of a discrete probability distribution.

    Args:
        prob_dict (dict): A dictionary where the keys are the values and the
            values are the probability masses. The keys may come in any order
            and the masses do not have to sum to 1 (e.g. a truncated
            distribution); the median is taken relative to the total mass.

    Returns:
        The smallest key (as a numpy scalar) whose cumulative mass reaches
        half of the total mass.

    Raises:
        ValueError: If ``prob_dict`` is empty or its total mass is not positive.
    """
    if not prob_dict:
        raise ValueError("cannot compute the median of an empty distribution")

    # The cumulative sum is only meaningful over values in ascending order.
    ordered = sorted(prob_dict.items())
    values = np.array([value for value, _ in ordered])
    probs = np.array([prob for _, prob in ordered], dtype=float)

    # Compute the cumulative distribution function (CDF)
    cdf = np.cumsum(probs)
    total_mass = cdf[-1]
    if not total_mass > 0:
        raise ValueError("total probability mass must be positive")

    # Find the first index i where cdf[i] >= half of the total mass. Using the
    # total instead of a fixed 0.5 keeps i in range when the masses sum to
    # less than 0.5, where a fixed threshold would index past the end.
    i = np.searchsorted(cdf, total_mass / 2)

    return values[i]

In [42]:
# Inspect one predicted distribution: the entry with index label 1000.
sample_probs[1000]

{9: 5.7555629609851167e-05,
 10: 0.004945171531289816,
 11: 0.0188369732350111,
 12: 0.05568859726190567,
 13: 0.051542844623327255,
 14: 0.04076721519231796,
 15: 0.0053699808195233345,
 16: 0.1850092113018036,
 17: 0.23313423991203308,
 18: 0.12777920067310333,
 19: 0.04261331632733345,
 20: 0.1832588016986847,
 21: 0.04383772611618042,
 22: 0.0011465057032182813,
 23: 0.003474720288068056,
 24: 0.0012915021507069468,
 25: 0.0006016453262418509,
 26: 0.0002361457736697048,
 27: 0.00035198775003664196,
 28: 3.082212060689926e-05,
 29: 1.1608691238507163e-05,
 30: 1.0226673111901619e-05,
 31: 5.099257691654202e-07,
 32: 7.790997642587172e-07,
 33: 6.436925445996167e-07,
 34: 3.6775122680410277e-07,
 40: 2.370036611409887e-07,
 42: 2.3193425136014412e-07,
 49: 2.328810921881086e-07}

In [43]:
# Median age of that single example distribution.
calculate_median(sample_probs[1000])

17

In [44]:
# Point prediction per row: the median of its predicted age distribution.
sample_medians = sample_probs.progress_apply(calculate_median)

100%|██████████| 2000/2000 [00:00<00:00, 27493.45it/s]


In [45]:
# Mean absolute error of the median predictions against the true ages.
calc_mae(y_sample, sample_medians)

1.46

In [8]:
def expected_absolute_error(probs, func):
    """Probability-weighted absolute error of a point estimate.

    Args:
        probs (dict): Maps each possible value to its probability.
        func (callable): Takes ``probs`` and returns the point estimate to
            evaluate (for example a median function).

    Returns:
        The sum over all values of ``|estimate - value| * probability``.
        The probabilities are used as given; they are not normalized.
    """
    estimate = func(probs)

    weighted_error = 0
    for outcome, mass in probs.items():
        weighted_error += abs(estimate-outcome)*mass
    return weighted_error

In [49]:
# Expected absolute error of the median for the single example distribution.
expected_absolute_error(sample_probs[1000], calculate_median)

1.935641901064173

In [9]:
from functools import partial

In [53]:
# Average, over the sample, of the error the model itself expects when the
# median is used as the prediction. Unlike calc_mae this uses no true labels,
# only the predicted distributions.
sample_probs.progress_apply(partial(expected_absolute_error, func=calculate_median)).mean()

100%|██████████| 2000/2000 [00:00<00:00, 7491.53it/s]


0.9644302914638322

In [55]:
# Largest, across all rows, of each row's smallest retained probability.
# NOTE(review): presumably a check that the top-k cut-off in age_probs only
# drops negligible mass - confirm the intent.
sample_probs.apply(lambda x: min(x.values())).max()

0.0001329632941633463

In [56]:
# Alternative point prediction per row: the mode, i.e. the age key that
# carries the highest probability in that row's distribution.
sample_max = sample_probs.progress_apply(lambda probs: max(probs, key=probs.get))

100%|██████████| 2000/2000 [00:00<00:00, 118899.65it/s]


In [57]:
# Mean absolute error of the mode predictions, to compare with the median.
calc_mae(y_sample, sample_max)

1.537

In [10]:
!pip install be-great==0.0.3 -q

[0m

In [11]:
from be_great import GReaT

# Bind the GReaT wrapper to its own name. age_probs() reads the global `model`
# and expects the GPT2LMHeadModel loaded earlier; reusing the name `model`
# here would break every later age_probs call on a top-to-bottom run.
great_model = GReaT.load_from_dir(model_path)
synthetic_data = great_model.sample(
    n_samples=2_000,    # change this to generate more samples
    k=50,
    temperature=0.7,  # values between 0.5-0.9 generally give good results
    max_length=256,
    device="cuda")  # requires a GPU-enabled kernel
# Name the index 'id' so the saved CSV gets an 'id' header for its index column.
synthetic_data = synthetic_data.rename_axis('id')
synthetic_data.to_csv('synthetic_data.csv')
synthetic_data.head()

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

2049it [00:56, 36.23it/s]                          


Unnamed: 0_level_0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,1.4375,1.125,0.425,32.077459,14.840963,7.37087,8.193005,9.0
1,F,0.9625,0.7375,0.25,9.497082,4.252425,1.842718,2.466407,6.0
2,I,0.6875,0.55,0.1625,2.806601,1.048931,0.666213,0.850485,5.0
3,F,1.5625,1.225,0.5,37.137845,16.428535,8.77417,9.497082,11.0
4,I,1.025,0.8125,0.2125,8.5332,3.912231,1.956115,2.551455,8.0


In [12]:
# Same feature/target split as for the training data, applied to a copy of the
# generated rows: pop() removes 'Age' from X_syntetic and returns it.
X_syntetic = synthetic_data.copy()
y_syntetic = X_syntetic.pop('Age')

In [13]:
# Build the same "..., Age is" prompts for the synthetic rows.
X_syntetic['prompt'] = X_syntetic.progress_apply(lambda x: row2str(x)+", Age is", axis=1)

100%|██████████| 2000/2000 [00:00<00:00, 15955.96it/s]


In [14]:
# Show the first synthetic prompt to confirm it has the expected format.
X_syntetic.iloc[0]['prompt']

'Sex is F, Length is 1.4375, Diameter is 1.125, Height is 0.425, Weight is 32.07745925, Shucked Weight is 14.84096325, Viscera Weight is 7.37087, Shell Weight is 8.1930055, Age is'

In [19]:
# Score every synthetic prompt: one {age: probability} dict per row.
# age_probs uses the global `model`, which must be the fine-tuned
# GPT2LMHeadModel (not the GReaT wrapper) when this cell runs.
syntetic_probs = X_syntetic['prompt'].progress_apply(age_probs)

100%|██████████| 2000/2000 [08:44<00:00,  3.82it/s]


In [20]:
# Median of each synthetic row's predicted age distribution.
syntetic_medians = syntetic_probs.progress_apply(calculate_median)

100%|██████████| 2000/2000 [00:00<00:00, 24475.42it/s]


In [21]:
# Mean absolute error of the median predictions against the ages that were
# generated together with the synthetic rows.
calc_mae(y_syntetic, syntetic_medians)

1.4185

In [22]:
# Average model-expected absolute error of the median on the synthetic rows,
# computed from the predicted distributions only (no labels involved).
syntetic_probs.progress_apply(partial(expected_absolute_error, func=calculate_median)).mean()

100%|██████████| 2000/2000 [00:00<00:00, 6576.21it/s]


0.9821774885822849

In [None]:
from statistics import mean

mean