## This notebook creates synthetic data from an LLM that was fine-tuned on the [Wild blueberry Yield Prediction Dataset](https://www.kaggle.com/datasets/shashwatwork/wild-blueberry-yield-prediction-dataset)

The output of this notebook can be used to create a Community Competition similar to this [Playground competition](https://www.kaggle.com/competitions/playground-series-s3e14) using refreshed data.

In [None]:
import os
import pandas as pd
from hashlib import md5

import warnings
warnings.filterwarnings("ignore")

# Walk the Kaggle input directory and print the full path of every file found,
# so the attached datasets/models show up in the cell output.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# Directory handed to GReaT.load_from_dir further down in the notebook.
model_path = '/kaggle/input/blueberry-yield-fine-tuned-gpt2-medium'

In [None]:
# Pin transformers to 4.26.1; -q keeps pip's output quiet.
!pip install transformers==4.26.1 -q # avoiding a dependency issue

In [None]:
# Install the pinned be-great release; the next cell imports GReaT from it.
!pip install be-great==0.0.3 -q

In [None]:
import torch
from be_great import GReaT

# Load the fine-tuned GReaT model from the attached Kaggle input directory.
model = GReaT.load_from_dir(model_path)

# Sample on the GPU when one is attached; otherwise fall back to the CPU
# (much slower, but the cell no longer fails in a CPU-only session).
device = "cuda" if torch.cuda.is_available() else "cpu"

data = model.sample(
    n_samples=1_000,    # change this to generate more samples
    k=50,
    temperature=0.7,  # values between 0.5-0.9 generally give good results
    max_length=256,
    device=device)

# Name the index 'id' so it is written as the 'id' column when the frames are
# exported with to_csv at the end of the notebook.
data = data.rename_axis('id')
data.head()

## You may need or want to clean up the data here

* Remove outliers
* etc.

In [None]:
# Optional data cleanup goes here.

# Remove the 'Row#' column from the sampled frame so it is not exported.
data = data.drop(columns=['Row#'])

In [None]:
# Split the generated data into the pieces needed for a competition:
# a training set, a test set without the target, the hidden solution with
# Public/Private usage labels, and a constant-baseline sample submission.
target_col = 'yield'

train_split = 0.6   # fraction of all rows that go to train; the rest is test
public_split = 0.2  # fraction of the test rows labelled 'Public'

# Split by position rather than by index label so this keeps working when the
# cleanup cell has removed rows (gaps in the 'id' index) or the frame is empty.
# The copies keep the pop() below from operating on a slice of `data`.
train_count = int(data.shape[0] * train_split)
train = data.iloc[:train_count].copy()
test = data.iloc[train_count:].copy()

# Move the target out of the test set; it becomes the solution frame.
solution = test.pop(target_col).to_frame()

# random sort using hash: order rows by the MD5 of "<id><target>", which is
# deterministic across runs for the same data.
solution['sort'] = solution.index.astype(str) + solution[target_col].astype(str)
solution['sort'] = solution['sort'].apply(lambda x: md5(x.encode('utf-8')).hexdigest())
assert solution['sort'].is_unique, "hash sort keys must be unique"
solution = solution.sort_values('sort')
solution['Usage'] = 'Private'

# Label the first `public_split` share of the shuffled rows as Public. The
# column position is looked up by name instead of being hard-coded.
public_count = int(solution.shape[0] * public_split)
solution.iloc[:public_count, solution.columns.get_loc('Usage')] = 'Public'

# Drop the helper column and restore the original id order.
solution = solution.drop('sort', axis='columns').sort_index()

# sample submission to use the mean of the training data
submission = solution[[target_col]].copy()
submission[target_col] = train[target_col].mean()

In [None]:
# Preview the first rows of the training split (features plus target).
train.head()

In [None]:
# Preview the first rows of the test split; the target column was popped out above.
test.head()

In [None]:
# Preview the sample submission (every row holds the training-set mean of the target).
submission.head()

In [None]:
# Preview the solution frame: true target values plus the Public/Private 'Usage' label.
solution.head()

In [None]:
# Write each competition artifact to the working directory, index included.
csv_outputs = {
    'train.csv': train,
    'test.csv': test,
    'sample_submission.csv': submission,
    'solution.csv': solution,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name)