In [3]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('TGE Price Pump Factors - Sheet1.csv')

# Mapping categorical columns to numerical values using factorize
data['narrative'], narrative_mapping = pd.factorize(data['narrative'])
data['type'], type_mapping = pd.factorize(data['type'])
data['cex_1'], cex_1_mapping = pd.factorize(data['cex_1'])
data['cex_2'], cex_2_mapping = pd.factorize(data['cex_2'])
data['cex_3'], cex_3_mapping = pd.factorize(data['cex_3'])

# Separate numerical and categorical columns
numerical_cols = data.select_dtypes(include=[np.number]).columns
categorical_cols = data.select_dtypes(exclude=[np.number]).columns

# Function to generate new samples
def generate_samples(data, num_samples, noise_level=0.01):
    numerical_data = data[numerical_cols]
    sampled_data = numerical_data.sample(n=num_samples, replace=True).reset_index(drop=True)
    noise = noise_level * np.random.randn(num_samples, numerical_data.shape[1])
    noisy_data = sampled_data + noise
    return noisy_data

# Number of samples to generate
num_samples = 10000

# Generate new samples for numerical data
synthetic_numerical_data = generate_samples(data, num_samples)

# Resample categorical data
synthetic_categorical_data = data[categorical_cols].sample(n=num_samples, replace=True).reset_index(drop=True)

# Combine numerical and categorical data
synthetic_data = pd.concat([synthetic_categorical_data, synthetic_numerical_data], axis=1)

# Ensure the synthetic data has the same column names
synthetic_data.columns = data.columns

# Reverse the factorize mapping for categorical columns (optional)
synthetic_data['narrative'] = synthetic_data['narrative'].map(dict(enumerate(narrative_mapping)))
synthetic_data['type'] = synthetic_data['type'].map(dict(enumerate(type_mapping)))
synthetic_data['cex_1'] = synthetic_data['cex_1'].map(dict(enumerate(cex_1_mapping)))
synthetic_data['cex_2'] = synthetic_data['cex_2'].map(dict(enumerate(cex_2_mapping)))
synthetic_data['cex_3'] = synthetic_data['cex_3'].map(dict(enumerate(cex_3_mapping)))

# Save the synthetic data to CSV
synthetic_data.to_csv('synthetic_tge_data.csv', index=False)

# Display first few rows of the synthetic datase
synthetic_data.head()


Unnamed: 0,name,tge_year,narrative,type,inst_ico,comm_ico,listing_price,inst_vesting_perc,pub_vesting_perc,cex_1,cex_2,cex_3,x_comm,hr1_high,hr1_low,hr2_high,hr2_low,hr3_high,hr3_low
0,NEO,2017.025991,,,0.994777,-0.002598,0.194733,51.629861,-0.013099,,,,8949.993805,0.020885,0.0343,0.128844,0.108183,0.494756,0.310476
1,BUBBLE,2019.984441,,,1.005704,0.004092,0.024204,99.983943,-0.002362,,,,3538.017768,0.014335,0.013096,0.046252,0.024028,0.016919,0.017649
2,POLKADOT,2024.004643,,,0.005429,1.000759,0.377727,-0.009448,30.014086,,,,297569.996603,0.706284,0.488478,0.538789,0.473151,0.439051,0.432207
3,CLOUD,2015.986408,,,0.985945,0.011499,0.369777,49.636818,-0.011506,,,,74900.006788,0.085437,0.035691,0.414102,0.295773,0.806522,19.993418
4,FIL,2024.006619,,,1.014165,0.998334,0.035105,18.508628,29.988772,,,,749999.992389,0.104823,-0.00234,0.123128,0.013531,0.008918,-0.004145
