# Loading the TEGR1 Dataset

This notebook produces the TEGR1 dataset. It is the same as main.ipynb, with the following changes:
* Drop unsuccessful donations
* Remove columns 'success', 'status', and 'type' as these values do not vary.
* Shorten hashes to 10 total characters to increase readability


Dropping unsuccessful rows changes the number of rows in the dataset, so the statistics at the bottom of this notebook differ from those in main.ipynb.

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import panel as pn
import hvplot.pandas
from icecream import ic
from bokeh.models.formatters import NumeralTickFormatter
from bokeh.models import HoverTool
# Route icecream output through print() and prefix every message with 'ic|'
ic.configureOutput(prefix='ic|',outputFunction=print)
# Load Panel's 'tabulator' extension (a Tabulator widget is created further down)
pn.extension('tabulator')

## Read Input Data

Vote Coefficients Inputs Dataset

In [2]:
# Load the Vote Coefficients Inputs dataset, parsing the score timestamp as a datetime
ic("Loading data...")
df_qf = pd.read_csv(
    './input/vote_coefficients_input.csv',
    parse_dates=['last_score_timestamp'],
)
ic(df_qf.shape)

# Keep only the rows flagged as successful (the number of removed rows is logged first)
ic(len(df_qf[df_qf['success']==False]))
ic("Dropping unsuccessful data...")
df_qf = df_qf.loc[df_qf['success']==True]
ic(df_qf.shape)

# Drop unnecessary columns
drop_columns = ['success', 'status', 'type']
ic(drop_columns)
ic("Dropping columns...")
df_qf = df_qf.drop(columns=drop_columns)
ic(df_qf.shape)

# In every object column, cut values starting with '0x' down to their first 10 characters
ic("Shortening hashes...")
df_qf[df_qf.select_dtypes('object').columns] = (
    df_qf
    .select_dtypes('object')
    .apply(lambda col: np.where(col.str.startswith('0x'), col.str.slice(stop=10), col))
)

df_qf.head(5)

ic|'Loading data...'
ic|df_qf.shape: (257, 16)
ic|len(df_qf[df_qf['success']==False]): 4
ic|'Dropping unsuccessful data...'
ic|df_qf.shape: (253, 16)
ic|drop_columns: ['success', 'status', 'type']
ic|'Dropping columns...'
ic|df_qf.shape: (253, 13)
ic|'Shortening hashes...'


Unnamed: 0,id,projectId,applicationId,roundId,token,voter,grantAddress,amount,amountUSD,coefficient,last_score_timestamp,rawScore,threshold
0,0x24a5bbf1,0x64a30a4b,19,0x9E669c0A,0x00000000,0x9ba96198,0xA26d6AEB,5000000000000000.0,9.184332,1,2023-04-25 13:48:59.888771+00:00,28.57,15
1,0x3dce13bb,0xc401c980,6,0x9E669c0A,0x00000000,0x9390fa86,0x9390fA86,2200000000000000.0,4.094567,1,2023-04-25 16:55:55.447871+00:00,27.21,15
2,0x4cf20243,0x97589cd1,7,0x9E669c0A,0x00000000,0x5136cdfc,0x0035cC37,4e+16,74.446665,1,2023-04-25 17:25:19.667155+00:00,28.57,15
3,0x2b032f10,0xec026845,16,0x9E669c0A,0x00000000,0x524cb61b,0x45b79C6b,3000000000000000.0,5.5835,1,2023-04-25 17:07:33.303578+00:00,23.56,15
4,0x0842753b,0xa9bdf738,29,0x9E669c0A,0x00000000,0x524cb61b,0x5041A1C1,3000000000000000.0,5.5835,1,2023-04-25 17:07:33.303578+00:00,23.56,15


Exploring data with Tabulator

In [3]:
# Set the theme on the Tabulator widget class to 'simple'
pn.widgets.Tabulator.theme = 'simple'
# Render df_qf in a Tabulator widget (layout 'fit_data_table', page_size 5)
pn.widgets.Tabulator(df_qf, layout='fit_data_table', page_size=5)

### Introducing TE Commons Data

In [4]:
def shorten_hashes(df, length=10, prefix='0x'):
    """Return a copy of ``df`` with hash-like strings truncated for readability.

    In every text column (object or string dtype), each string value that
    starts with ``prefix`` is cut to its first ``length`` characters (the
    prefix counts towards the length). Everything else -- numbers, missing
    values, strings without the prefix -- is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is not modified; a copy is returned instead.
    length : int, default 10
        Total number of characters to keep for a matching value.
    prefix : str, default '0x'
        Values must start with this prefix to be shortened.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    def _shorten(value):
        # Only real strings are sliced; NaN/None and other objects pass through.
        if isinstance(value, str) and value.startswith(prefix):
            return value[:length]
        return value

    # Work on a copy so the caller's frame is never changed behind its back.
    shortened = df.copy()
    for column in shortened.columns:
        values = shortened[column]
        # Cover both classic object columns and dedicated string dtypes.
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            shortened[column] = values.map(_shorten)
    return shortened

In [5]:
# Table of valid TEC holders
# Source: https://dune.com/queries/2457553/4040451
df_tec = pd.read_csv('./input/tec_holders.csv').pipe(shorten_hashes)

df_tec

Unnamed: 0,address,balance,tec_tokens_flag
0,0x38dfd788,150071.717791,1
1,0x5b757549,106053.271906,1
2,0x839395e2,73838.661487,1
3,0xdf290293,69337.513233,1
4,0x45602bfb,59165.981018,1
...,...,...,...
404,0x423d60df,10.873917,1
405,0xc70c7f14,10.674364,1
406,0xae7f1137,10.309472,1
407,0x96bdad64,10.223873,1


Visualize the data on a log scale with pretty blue circles.

In [6]:
# Bokeh hover tooltip: show the address and the balance formatted with two decimals
hover = HoverTool(
    tooltips=[
        ("address", "@address"),
        ("balance", "@balance{0.00}"),
    ]
)

# Scatter plot of TEC balances with a log-scaled y axis
df_tec.hvplot.scatter(
    y='balance',
    logy=True,
    yformatter=NumeralTickFormatter(format='0,0'),
    xlabel="index",
    title="TEC Token Holders Distribution Log Scale",
    hover_cols=['address', 'balance'],
    tools=[hover],
    size=200,
    alpha=0.8,
    color="white",
    line_color="skyblue",
)

### Introducing TE Academy Data

In [7]:
# get table of te academy token holders
# extracted from https://dune.com/queries/2457581
df_tea_dune = shorten_hashes(pd.read_csv('./input/tea_holders_dune.csv'))
df_tea_tea = shorten_hashes(pd.read_excel('./input/tea_holders_tea.xlsx'))

# Combine both sources, one row per wallet. drop_duplicates keeps the first
# occurrence, so the Dune row wins when a wallet appears in both tables.
df_tea = pd.concat([df_tea_dune, df_tea_tea]).drop_duplicates(subset=['wallet'])

# Make a contiguous index
df_tea = df_tea.reset_index(drop=True)

# Fill balance of TEA with -1 for now.
# Only 'balance' is filled: a blanket fillna(-1) would also write -1 into any
# other column holding NaN (a -1 in tea_flag would corrupt the coefficient later).
df_tea = df_tea.fillna({'balance': -1})

In [8]:
# Number of wallets present in both TEA sources
len(set(df_tea_tea['wallet']) & set(df_tea_dune['wallet']))

151

In [9]:
# Row counts of the two TEA sources: (dune, tea)
df_tea_dune.shape[0], df_tea_tea.shape[0]

(192, 214)

In [10]:
# Display the combined TEA holder table (missing balances were filled with -1 above)
df_tea

Unnamed: 0,wallet,balance,tea_flag
0,0x68f6f2db,1.0,1
1,0x3e0cf03f,5.0,1
2,0x1d1874f9,1.0,1
3,0x4daa278b,3.0,1
4,0xc710f3da,3.0,1
...,...,...,...
244,0xe1954808,-1.0,1
245,0x7f990adf,-1.0,1
246,0xd1595177,-1.0,1
247,0xcc449df4,-1.0,1


Visualize TEA Credentials with scatter and bar plots.

In [11]:
# Scatter of the TEA balance of each row against its index
df_tea.hvplot.scatter(
    x='index',
    y='balance',
    alpha=0.8,
    title="TEA Credentials Balances Scatter Plot",
)

In [12]:
# Bar chart of the wallet count for each distinct TEA balance value
(
    df_tea
    .groupby('balance')
    .count()
    .hvplot.bar(
        y='wallet',
        ylabel="Wallet Count",
        title="TEA Credentials Balances Bar Chart",
        alpha=0.8,
    )
)

# Calculate Coefficients

In [13]:
# Drop unnecessary columns
# NOTE(review): df_coef is not referenced by the later cells visible in this notebook
df_coef = df_qf.drop(['roundId', 'threshold', 'token', 'last_score_timestamp'], axis=1)
df_coef

Unnamed: 0,id,projectId,applicationId,voter,grantAddress,amount,amountUSD,coefficient,rawScore
0,0x24a5bbf1,0x64a30a4b,19,0x9ba96198,0xA26d6AEB,5.000000e+15,9.184332,1,28.57
1,0x3dce13bb,0xc401c980,6,0x9390fa86,0x9390fA86,2.200000e+15,4.094567,1,27.21
2,0x4cf20243,0x97589cd1,7,0x5136cdfc,0x0035cC37,4.000000e+16,74.446665,1,28.57
3,0x2b032f10,0xec026845,16,0x524cb61b,0x45b79C6b,3.000000e+15,5.583500,1,23.56
4,0x0842753b,0xa9bdf738,29,0x524cb61b,0x5041A1C1,3.000000e+15,5.583500,1,23.56
...,...,...,...,...,...,...,...,...,...
252,0x26e1e300,0x97589cd1,7,0x4405f427,0x0035cC37,1.000000e+15,1.847803,1,29.74
253,0xa21ca1aa,0xec026845,16,0xcdfbbe10,0x45b79C6b,1.000000e+15,1.843793,1,21.07
254,0x634b5156,0xf1f4942d,24,0xcdfbbe10,0x4f8c531d,1.000000e+15,1.843793,1,21.07
255,0x4efa29aa,0xcf3165f4,10,0x410d86e3,0x7f3eb18E,1.000000e+15,1.843793,1,18.04


In [14]:
# Left join the three tables: donations -> TEC holders -> TEA holders, matched on
# the voter address. Both holder tables carry a 'balance' column, which the second
# merge disambiguates into 'balance_tec' / 'balance_tea'.
df_merged = (
    df_qf
    .merge(df_tec, left_on='voter', right_on='address', how='left')
    .merge(df_tea, left_on='voter', right_on='wallet', how='left', suffixes=('_tec', '_tea'))
    .drop(columns=['address', 'wallet'])
)

# A left join must keep exactly one row per donation; a duplicated key in either
# holder table would silently duplicate donations in the exported dataset.
assert len(df_merged) == len(df_qf), "merge changed the number of donation rows"

# Fixed seed so the preview shows the same rows on every run
df_merged.sample(5, random_state=0)

Unnamed: 0,id,projectId,applicationId,roundId,token,voter,grantAddress,amount,amountUSD,coefficient,last_score_timestamp,rawScore,threshold,balance_tec,tec_tokens_flag,balance_tea,tea_flag
244,0x5fbd95e6,0x4cd41869,25,0x9E669c0A,0x00000000,0x89f04f5c,0xBEC643BD,5e+16,92.222587,1,2023-05-02 15:39:06.174917+00:00,26.17,15,,,,
248,0x26e1e300,0x97589cd1,7,0x9E669c0A,0x00000000,0x4405f427,0x0035cC37,1000000000000000.0,1.847803,1,2023-05-09 22:47:57.682680+00:00,29.74,15,,,,
252,0x7b5b313f,0x64a30a4b,19,0x9E669c0A,0x00000000,0xb5d64294,0xA26d6AEB,3000000000000000.0,5.531378,1,2023-05-09 23:42:11.672306+00:00,18.04,15,,,,
64,0xd5de2972,0x23387567,9,0x9E669c0A,0x6B175474,0x468fd68b,0xbbD107D7,1e+18,1.001181,1,2023-04-25 00:38:00.402327+00:00,26.79,15,,,,
195,0x16e3593a,0xa9bdf738,29,0x9E669c0A,0x6B175474,0xf8d1d349,0x5041A1C1,6e+18,5.998856,1,2023-05-08 13:30:31.187368+00:00,55.06,15,18635.449575,1.0,,


In [15]:
# Voters without a match in the holder tables have NaN in the merged columns;
# replace those NaN values with 0. Only the merged columns are filled, so missing
# values in the donation columns (e.g. a NaT timestamp) are not turned into 0.
holder_columns = ['balance_tec', 'tec_tokens_flag', 'balance_tea', 'tea_flag']
df_merged = df_merged.fillna(dict.fromkeys(holder_columns, 0))

# Set the coefficient to 1.5 if tec_tokens_flag or tea_flag is 1, otherwise to 1.0.
# NOTE(review): this overwrites the incoming coefficient instead of multiplying it;
# the two are only equivalent when every incoming coefficient is 1 -- confirm.
df_merged['coefficient'] = 1 + 0.5 * (df_merged['tec_tokens_flag'].astype(int) | df_merged['tea_flag'].astype(int))
df_merged

Unnamed: 0,id,projectId,applicationId,roundId,token,voter,grantAddress,amount,amountUSD,coefficient,last_score_timestamp,rawScore,threshold,balance_tec,tec_tokens_flag,balance_tea,tea_flag
0,0x24a5bbf1,0x64a30a4b,19,0x9E669c0A,0x00000000,0x9ba96198,0xA26d6AEB,5.000000e+15,9.184332,1.5,2023-04-25 13:48:59.888771+00:00,28.57,15,0.0,0.0,3.0,1.0
1,0x3dce13bb,0xc401c980,6,0x9E669c0A,0x00000000,0x9390fa86,0x9390fA86,2.200000e+15,4.094567,1.0,2023-04-25 16:55:55.447871+00:00,27.21,15,0.0,0.0,0.0,0.0
2,0x4cf20243,0x97589cd1,7,0x9E669c0A,0x00000000,0x5136cdfc,0x0035cC37,4.000000e+16,74.446665,1.0,2023-04-25 17:25:19.667155+00:00,28.57,15,0.0,0.0,0.0,0.0
3,0x2b032f10,0xec026845,16,0x9E669c0A,0x00000000,0x524cb61b,0x45b79C6b,3.000000e+15,5.583500,1.0,2023-04-25 17:07:33.303578+00:00,23.56,15,0.0,0.0,0.0,0.0
4,0x0842753b,0xa9bdf738,29,0x9E669c0A,0x00000000,0x524cb61b,0x5041A1C1,3.000000e+15,5.583500,1.0,2023-04-25 17:07:33.303578+00:00,23.56,15,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,0x26e1e300,0x97589cd1,7,0x9E669c0A,0x00000000,0x4405f427,0x0035cC37,1.000000e+15,1.847803,1.0,2023-05-09 22:47:57.682680+00:00,29.74,15,0.0,0.0,0.0,0.0
249,0xa21ca1aa,0xec026845,16,0x9E669c0A,0x00000000,0xcdfbbe10,0x45b79C6b,1.000000e+15,1.843793,1.0,2023-05-09 23:03:19.951021+00:00,21.07,15,0.0,0.0,0.0,0.0
250,0x634b5156,0xf1f4942d,24,0x9E669c0A,0x00000000,0xcdfbbe10,0x4f8c531d,1.000000e+15,1.843793,1.0,2023-05-09 23:03:19.951021+00:00,21.07,15,0.0,0.0,0.0,0.0
251,0x4efa29aa,0xcf3165f4,10,0x9E669c0A,0x00000000,0x410d86e3,0x7f3eb18E,1.000000e+15,1.843793,1.0,2023-05-09 23:24:15.932770+00:00,18.04,15,0.0,0.0,0.0,0.0


# Statistics

In [16]:
# Turn the 0 placeholders in the merged holder columns back into NaN so that
# .count() in the following cells only counts matched voters.
# Restricted to these columns on purpose: a blanket replace(0, np.nan) would also
# blank every legitimate zero in the donation columns, and this frame is what
# gets exported at the end of the notebook.
holder_columns = ['balance_tec', 'tec_tokens_flag', 'balance_tea', 'tea_flag']
df_merged = df_merged.replace(dict.fromkeys(holder_columns, 0), np.nan)

In [17]:
# Simple statistics on the left join: non-null counts of id and of the two flag columns
df_merged.loc[:, ['id','tec_tokens_flag','tea_flag']].count()

id                 253
tec_tokens_flag    108
tea_flag            44
dtype: int64

In [18]:
# Deduplicate the (voter, flags) combinations, then count non-null values per column
(
    df_merged[['voter','tec_tokens_flag','tea_flag']]
    .drop_duplicates()
    .count()
)

voter              83
tec_tokens_flag    19
tea_flag            8
dtype: int64

In [19]:
# Unique voters for which both the TEC flag and the TEA flag are set
df_merged.loc[
    (df_merged['tec_tokens_flag']==True) & (df_merged['tea_flag']==True),
    ['voter','tec_tokens_flag','tea_flag']
].drop_duplicates().count()

voter              4
tec_tokens_flag    4
tea_flag           4
dtype: int64

# The TEGR1 Dataset.

In [None]:
# Write the TEGR1 dataset without the index column. The target directory is
# created first so the export also works on a checkout without an output/ folder.
output_path = Path('output/TEGR1.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_merged.to_csv(output_path, index=False)