In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [64]:
datadir = '../tec-rewards/distribution_rounds/'
rounds = np.arange(1,8)
# load all the praise
# Only these columns of each round's export are used by the rest of the notebook.
praise_columns = ['REASON','AVG SCORE','TO USER ACCOUNT','DATE']
# Read every round first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all earlier rounds on each iteration, and
# seeding the result with an empty DataFrame is deprecated in recent pandas.
round_frames = [
    pd.read_csv(f'{datadir}/round-{kr}/distribution_results/raw_csv_exports/extended_praise_data.csv')[praise_columns]
    for kr in rounds
]
# The index is intentionally NOT reset here, so every round keeps its own row
# labels (labels therefore repeat across rounds), exactly as before.
allpraise_df = pd.concat(round_frames, axis=0)


In [3]:
# Preview the combined praise table built from all loaded rounds
allpraise_df

Unnamed: 0,REASON,AVG SCORE
0,for making edits in the welcome text,50.00
1,for making edits in the welcome text.,4.67
2,for offering to help us improve some designs f...,27.00
3,for invite me to play some music,7.67
4,for sharing material about TEC simulator and c...,21.00
...,...,...
1726,for attending the Twitter planning TEAM call a...,10.33
1727,for attending the Twitter planning TEAM call a...,3.67
1728,for attending the Twitter planning TEAM call a...,5.50
1729,for hopping into our weekly Communitas and giv...,4.00


In [6]:

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
# Fetch the NLTK stop-word corpus that clean_praise reads through nltk.corpus.stopwords
nltk.download('stopwords')

# cleaning master function
def clean_praise(praise):
    """Normalise one praise text for bag-of-words style analysis.

    The text is lower-cased, the listed punctuation characters are replaced by
    spaces, digits are removed, English stop words are dropped and every
    remaining token is reduced to its stem (e.g. words->word, talked->talk).
    Tokens containing '#' are kept unstemmed.

    Args:
        praise (str): the raw praise text.

    Returns:
        str: the cleaned tokens joined by single spaces, without leading,
        trailing or repeated spaces.
    """
    # code adapted from: https://ourcodingclub.github.io/tutorials/topic-modelling-python/
    # A set gives constant-time membership tests for the stop-word filter below.
    my_stopwords = set(nltk.corpus.stopwords.words('english'))
    word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem # clean words to the "stem" (e.g. words->word, talked->talk)
    my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

    praise = praise.lower() # lower case
    praise = re.sub('['+my_punctuation + ']+', ' ', praise) # strip punctuation
    praise = re.sub(r'[0-9]+', '', praise) # remove numbers
    # split() with no argument splits on any run of whitespace and never yields
    # empty tokens. Splitting on a single ' ' after the digits were removed used
    # to leave empty tokens behind (e.g. "2 hour" -> " hour"), which put stray
    # spaces into the cleaned text.
    praise_token_list = [word for word in praise.split()
                            if word not in my_stopwords] # remove stopwords

    praise_token_list = [word_rooter(word) if '#' not in word else word
                        for word in praise_token_list] # apply word rooter

    return ' '.join(praise_token_list)

# topic modeling

## clean the language data

In [None]:
# Run clean_praise over every REASON; the result is the text fed to the vectorizer
cleaned_praise = allpraise_df['REASON'].apply(clean_praise)


: 

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
# min_df=5 drops terms that occur in fewer than 5 praises; max_df=0.9 drops
# terms that occur in more than 90% of the praises (both count documents, not
# total occurrences). The pattern is a raw string so the regex escapes are not
# treated as (invalid) Python string escapes.
vectorizer = CountVectorizer(max_df=0.9, min_df=5, token_pattern=r'\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(cleaned_praise).toarray() # term frequency

# tf_feature_names tells us what word each column in the matrix represents.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on releases that predate it. The result is
# kept as a plain list, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tf_feature_names = vectorizer.get_feature_names()



In [30]:
len(tf_feature_names) # number of distinct terms kept by the vectorizer (one per column of tf)

1269

In [41]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics the LDA model is asked to find (passed as n_components below)
number_of_topics = 10

# random_state is pinned to a fixed seed so that re-fitting is repeatable
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [42]:
# Fit the topic model on the term-count matrix produced by the vectorizer
model.fit(tf)


In [43]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic of a fitted model.

    Args:
        model: object exposing ``components_``, iterated one row per topic;
            each row must support ``argsort`` and integer indexing.
        feature_names: sequence giving the word for each column index.
        no_top_words: number of words to list per topic.

    Returns:
        pandas.DataFrame with two columns per topic, "Topic N words" and
        "Topic N weights" (weights rendered as strings with one decimal),
        rows running from the highest weight downwards.
    """
    columns = {}
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so walking the tail backwards yields the
        # positions of the largest weights in descending order.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        columns[f"Topic {topic_no:d} words"] = [str(feature_names[pos]) for pos in top_positions]
        columns[f"Topic {topic_no:d} weights"] = [f"{weights[pos]:.1f}" for pos in top_positions]
    return pd.DataFrame(columns)

In [44]:
# Show the ten highest-weighted words (with their weights) for each topic
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,tec,864.3,work,305.1,call,459.9,common,1295.3,prais,286.0,work,184.0,join,332.2,insight,100.3,work,397.7,param,641.8
1,commun,728.1,alway,181.0,attend,367.3,forum,777.3,propos,189.2,steward,166.9,graviton,239.1,amaz,98.4,reward,345.6,comm,508.3
2,call,727.2,’,173.7,commun,366.5,week,700.4,help,134.8,great,153.6,train,220.1,share,98.2,dashboard,342.6,parti,399.6
3,join,504.5,support,169.0,wg,272.5,token,697.1,get,103.2,post,119.8,call,200.3,te,94.5,system,284.8,wg,384.1
4,http,362.6,help,154.6,sync,248.0,help,674.9,work,99.7,give,109.0,orient,187.5,meet,78.9,common,254.5,tec,380.8
5,youtu,345.1,ccd,128.3,weekli,176.7,engin,665.1,initi,97.2,hatch,105.4,today,183.4,twitter,72.3,upgrad,125.3,work,366.6
6,look,120.0,develop,108.0,communita,158.2,thank,664.2,share,92.9,question,100.8,soft,132.1,tec,72.1,push,68.6,attend,303.4
7,forward,109.5,make,105.4,reward,147.1,past,657.1,research,88.8,time,99.7,gov,126.1,thought,69.1,make,58.1,call,235.9
8,prais,95.1,tec,100.2,join,146.6,te,647.7,bot,83.9,make,98.5,week,118.9,make,67.7,config,54.1,host,197.2
9,orient,91.7,design,92.6,last,145.8,commun,645.4,incred,82.3,lead,94.9,amaz,102.1,commun,66.7,commun,53.1,today,191.6


In [None]:
# TODO: for each praise, determine which topic it belongs to.
# Then, roughly check whether the score relates to the praise.

# Finally, compare with using just a few keywords for the same purpose.

# just categorize by keywords

In [68]:
# Keep only the praise whose average score is positive, and put the cleaned
# text in a new first column next to the original REASON.
has_score = allpraise_df['AVG SCORE'].gt(0)
nonzerodf = allpraise_df.loc[has_score]
cleaned_reasons = nonzerodf['REASON'].apply(clean_praise)
nonzerodf.insert(0, 'CLEANED REASON', cleaned_reasons)

n_all, n_scored = len(allpraise_df), len(nonzerodf)
print(f'among {n_all} praises, {n_scored} have scores more than 0. Only include them. Next, clean them up.')

among 9690 praises, 9481 have scores more than 0. Only include them. Next, clean them up.


In [69]:
from re import search # for searching sub strings

# Category name -> regular expression; a praise belongs to a category when any
# alternative of the pattern occurs anywhere in its cleaned text.
# NOTE(review): the patterns are matched against 'CLEANED REASON', i.e. text
# that clean_praise has already stripped of stop words and stemmed. Alternatives
# such as 'show up', 'participat', 'initiate', 'organize', 'conference' and
# 'awesome' presumably can never match that text (the topic table shows stems
# like 'initi') -- confirm, and consider stemming the keywords or also matching
# the raw REASON. Matching is left unchanged here so the results stay comparable.
type_keywords = {
    'attendance':'join|attend|show up|participat',
    'discussion':'question|ask|discuss|discussion',
    'work':'help|work|design|make|write|hack|edit',
    'lead':'host|lead|initiate|form|organize|steward',
    'share':'share|spread',
    'twitter':'twitter|tweet',
    'hack':'hack|test',
    'general':'support|awesome',
    'IRL':'trip|conference',
}

# Only one column is needed, so iterate over it directly instead of building a
# Series per row with iterrows().
allcategs = []
for praise in nonzerodf['CLEANED REASON']:
    praise = praise.lower()
    category = [praise_type for praise_type, keywords in type_keywords.items()
                if search(keywords, praise)]
    # NaN (rather than an empty list) marks a praise that matched no category
    allcategs.append(category if category else np.nan)

# reset_index() gives both frames the same 0..n-1 index so the columns line up
# row by row; the previous row labels are kept in an 'index' column.
category_df = pd.concat([nonzerodf.reset_index(), pd.DataFrame({"category":allcategs})],axis=1)


In [70]:
# Write the praise that matched no category to its own file, report how many
# there are, then write the full categorized table.
uncategorized_mask = category_df['category'].isnull()
category_df.loc[uncategorized_mask].to_csv('uncateogrized.csv')
print(f"{sum(uncategorized_mask)} out of {len(category_df)} praises uncategorized")
category_df.to_csv('categorized_praise.csv')

3247 out of 9481 uncategorized


In [72]:
# Show the keyword pattern of every category as a table, for reference
pd.Series(type_keywords)

attendance              join|attend|show up|participat
discussion             question|ask|discuss|discussion
work             help|work|design|make|write|hack|edit
lead          host|lead|initiate|form|organize|steward
share                                     share|spread
twitter                                  twitter|tweet
hack                                         hack|test
general                                support|awesome
IRL                                    trip|conference
dtype: object

# analysis based on categorization
When a praise matches more than one category, it is counted once in each of the categories it matches.

In [73]:
# Columns of every per-category frame. They are passed explicitly so that a
# category without any matching praise still produces an (empty) frame that has
# these columns; otherwise later look-ups of 'avg_score' raise a KeyError.
praise_record_columns = ['praise', 'avg_score', 'receiver', 'date']

# category name -> list of records, one per praise matched to that category
categ_praise_scores = {k:[] for k in type_keywords.keys()}

praise_fields = zip(category_df['category'], category_df['REASON'],
                    category_df['AVG SCORE'], category_df['TO USER ACCOUNT'],
                    category_df['DATE'])
for categories, reason, avg_score, receiver, date in praise_fields:
    # 'category' holds a list of category names, or NaN when nothing matched
    if not isinstance(categories, list):
        continue
    for key in categories:
        # a praise matching several categories is recorded once under each
        categ_praise_scores[key].append(
            {'praise':reason,'avg_score':avg_score,'receiver':receiver,'date':date})

# category name -> DataFrame of that category's records
categ_praise_scores_df = {
    key: pd.DataFrame(records, columns=praise_record_columns)
    for key, records in categ_praise_scores.items()
}

##  the average, min, max score of each categorization

In [60]:
# Mean, maximum and minimum average score of the praise in every category.
categ_stats = dict.fromkeys(type_keywords)
for categ, scores_df in categ_praise_scores_df.items():
    scores = scores_df['avg_score']
    categ_stats[categ] = {'mean': scores.mean(), 'max': scores.max(), 'min': scores.min()}

# One column per category; transpose so that each category becomes a row and
# list the categories from the lowest to the highest mean score.
categ_stats_df = pd.DataFrame(categ_stats)
categ_stats_df.T.sort_values(by='mean')

Unnamed: 0,mean,max,min
attendance,2.92964,55.0,0.03
twitter,5.12092,47.75,0.33
share,5.177013,73.33,0.1
discussion,6.421413,79.33,0.1
IRL,8.79,18.33,1.5
general,9.912228,58.25,0.73
work,10.381128,84.67,0.1
lead,10.561823,125.67,0.13
hack,16.658664,125.67,1.0


## Top 3 highest scored praise in each category

In [78]:
from IPython.display import Markdown as md


def _md_cell(value):
    """Return value as text that is safe inside a single Markdown table cell."""
    # A literal pipe would start a new cell and a line break would end the row.
    return str(value).replace('|', '\\|').replace('\r', ' ').replace('\n', ' ')


# Build one Markdown section (heading + table) per category with its three
# highest-scored praises.
mdtext = ''
for categ, categ_df in categ_praise_scores_df.items():
    toppraise = categ_df.sort_values(by='avg_score',ascending=False).iloc[:3]
    # A table only renders when the header and separator rows start at the
    # beginning of the line, have the same number of cells, and the table is
    # separated from the surrounding text by blank lines.
    section_lines = [
        f'# {categ}',
        '',
        '| Avg. score | To | Reason | Date |',
        '|:-----------|----|:-------|------|',
    ]
    for kr,row in toppraise.iterrows():
        to_user = _md_cell(row['receiver'])
        reason = _md_cell(row['praise'])
        score = row['avg_score']
        date = str(row['date'])[:10] # keep the first 10 characters (the date part in the output shown)
        section_lines.append(f"| {score} | {to_user} | {reason} | {date} |")
    mdtext += '\n'.join(section_lines) + '\n\n'
md(mdtext)

# attendance
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 55.0 | iviangita#3204 | for accepting the repsonisbility of acting Steward for the TEC and keeping tht WG afloat while we put out a call for new legal minds to join. | 2021-11-03
| 25.33 | griff (💜, 💜)#8888 | for talking about Conviction Voting at an ETHCC venue, which lead Michael to meet Livia, and then Mt. Manu, who joined him to work in ArborVote DAO | 2021-07-30
| 24.25 | Juankbell#7458 | for contributing to Graviton Training, you’ve been doing an amazing job and the attendance has been absolutely phenomenal | 2021-11-04
# discussion
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 79.33 | Nuggan#5183 | for jumping in params, giving feedback and asking questions | 2021-07-15
| 68.0 | sem(🌸,🐝)#0161 | for engaging in discussion on the TE Commons Forum (https://forum.tecommons.org) the past week. Thank you for helping our Token Engineering Commons community share and learn! | 2021-07-15
| 51.67 | Alp#3768 | for a 2-hour onboarding session full of questions, insights, and specialized knowledge | 2021-07-28
# work
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 84.67 | natesuits#4789 | for helping me write and rewrite the forum post on the power of defaults: https://forum.tecommons.org/t/the-power-of-defaults-in-the-commons-configuration-dashboard/511/2 | 2021-07-25
| 80.33 | Nuggan#5183 | for helping me write and rewrite the forum post on the power of defaults: https://forum.tecommons.org/t/the-power-of-defaults-in-the-commons-configuration-dashboard/511/2 | 2021-07-25
| 77.93 | divine_comedian#5493 | for their work building the Commons Configuration Dashboard | 2021-07-30
# lead
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 125.67 | sem(🌸,🐝)#0161 | for developing and testing the smart contracts. They are actually hosting a demo of the augmented bonding curve and all the commons upgrade tooling. Much admiration and respect for that | 2021-11-28
| 78.67 | Vyvy-vi#5040 | for being our bot & data guru that we can always reach out to for meaningful information and problem solving | 2022-01-10
| 68.0 | mZ#3472 | for thought leadership in web3 and for keeping engineering ethics as our community’s North Star | 2021-12-25
# share
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 73.33 | sem(🌸,🐝)#0161 | for the incredible work with the demos, making improvements, managing the tech team and for being such a good teacher across the space and sharing his knowledge | 2021-07-15
| 68.0 | sem(🌸,🐝)#0161 | for engaging in discussion on the TE Commons Forum (https://forum.tecommons.org) the past week. Thank you for helping our Token Engineering Commons community share and learn! | 2021-07-15
| 59.33 | sem(🌸,🐝)#0161 | for the Real Time Launch action! Great sharing the war room with you!!! | 2022-01-24
# twitter
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 47.75 | iviangita#3204 | for finding out our twitter account as block and fix it super quick!!⚡ (We need to be carefull with bots) | 2021-09-08
| 42.0 | iviangita#3204 | for all the incredible behind-the-scenes work and all the little things she’s constantly doing in the back office helping with Twitter, the board and work agreements | 2021-09-02
| 33.33 | innov8tor3#3988 | for mentioning or retweeting TE Commons on the socials the past week! Thank you for helping us grow the Token Engineering Commons community and spreading the message! 🙏🏼☺️ | 2021-07-15
# hack
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 125.67 | sem(🌸,🐝)#0161 | for developing and testing the smart contracts. They are actually hosting a demo of the augmented bonding curve and all the commons upgrade tooling. Much admiration and respect for that | 2021-11-28
| 70.67 | liviade#1387 | for the long verification hack session for the final IH list | 2021-07-15
| 63.67 | VitorNunes#0090 | for all the work on the CCD, the designs, comments, user testing and making things understandable for people | 2021-08-19
# general
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 58.25 | akrtws (TE Academy)#4246 | for keeping projects, companies, and really smart people excited about supporting education for TEs! | 2021-10-28
| 41.75 | natesuits#4789 | for their support in the creation of the Communitas WG 🏘️ | 2021-09-18
| 39.25 | divine_comedian#5493 | for design, development, ideation and copywriting support on the CCD | 2021-09-01
# IRL
    | Avg. score | To | Reason | Date |
     |:-----------|----|:-------|
| 18.33 | chuygarcia.eth#6692 | for all the work in Comms, for leading Comms and helping organize the trip to Paris | 2021-07-15
| 16.8 | jukren#8803 | for making the Paris trip possible ✈️ | 2021-08-05
| 15.0 | mateodaza#3156 | for the epic road trip we had to Paris!!! | 2021-07-17


# TODO
- maybe further adjust keyword to make the top scores look normal
- how to make keywords a manipulable setting in json?
- incorporate this into cross-period analysis