### Import essential libraries

In [2]:
import pandas as pd
import numpy as np
import texthero as hero # cleaning pandas columns
import time
import nltk
import spacy # check similarity of words precisely
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
nlp = spacy.load('en_core_web_lg')

### Read `praise` and `tag` data into pandas dataframe and perform some cleansing

In [3]:
df_praise = pd.read_csv("data/sample_praise.csv")
df_tag = pd.read_csv("data/tags.csv")

df_praise.iloc[:,5] = hero.clean(df_praise.iloc[:,5])
df_praise = pd.concat([df_praise.iloc[:,0], df_praise.iloc[:,3:15]], axis=1)

  return input.str.replace(r"^\d+\s|\s\d+\s|\s\d+$", " ")
  return input.str.replace(pat, '')


In [4]:
df_praise.head(5)

Unnamed: 0,Category Code,To,From,Reason for dishing,Server,Date,Room,v1 norm,v2 norm,v3 norm,Avg %,IH per Praise,IH per person
0,TEC15,liviade,cranders71,always caring finicky bot valued community members,Telegram,2021-01-13,TE Praise,30.0,10.0,40.0,6.549.137.185,4.911.852.889,6.767.403.323
1,TEC12,chuygarcia92,Tam2140,masana temples recommendation nice mix low high beats,Token Engineering Commons,2021-04-30,🙏praise,1000.0,100.0,100.0,723.394.157,8.680.729.884,718.127.319
2,TEC12,iviangita,iviangita,joining legal weekly sync,Token Engineering Commons,2021-04-30,🙏praise,1000.0,80.0,100.0,6.595.463.701,7.914.556.442,2.387.003.435
3,TEC12,ygganderson,JessicaZartler,presence energy smiles comms working group today,Telegram,2020-11-24,TE Praise,20.0,60.0,30.0,1.283.215.058,,
4,TEC12,metaverde,iviangita,mentioning retweeting te commons socials past week thank helping us grow token engineering commons community spreading message,Telegram,2021-01-22,TE Commons,,,,0,198.159.943.382.873,3.170.559.094


In [5]:
df_tag

Unnamed: 0,tag,keyword
0,TEC1,comms article blog organized presentation graphic design website marketing seo social platform discord telegram forum discourse medium linkedin website article retweeting retweet retweets organized presentation communications
1,TEC2,culture build soft gov survey vote voting debate
2,TEC3,params parameters param parties party
3,TEC4,legal strategy
4,TEC5,commons swarm tech dev dapp tech dapp app bug
5,TEC6,hatch outreach onboarding onboard outreach
6,TEC7,omega survey philosophy
7,TEC8,stewards github steward project management
8,TEC9,labs lab
9,TEC10,transparency youtube recording record


### `check_freq` function returns most frequent tag for each row

In [6]:
def check_freq(tag_list):
    final_tag_list = []
    try:
        for lst in tag_list:
            str_ = ''
            for item in tag_list:
                str_ = str_ +" "+ item
            moby_tokens = nltk.word_tokenize(str_)
            text = nltk.Text(moby_tokens)

            dist = nltk.FreqDist(text)
            freq_lst = [k for k, v in sorted(dist.items(), key=lambda i: i[1], reverse=True)]
            final_tag_list.append(freq_lst[0])
        return final_tag_list[0]
    except:
        return []

### Iterating through all rows and tag data just for equal words(100% similarity based on tag dataset)

In [7]:
tag_dict = set()
for index_, row in df_tag.iterrows():
    tag_dict.add((row[0], row[1]))
    
tag_dict

{('TEC1',
  'comms article blog organized presentation graphic design website marketing seo social platform discord telegram forum discourse medium linkedin website article retweeting retweet retweets organized presentation communications'),
 ('TEC10', 'transparency youtube recording record'),
 ('TEC12', 'participation retweets'),
 ('TEC13', 'fundraising donating fundraising'),
 ('TEC14', 'technical infrastructure'),
 ('TEC15', 'tec community building'),
 ('TEC16', 'personal praise'),
 ('TEC17', 'gravity conflict nonviolent communication'),
 ('TEC2', 'culture build soft gov survey vote voting debate'),
 ('TEC3', 'params parameters param parties party'),
 ('TEC4', 'legal strategy'),
 ('TEC5', 'commons swarm tech dev dapp tech dapp app bug'),
 ('TEC6', 'hatch outreach onboarding onboard outreach'),
 ('TEC7', 'omega survey philosophy'),
 ('TEC8', 'stewards github steward project management'),
 ('TEC9', 'labs lab')}

In [8]:
praise_set = set()
for index_, row in df_praise.iloc[:100,:].iterrows():
    praise_set.add((index_, row[3]))
    
praise_set

{(0, 'always caring finicky bot valued community members'),
 (1, 'masana temples recommendation nice mix low high beats'),
 (2, 'joining legal weekly sync'),
 (3, 'presence energy smiles comms working group today'),
 (4,
  'mentioning retweeting te commons socials past week thank helping us grow token engineering commons community spreading message'),
 (5,
  'engaging discussion te commons forum https forum tecommons org past week thank helping token engineering commons community share learn'),
 (6,
  'mentioning retweeting te commons socials past week thank helping us grow token engineering commons community spreading message'),
 (7, 'insightful well formed presentation gravity today'),
 (8, 'conducting giving feedback audit'),
 (9, 'last session gravity incredible whole ecosystem'),
 (10,
  'joining hack session staying great discussion anarchy vs community governance'),
 (11, 'bringing board zenhub team'),
 (12, 'joining tec community call https www youtube com watch v dhnrqaow3ic')

In [70]:
list_tag = []
temp_1 = []
counter = 0

start_time = time.time()
for index_praise in praise_set:
    temp_1 = [row_tag[0] for row_tag in tag_dict for item_praise in set(index_praise[1].split(" ")) for item_tag in set(row_tag[1].split(" ")) if nlp(item_praise).similarity(nlp(item_tag)) == 1]
    if len(temp_1)>=1:
        list_tag.append((index_praise[0], check_freq(temp_1)))
        temp_1 = []
    else:
        list_tag.append((index_praise[0],np.nan))
    print(f"row {counter} is done.", end='\r')
    counter+=1

end_time = time.time()
print(f"\nTotal run time for {len(praise_set)} rows is: {end_time - start_time}")

  temp_1 = [row_tag[0] for row_tag in tag_dict for item_praise in set(index_praise[1].split(" ")) for item_tag in set(row_tag[1].split(" ")) if nlp(item_praise).similarity(nlp(item_tag)) == 1]


row 99 is done.
Total run time for 500 rows is: 552.9995832443237


In [77]:
df_temp = pd.DataFrame(list_tag, columns=['index_', 'tag'])
df_temp

Unnamed: 0,index_,tag
0,35,TEC5
1,88,TEC15
2,37,TEC5
3,66,
4,96,TEC3
...,...,...
95,95,TEC13
96,83,TEC17
97,26,TEC15
98,31,TEC5


In [78]:
df_sample_100 = pd.merge(df_praise, df_temp, how="left", left_index=True, right_on='index_')

In [81]:
df_sample_100 = df_sample_100.loc[df_sample_100.index.dropna()]

In [87]:
df_sample_100.columns

Index(['Category Code', 'To', 'From', 'Reason for dishing', 'Server', 'Date',
       'Room', 'v1 norm', 'v2 norm', 'v3 norm', 'Avg %', 'IH per Praise',
       'IH per person', 'index_', 'tag'],
      dtype='object')

In [89]:
df_compare = df_sample_100[['Category Code', 'tag', 'Reason for dishing']]

In [91]:
df_compare.isnull().sum()

Category Code          0
tag                   30
Reason for dishing     0
dtype: int64

In [92]:
df_compare.head()

Unnamed: 0,Category Code,tag,Reason for dishing
58.0,TEC15,TEC15,always caring finicky bot valued community members
44.0,TEC12,,masana temples recommendation nice mix low high beats
32.0,TEC12,TEC4,joining legal weekly sync
36.0,TEC12,TEC1,presence energy smiles comms working group today
25.0,TEC12,TEC15,mentioning retweeting te commons socials past week thank helping us grow token engineering commons community spreading message


In [73]:
# list_tag = []
# temp_1 = []
# temp_2 = []
# temp_3 = []
# counter = 0

# start_time = time.time()
# for index_praise in praise_set:
#     for row_tag in tag_dict:
#         temp_set_1 = set(index_praise[1].split(" "))
#         for item_praise in temp_set_1:
#             temp_set_2 = set(row_tag[1].split(" "))
#             for item_tag in temp_set_2:
#                 tag_token = nlp(item_tag)
#                 reason_token = nlp(item_praise)
#                 if reason_token.similarity(tag_token) == 1:
#                     temp_1.append(row_tag[0])
#             if len(temp_1)>=1:
#                 temp_2.append(check_freq(temp_1))
#                 temp_1 = []
#         if len(temp_2)>=1:
#             temp_3.append(check_freq(temp_2))
#             temp_2 = []
#     if len(temp_3)>=1:
#         list_tag.append((index_praise[0], check_freq(temp_3)))
#         temp_3 = []
#     else:
#         list_tag.append((index_praise[0],np.nan))
#     print(f"row {counter} is done.", end='\r')
#     counter+=1

# end_time = time.time()
# print(f"\nTotal run time for {df_praise.shape[0]} rows is: {end_time - start_time}")

In [None]:
# for i in range(1, 101):
#     print('#'*i+' '*(101-i)+f'{i}%', end='\r')
#     time.sleep(0.1)