### Import essential libraries

In [1]:
import pandas as pd
import texthero as hero # for effective dataframe cleaning 
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize 
pd.set_option('display.max_columns', None)



### Read and clean main data

In [2]:
# reading praise data
df_praise = pd.read_csv("data/cleaned-non-quantifier-data.csv", index_col=False)

# convert date column from object to datetime format
df_praise["Date"] =  pd.to_datetime(df_praise["Date"], format='%Y-%m-%d')

# droppping unnecessary columns
df_praise.drop(["Year", "Month", "Day"], axis=1, inplace=True)

df_praise[(df_praise['IH per Praise'] -  48.829077) < 1]

df_praise.head(4)

Unnamed: 0,To,From,Reason for dishing,Server,Date,Room,v1 norm,v2 norm,v3 norm,Avg %,IH per Praise,IH per person,Unnamed: 12,v1,v2,v3,period,Cred per Praise,Cred per person,To.1,Room-NoEmoji,Source
0,zeptimusQ,Tam2140,for hosting this kicking params party!,Token Engineering Commons,2021-05-07,🙏praise,10000.0,100.0,200.0,0.001963,2.355252,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise
1,zeptimusQ,iviangita,for hosting and leading a lot of params parties,Token Engineering Commons,2021-05-07,🙏praise,10000.0,100.0,100.0,0.001663,1.995539,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise
2,zeptimusQ,JuankBell,for testing and deploying the bot to record an...,Token Engineering Commons,2021-04-28,🙏praise,1000.0,200.0,200.0,0.001342,1.610872,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise
3,zeptimusQ,iviangita,for the huge success of the MVV process,Token Engineering Commons,2021-04-30,🙏praise,1000.0,200.0,100.0,0.001043,1.25116,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise


### Read and clean tag data

In [3]:
# reading tag data
df_tag = pd.read_csv("data/Praise_quantifier_tags.csv", index_col=False)

# concatenating keywords and category columns to have a one comprehensive key column
df_tag["keyword"] = df_tag["Category"].str.cat(df_tag["Keywords"], sep=", ")
df_tag["keyword"].fillna(df_tag['Category'], inplace=True)
df_tag = df_tag.iloc[:16, [0,6]]
df_tag.columns = ["tag", "keyword"]

# some cleaning
df_tag["keyword"] = hero.clean(df_tag["keyword"])
df_tag

  return input.str.replace(r"^\d+\s|\s\d+\s|\s\d+$", " ")
  return input.str.replace(pat, '')


Unnamed: 0,tag,keyword
0,TEC1,comms article blog organized presentation grap...
1,TEC2,culture build soft gov soft gov survey vote vo...
2,TEC3,params parameters params param param parties
3,TEC4,legal legal legal legal strategy
4,TEC5,commons swarm tech dev dapp app tech dev dapp ...
5,TEC6,hatch outreach onboarding onboarding hatch out...
6,TEC7,omega survey philosophy
7,TEC8,stewards github steward project management
8,TEC9,labs
9,TEC10,transparency transparency youtube recording tr...


### Building `Doc2Vec` model and traing it

In [4]:
# list of keyword data
data = list(df_tag.keyword)

# building tagged data with respect to name of tags
tagged_data = [TaggedDocument(words=word_tokenize(d.lower()), tags=[str(i)]) for  d, i in zip(data, df_tag["tag"])]

# initializing some variables
max_epochs = 100
vec_size = 20
alpha = 0.025
# building model
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

# training model with respect to max_epochs as a number of iteration
for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=60)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# save model, then later we can load and add to it
# without need to train it from scratch
model.save("data/cache/tag_d2v.model")

# model= Doc2Vec.load("data/cache/d2v.model")

### Testing model

In [5]:
# to find the vector of a document which is not in training data
test_data = word_tokenize("recording".lower())
v1 = model.infer_vector(test_data)

# to find most similar doc using tags
model.dv.most_similar([v1])

[('TEC10', 0.8960395455360413),
 ('TEC13', 0.6778905987739563),
 ('TEC12', 0.6620689630508423),
 ('TEC8', 0.6302334070205688),
 ('TEC15', 0.6228792071342468),
 ('TEC16', 0.6140098571777344),
 ('TEC4', 0.6041674613952637),
 ('TEC17', 0.60159832239151),
 ('TEC2', 0.5972786545753479),
 ('TEC14', 0.5870808362960815)]

### Adding tag column to praise data and save it

In [6]:
# adding tag to data
tag_list = []

for index_, row in df_praise.iterrows():
    tag_doc = word_tokenize(row[2].lower())
    tag_doc_vector = model.infer_vector(tag_doc)
    tag_list.append(model.dv.most_similar([tag_doc_vector])[0][0])
    
df_praise["tags"] = tag_list
df_praise.to_csv("data/tagged_praise_data.csv")

In [7]:
df_praise.head(10)

Unnamed: 0,To,From,Reason for dishing,Server,Date,Room,v1 norm,v2 norm,v3 norm,Avg %,IH per Praise,IH per person,Unnamed: 12,v1,v2,v3,period,Cred per Praise,Cred per person,To.1,Room-NoEmoji,Source,tags
0,zeptimusQ,Tam2140,for hosting this kicking params party!,Token Engineering Commons,2021-05-07,🙏praise,10000.0,100.0,200.0,0.001963,2.355252,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC3
1,zeptimusQ,iviangita,for hosting and leading a lot of params parties,Token Engineering Commons,2021-05-07,🙏praise,10000.0,100.0,100.0,0.001663,1.995539,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC3
2,zeptimusQ,JuankBell,for testing and deploying the bot to record an...,Token Engineering Commons,2021-04-28,🙏praise,1000.0,200.0,200.0,0.001342,1.610872,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC7
3,zeptimusQ,iviangita,for the huge success of the MVV process,Token Engineering Commons,2021-04-30,🙏praise,1000.0,200.0,100.0,0.001043,1.25116,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC6
4,zeptimusQ,iviangita,"for his awesome work on the recorder bot, for ...",Token Engineering Commons,2021-04-30,🙏praise,1000.0,200.0,100.0,0.001043,1.25116,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC10
5,zeptimusQ,Tam2140,for taking care of Ivy's access to Twitter DMs!!,Token Engineering Commons,2021-04-30,🙏praise,1000.0,100.0,200.0,0.001023,1.227785,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC9
6,zeptimusQ,GriffGreen,and the many other community members that did ...,Token Engineering Commons,2021-05-03,🎫general,1000.0,100.0,200.0,0.001023,1.227785,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,general,Token Engineering Commons:general,TEC15
7,zeptimusQ,Tam2140,"for keeping the Stewards, and all Working Grou...",Token Engineering Commons,2021-05-06,🙏praise,1000.0,100.0,200.0,0.001023,1.227785,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC8
8,zeptimusQ,iviangita,for all the awesome MEMES!,Token Engineering Commons,2021-05-07,🙏praise,1000.0,100.0,200.0,0.001023,1.227785,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC1
9,zeptimusQ,cranders71,for attending the 1Hive TEC Hatch Params Party...,Token Engineering Commons,2021-05-07,🙏praise,1000.0,100.0,200.0,0.001023,1.227785,39.04072,zeptimusQ,Metaverde,Juank,Edu,#17 May 7,,,,praise,Token Engineering Commons:praise,TEC16
