In [1]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import pairwise_distances
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.utils import simple_preprocess


In [2]:
dao = pd.read_csv("data/DAOs_DATA.txt")

# Preprocessing

In [3]:
data = dao.drop('Unnamed: 0',axis=1)

data.Dao_title = data.Dao_title.str.replace('DAOHQ - ','').str.lower()

data.categories = data.categories.apply(lambda x: ast.literal_eval(x))


categories = ["Collector","DAO Tools","DeFi","Entertainment","Gaming","Grant","Investment","Media",
              "Metaverse","Operating Systems","Protocol","SPAD""Service","Social","Social Impact",
              "Venture Capital"]

def category_replace(x):
    for i, cat in enumerate(x):
        if cat not in categories:
            x[i] = "other"
    return x

data.categories = data.categories.apply(category_replace)

In [4]:
data.chain = data.chain.apply(lambda x: ast.literal_eval(x))

data['blockchain'] = data.chain.apply(lambda x: x[0] if len(x)==2 else None)
data['followers'] = data.chain.apply(lambda x: x[1] if len(x)==2 else None)

def clean_followers(x):
    if x:
        if 'M' in x:
            return float(x.replace('M',''))*1000000
        elif 'K'in x:
            return float(x.replace('K',''))*1000
    return 0.

data['followers'] = data.followers.apply(clean_followers)

In [5]:
data.drop(['simdaos','chain'], axis=1, inplace=True)


data['about'] = data.about.apply(lambda x: " ".join(simple_preprocess(x)))
data.blockchain.fillna("other", inplace=True)
data['blockchain'] = data.blockchain.str.lower()

In [6]:
data.isna().sum()

Dao_title     0
categories    0
about         0
blockchain    0
followers     0
dtype: int64

In [26]:
data.followers = data.followers.astype(int)

In [27]:
data.to_csv("dao_cleaned_data.csv",  index=False)

In [22]:
data.columns

Index(['Dao_title', 'categories', 'about', 'blockchain', 'followers'], dtype='object')

In [8]:
data.head()

Unnamed: 0,Dao_title,categories,about,blockchain,followers
0,yog8 dao,[Social Impact],yog dao encourages wellness in web by offering...,ethereum,0.0
1,lit protocol dao,"[DAO Tools, Protocol]",decentralized access control infrastructure de...,ethereum,20600.0
2,minmax dao,[Investment],the iotex native cross chain pegged assets amm...,ethereum,0.0
3,blockade games dao,"[DAO Tools, Gaming]",leading blockchain video game development studio,ethereum,13100.0
4,the lao dao,"[other, other]",the lao is limited liability for profit dao th...,other,0.0


In [9]:
dao_cat = data.categories.apply(lambda x: " ".join([ele.replace(" ","_").lower() for ele in x]))
dao_sentences = data.blockchain + " " + data.about + " " + dao_cat

In [10]:
dao_sentences.isna().sum()

0

In [11]:
users = pd.read_csv("user_dummy_data.csv")

In [12]:
users.head()

Unnamed: 0,age,gender,country,city,income,blockchains,category,choice
0,21,female,India,Bangalore,35000,[],"['music', 'sports', 'books', 'travel', 'finance']",music techno sports badminton cricket books m...
1,62,male,US,Philadelphia,95000,['Ripple'],"['sports', 'travel', 'finance']",sports table_tennis basketball cricket travel...
2,47,female,UK,Newcastle,100000,"['Ripple', 'Binance Smart Chain', 'Ethereum']","['music', 'sports', 'books', 'travel', 'finance']",music techno pop sports table_tennis books my...
3,61,female,US,San Antonio,115000,"['Ripple', 'Ethereum', 'Binance Smart Chain']","['music', 'books', 'travel', 'finance']",music country books science_fiction romance t...
4,43,female,US,Dallas,65000,"['Binance Smart Chain', 'Flow']","['music', 'sports', 'books', 'travel']",music rhythmandblues(r&b) country pop sports ...


In [13]:
users.blockchains = users.blockchains.apply(eval)

In [14]:
users_one = users.blockchains.apply(lambda x: " ".join([ele.replace(" ","_").lower() for ele in x]))
user_sentences = users.choice + " " + users_one

In [15]:
all_sents = np.hstack([dao_sentences, user_sentences])

In [16]:
def preprocessor(x):
    return " ".join(simple_preprocess(x))

vectorizer = TfidfVectorizer(max_features=5000, preprocessor=preprocessor)
_ = vectorizer.fit(all_sents)
X= vectorizer.transform(dao_sentences)

print(X.shape)


(1006, 4704)


In [17]:
len(vectorizer.vocabulary_)

4704

In [18]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X)

_, idx = knn.kneighbors(X[0].toarray())
print(user_sentences[0])

data.iloc[idx[0]]

 music techno sports badminton cricket books mysteries romance thrillers travel the_gap_year the_package_holiday finance stock_market insurance mutual_funds  


Unnamed: 0,Dao_title,categories,about,blockchain,followers
0,yog8 dao,[Social Impact],yog dao encourages wellness in web by offering...,ethereum,0.0
664,8dao,"[Social, Investment]",our mission is to connect like minded individu...,other,0.0
84,social impact dao llc dao,[Social Impact],to empower communities through disrupting and ...,other,0.0
105,wagmisaurus dao,"[Collector, Media]",the wagmisaurus project was created with one g...,other,0.0
30,blu3 dao,"[Social, Social Impact]",empowering women nonbinary and allies to learn...,other,0.0


In [19]:
print(user_sentences[2])
_, idx = knn.kneighbors(vectorizer.transform([user_sentences[2]]))

data.iloc[idx[0]]

 music techno pop sports table_tennis books mysteries science_fiction travel event_travel finance professional_advisory mutual_funds  ripple binance_smart_chain ethereum


Unnamed: 0,Dao_title,categories,about,blockchain,followers
291,rainbow dao,"[Protocol, DeFi]",ethereum wallet,ethereum,50100.0
313,unslashed finance dao,"[Protocol, DeFi]",decentralized insurance protocol built on ethe...,ethereum,10300.0
524,request dao,"[DAO Tools, DeFi]",ethereum based decentralized payment system,ethereum,66700.0
23,polymath dao,"[DAO Tools, DeFi]",decentralized protocol that operates on the et...,ethereum,76300.0
476,gnosis safe dao,"[DAO Tools, DeFi]",platform to manage digital assets on ethereum,ethereum,56600.0


In [20]:
import pickle
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))
pickle.dump(knn, open("knn.pkl", "wb"))