In [1]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import pairwise_distances


In [2]:
dao = pd.read_csv("data/DAOs_DATA.txt")

In [3]:
dao.head()

Unnamed: 0.1,Unnamed: 0,Dao_title,categories,about,simdaos,chain
0,0,DAOHQ - Yog8 DAO,['Social Impact'],Yog8 DAO encourages wellness in Web3 by offeri...,"['Antidote DAO', 'BoldDao', 'Boss Beauties DAO...","['Ethereum', '89.0']"
1,1,DAOHQ - Lit Protocol DAO,"['DAO Tools', 'Protocol']",Decentralized access control infrastructure de...,"['88mph', 'APY Vision', '0x', '1inch Network']","['Ethereum', '20.6K']"
2,2,DAOHQ - MinMax DAO,['Investment'],The IoTeX native cross-chain pegged assets AMM...,"['80Acres DAO', 'Alpha Finance Lab', '8DAO', '...","['Ethereum', '55.0']"
3,3,DAOHQ - Blockade Games DAO,"['DAO Tools', 'Gaming']",Leading blockchain video game development stud...,"['88mph', 'APY Vision', '0x', '1inch Network']","['Ethereum', '13.1K']"
4,4,DAOHQ - The LAO DAO,"[' www.thelao.io', ' www.thelao.io']","The LAO is a limited-liability, for-profit DAO...",[],[]


# Preprocessing

In [4]:
dao.shape

(1006, 6)

In [5]:
data = dao.drop('Unnamed: 0',axis=1)

In [6]:
data.Dao_title = data.Dao_title.str.replace('DAOHQ - ','').str.lower()

In [7]:
data.categories = data.categories.apply(lambda x: ast.literal_eval(x))
data.categories

categories = ["Collector","DAO Tools","DeFi","Entertainment","Gaming","Grant","Investment","Media",
              "Metaverse","Operating Systems","Protocol","SPAD""Service","Social","Social Impact",
              "Venture Capital"]

def category_replace(x):
    for i, cat in enumerate(x):
        if cat not in categories:
            x[i] = "other"
    return x

data.categories.apply(category_replace)


mlb = MultiLabelBinarizer()



mlb.fit(data.categories)

mlb.classes_

cat=pd.DataFrame(mlb.transform(data.categories),columns=mlb.classes_)

In [8]:
data.chain = data.chain.apply(lambda x: ast.literal_eval(x))

In [9]:
data.chain[0][0]

'Ethereum'

In [10]:
data['blockchain'] = data.chain.apply(lambda x: x[0] if len(x)==2 else None)
data['followers'] = data.chain.apply(lambda x: x[1] if len(x)==2 else None)
data.head()

Unnamed: 0,Dao_title,categories,about,simdaos,chain,blockchain,followers
0,yog8 dao,[Social Impact],Yog8 DAO encourages wellness in Web3 by offeri...,"['Antidote DAO', 'BoldDao', 'Boss Beauties DAO...","[Ethereum, 89.0]",Ethereum,89.0
1,lit protocol dao,"[DAO Tools, Protocol]",Decentralized access control infrastructure de...,"['88mph', 'APY Vision', '0x', '1inch Network']","[Ethereum, 20.6K]",Ethereum,20.6K
2,minmax dao,[Investment],The IoTeX native cross-chain pegged assets AMM...,"['80Acres DAO', 'Alpha Finance Lab', '8DAO', '...","[Ethereum, 55.0]",Ethereum,55.0
3,blockade games dao,"[DAO Tools, Gaming]",Leading blockchain video game development stud...,"['88mph', 'APY Vision', '0x', '1inch Network']","[Ethereum, 13.1K]",Ethereum,13.1K
4,the lao dao,"[other, other]","The LAO is a limited-liability, for-profit DAO...",[],[],,


In [11]:
data.blockchain.unique()

array(['Ethereum', None, 'Solana', 'Polygon', 'Binance', 'Bitcoin',
       'Avalanche', 'NEAR', 'Fantom', 'Gnosis', 'Tezos', 'Arbitrum',
       'Cosmos', 'Arweave', 'Moonriver', 'Kylin'], dtype=object)

In [12]:
data.followers

0         89.0
1        20.6K
2         55.0
3        13.1K
4         None
         ...  
1001     157.0
1002      7.2K
1003    256.9K
1004     35.4K
1005      3.5K
Name: followers, Length: 1006, dtype: object

In [13]:
data.followers[113]

'93.6K'

In [14]:
def clean_followers(x):
    if x:
        if 'M' in x:
            return float(x.replace('M',''))*1000000
        elif 'K'in x:
            return float(x.replace('K',''))*1000
    return 0.

In [15]:
data['followers'] = data.followers.apply(clean_followers)

In [16]:
data.head()

Unnamed: 0,Dao_title,categories,about,simdaos,chain,blockchain,followers
0,yog8 dao,[Social Impact],Yog8 DAO encourages wellness in Web3 by offeri...,"['Antidote DAO', 'BoldDao', 'Boss Beauties DAO...","[Ethereum, 89.0]",Ethereum,0.0
1,lit protocol dao,"[DAO Tools, Protocol]",Decentralized access control infrastructure de...,"['88mph', 'APY Vision', '0x', '1inch Network']","[Ethereum, 20.6K]",Ethereum,20600.0
2,minmax dao,[Investment],The IoTeX native cross-chain pegged assets AMM...,"['80Acres DAO', 'Alpha Finance Lab', '8DAO', '...","[Ethereum, 55.0]",Ethereum,0.0
3,blockade games dao,"[DAO Tools, Gaming]",Leading blockchain video game development stud...,"['88mph', 'APY Vision', '0x', '1inch Network']","[Ethereum, 13.1K]",Ethereum,13100.0
4,the lao dao,"[other, other]","The LAO is a limited-liability, for-profit DAO...",[],[],,0.0


In [17]:
data.drop(['simdaos','chain'], axis=1, inplace=True)

In [18]:
data.head()

Unnamed: 0,Dao_title,categories,about,blockchain,followers
0,yog8 dao,[Social Impact],Yog8 DAO encourages wellness in Web3 by offeri...,Ethereum,0.0
1,lit protocol dao,"[DAO Tools, Protocol]",Decentralized access control infrastructure de...,Ethereum,20600.0
2,minmax dao,[Investment],The IoTeX native cross-chain pegged assets AMM...,Ethereum,0.0
3,blockade games dao,"[DAO Tools, Gaming]",Leading blockchain video game development stud...,Ethereum,13100.0
4,the lao dao,"[other, other]","The LAO is a limited-liability, for-profit DAO...",,0.0


In [19]:
data.to_csv("dao_cleaned_data.csv",  index=False)

In [20]:
from gensim.utils import simple_preprocess

In [21]:
data['about'] = data.about.apply(lambda x: " ".join(simple_preprocess(x)))

In [22]:
data.head()

Unnamed: 0,Dao_title,categories,about,blockchain,followers
0,yog8 dao,[Social Impact],yog dao encourages wellness in web by offering...,Ethereum,0.0
1,lit protocol dao,"[DAO Tools, Protocol]",decentralized access control infrastructure de...,Ethereum,20600.0
2,minmax dao,[Investment],the iotex native cross chain pegged assets amm...,Ethereum,0.0
3,blockade games dao,"[DAO Tools, Gaming]",leading blockchain video game development studio,Ethereum,13100.0
4,the lao dao,"[other, other]",the lao is limited liability for profit dao th...,,0.0


In [23]:
final_data = pd.concat([data, cat], axis=1)

In [24]:
data.shape, cat.shape

((1006, 5), (1006, 15))

In [25]:
final_data.head()

Unnamed: 0,Dao_title,categories,about,blockchain,followers,Collector,DAO Tools,DeFi,Entertainment,Gaming,Grant,Investment,Media,Metaverse,Operating Systems,Protocol,Social,Social Impact,Venture Capital,other
0,yog8 dao,[Social Impact],yog dao encourages wellness in web by offering...,Ethereum,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,lit protocol dao,"[DAO Tools, Protocol]",decentralized access control infrastructure de...,Ethereum,20600.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,minmax dao,[Investment],the iotex native cross chain pegged assets amm...,Ethereum,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,blockade games dao,"[DAO Tools, Gaming]",leading blockchain video game development studio,Ethereum,13100.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,the lao dao,"[other, other]",the lao is limited liability for profit dao th...,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [26]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# corpus = final_data.about.tolist()
# vectorizer = TfidfVectorizer(max_features=5000, preprocessor= lambda x: " ".join(simple_preprocess(x)))
# X = vectorizer.fit_transform(corpus)

# print(X.shape)
# X

(1006, 4673)


<1006x4673 sparse matrix of type '<class 'numpy.float64'>'
	with 23396 stored elements in Compressed Sparse Row format>

In [27]:
X[0].shape

(1, 4673)

In [28]:
final_data.iloc[0].about

'yog dao encourages wellness in web by offering financial incentives for meditation and yoga to normalize self care and sacred geometry animated yoga boss nfts with chakra frequency music to induce meditative experience yog dao is yoga centered social impact dao providing mental health wellness metaverse to incentivize self care by merging gamification with meditation by offering financial rewards for individual and community meditation and yoga sessions we believe yoga and meditation are free mental health tools that should be easily accessible and our goal is to share yoga and meditation with billion people worldwide to normalize self care practices for the well being of our global community conscious contributions include sacred geometry animated yoga boss nft with chakra frequency music for an immediate meditative immersion in addition to the yog dao mental health wellness metaverse platform providing instant access to yoga meditation holistic health therapies and global yoga retre

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

dao_corpus = final_data.about.tolist()
corpus = dao_corpus.copy()

user_corpus = users.choice.tolist()

corpus.extend(user_corpus)

vectorizer = TfidfVectorizer(max_features=5000, preprocessor= lambda x: " ".join(simple_preprocess(x)))
_ = vectorizer.fit(corpus)
X = vectorizer.transform(dao_corpus)

print(X.shape)


(1006, 4697)


In [44]:
from sklearn.neighbors import NearestNeighbors

In [50]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X)

_, idx = knn.kneighbors(X[0])

idx

data.iloc[idx[0]]

Unnamed: 0,Dao_title,categories,about,blockchain,followers
0,yog8 dao,[Social Impact],yog dao encourages wellness in web by offering...,Ethereum,0.0
664,8dao,"[Social, Investment]",our mission is to connect like minded individu...,,0.0
84,social impact dao llc dao,[Social Impact],to empower communities through disrupting and ...,,0.0
105,wagmisaurus dao,"[Collector, Media]",the wagmisaurus project was created with one g...,,0.0
541,miami tech runs dao,[other],building the ultimate playground at the inters...,,0.0


In [37]:
users = pd.read_csv("user_dummy_data.csv")

In [38]:
users.head()

Unnamed: 0,age,gender,country,city,income,blockchains,category,choice
0,21,female,India,Bangalore,35000,[],"['music', 'sports', 'books', 'travel', 'finance']",music techno sports badminton cricket books m...
1,62,male,US,Philadelphia,95000,['Ripple'],"['sports', 'travel', 'finance']",sports table_tennis basketball cricket travel...
2,47,female,UK,Newcastle,100000,"['Ripple', 'Binance Smart Chain', 'Ethereum']","['music', 'sports', 'books', 'travel', 'finance']",music techno pop sports table_tennis books my...
3,61,female,US,San Antonio,115000,"['Ripple', 'Ethereum', 'Binance Smart Chain']","['music', 'books', 'travel', 'finance']",music country books science_fiction romance t...
4,43,female,US,Dallas,65000,"['Binance Smart Chain', 'Flow']","['music', 'sports', 'books', 'travel']",music rhythmandblues(r&b) country pop sports ...


In [49]:
 vectorizer.transform([users.choice.iloc[0]])

<1x4697 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [55]:
users.choice.iloc[4]

' music rhythmandblues(r&b) country pop sports table_tennis books mysteries romance travel the_caravan/rv_road_trip '

In [56]:
#knn = NearestNeighbors(n_neighbors=5, metric='cosine')
#knn.fit(X)

_, idx = knn.kneighbors(vectorizer.transform([users.choice.iloc[2]+" solana ripple"]))

data.iloc[idx[0]]

Unnamed: 0,Dao_title,categories,about,blockchain,followers
769,switchboard dao,[Social],community curated oracle network on solana,Solana,11600.0
968,monkedao,"[DeFi, Collector, Social]",solana first community owned and operated nft dao,Solana,70100.0
50,socean dao,[DeFi],the best risk free yields on solana,Solana,2500.0
491,drift protocol dao,"[DeFi, Protocol]",decentralized exchange for perpetual futures b...,Solana,51900.0
816,solend dao,[DeFi],algorithmic decentralized lending and borrowin...,Solana,14800.0


In [58]:
"solana" in vectorizer.vocabulary_

True