In [1]:
# data manipulation
import pandas as pd
import numpy as np

import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing

from nltk.corpus import stopwords
from nltk import SnowballStemmer
import string

In [None]:
df = pd.read_csv("../raw/Onboard_Survey.csv")

df.head()

In [None]:
# skip first seven columns df.iloc[:, 0:7].head()

# selecting only open-ended responses 
df.iloc[:, 6:14].head()

open_ended = df.iloc[:, 6:14]

open_ended.head()

In [4]:
# rename columns to better manage columns
open_ended.columns = ['walletwhat_walletwhy', 'wallet_pain', 'defi_when', 'defiwhat_defiwhy', 'defi_pain', 'defi_outcome', 'defi_interest', 'defi_endgame']


open_ended.head()

Unnamed: 0,walletwhat_walletwhy,wallet_pain,defi_when,defiwhat_defiwhy,defi_pain,defi_outcome,defi_interest,defi_endgame
0,Trezor - needed cold storage.,keeping up with all the security parameters,Within the last year,uniswap - seems to have a stellar reputation.,Learning how to navigate web3 websites.,Discovered new financial products and revenue ...,Alchemix,Passive income through DeFi
1,"Trustwallet, was not knowing much,","still not coming to terms, which wallet to use...",I have never used DeFi,,,,,
2,"Coinbase, ease of transactions",,Within the last year,,,,AAVE,Move my traditional investments over
3,"trezor, it just works and its secure","setting up is painful, and dealing with the se...",Within the last year,"Uniswap, i had to trade between assets",Gas fees are fluctuating each second,lost money from weird protocols,Options,Become a DeFi native and have more DeFi assets...
4,Coinbase bc it was a whileee ago,"Feees, centralization etc",3-5 years ago,,,,,


In [None]:
# This is Part 2 of Onboard Survey Exploratory Analysis

# For Part 1 see onboard_survey_open_ended.ipynb
# For Part 1 https://forum.bankless.community/t/onboard-survey-exploratory-analysis/1048

# Part 2 Open-Ended questions to address include:

# What has been painful about using DeFi apps or what has or is an obstacle in your way to using a DeFi app? [column: defi_pain]
# Tell us about one positive or unexpected outcome you had from using a DeFi app? [column: defi_outcome]
# What DeFi app are you most interested in using? [column: defi_interest]
# What is your DeFi endgame? [column: defi_endgame]

In [6]:
# actual columns of interest
open_ended.iloc[:, 4:].head()

Unnamed: 0,defi_pain,defi_outcome,defi_interest,defi_endgame
0,Learning how to navigate web3 websites.,Discovered new financial products and revenue ...,Alchemix,Passive income through DeFi
1,,,,
2,,,AAVE,Move my traditional investments over
3,Gas fees are fluctuating each second,lost money from weird protocols,Options,Become a DeFi native and have more DeFi assets...
4,,,,


In [None]:
# focus on 1 column first, defi_pain
# What has been painful about using DeFi apps or what has or is an obstacle in your way to using a DeFi app? [column: defi_pain]

open_ended['defi_pain']

In [None]:
# Topic Modeling

# Preparing Text Data for NLP
# Goal: Turn text data in to matrix (row = document, column = feature)

# Steps: 

# forming a corpus of text
# stemming and lemmatization
# tokenization
# removing stop-words
# finding words co-located together (N-grams)

In [8]:
# Example of how a Stemmer works

stemmer = SnowballStemmer('english')
print(stemmer.stem('lies'))
print(stemmer.stem('lying'))
print(stemmer.stem('systematic'))
print(stemmer.stem('running'))

lie
lie
systemat
run


In [13]:
# Apply Stemming & Lemmatization to defi_pain

# take entire column in open_ended df
# split sentences (each row) into words
# store in empty list

defi_pain_list = []

# 12 Rows Removed
for row in open_ended['defi_pain']:
    try:
        defi_pain_list.append(row.split())
    except:
        continue
        
defi_pain_list  # this is a Nested list - list of list; 

[['Learning', 'how', 'to', 'navigate', 'web3', 'websites.'],
 ['Gas', 'fees', 'are', 'fluctuating', 'each', 'second'],
 ['Terrible', 'UIUX', 'and', 'I', "don't", 'understand', 'it', 'all.'],
 ['Gas', 'fees'],
 ['gas',
  'fees',
  'on',
  'eth',
  'main',
  'net.',
  'resolved',
  'by',
  'doing',
  'most',
  'things',
  'on',
  'polygon'],
 ['Gas', 'fees'],
 ['Lack',
  'of',
  'user',
  'friendly',
  'interfaces',
  'or',
  'documentation',
  'that',
  'is',
  'not',
  'detailed',
  'enough'],
 ['gas', 'costs'],
 ['Ignoring',
  'the',
  'obvious',
  'gas',
  'fees',
  'answer,',
  "I'm",
  'bad',
  'at',
  'math,',
  'and',
  'doing',
  'the',
  'more',
  'advanced',
  'things',
  'like',
  'providing',
  'liquidity',
  'concern',
  'me.',
  'I',
  'want',
  'to',
  'strike',
  'a',
  'more',
  'conservative',
  'risk/reward',
  'balance.'],
 ['Gas', 'fees.', 'Also', 'not', 'understand', 'liquidation', 'or', 'IL.'],
 ['Not',
  'having',
  'an',
  'easy',
  'way',
  'to',
  'play',
  'w

In [None]:
# Loop through defi_pain_list[0] and apply stemming

for word in defi_pain_list[0]:
    print(stemmer.stem(word))

In [19]:
# Loop through defi_pain_list (list of list) and apply stemming

for list in defi_pain_list:
    for word in list:
        print(stemmer.stem(word))

learn
how
to
navig
web3
websites.
gas
fee
are
fluctuat
each
second
terribl
uiux
and
i
don't
understand
it
all.
gas
fee
gas
fee
on
eth
main
net.
resolv
by
do
most
thing
on
polygon
gas
fee
lack
of
user
friend
interfac
or
document
that
is
not
detail
enough
gas
cost
ignor
the
obvious
gas
fee
answer,
i'm
bad
at
math,
and
do
the
more
advanc
thing
like
provid
liquid
concern
me.
i
want
to
strike
a
more
conserv
risk/reward
balance.
gas
fees.
also
not
understand
liquid
or
il.
not
have
an
easi
way
to
play
with
defi
apps.
best
thing
i
can
do
is
go
to
an
l2
network
and
play
around,
but
there
are
disadvantag
to
that
method.
basic
just
have
a
sandbox
to
touch
and
feel
the
protocol.
rug
pulls;
unclear
token
mechan
understand
whi
pair
are
necessari
for
amms,
then
thing
like
imperman
loss
not
realli
understand
the
impact
of
il,
the
cost
of
execut
a
smart
contract,
keep
track
of
p&l
most
assum
that
you
alreadi
know
what
you'r
doing.
less
clear
for
noobs.
lack
of
document
wait
for
confirm
or
have
to
do
di

mention
issu
with
metamask/ledg
combo
(just
make
everyth
take
longer
than
it
should
and
i'm
not
a
veri
patient
person!),
and
2)
the
gas
fee
associ
with
use
defi
app
on
the
eth
chain.
calcul
ip
some
are
too
complic
and
it
scare
me
away
i
find
it
hard
to
gaug
a
project
popularity,
and
legitimacy.
it
ok.
some
could
have
better
ux.
earli
on
it
not
alway
clear
what
to
do
when
you
do
not
understand
staking,
lps,
etc.
some
front
end
are
veri
sketchi
look
(curve).
then
the
crazi
high
price
of
gas
made
ani
transact
not
worth
it.
thank
that
seem
better
now.
too
littl
documentation.
they
are
also
get
a
bit
more
complic
now.
usual
document
for
new
use
is
poor.
onboard
experi
isn't
for
the
faint
of
heart.
understand
concept
difficult
concepts.
veri
high
fees.
trial
and
error
cost
money.
fear
of
imperman
loss.
stablecoin
return
seem
(i
can
be
wrong)
lower
than
in
cefi,
and
sinc
they
move,
unclear
if
i
incurr
in
opportun
cost.
access
to
less
known
tokens;
high
gas
fee
initi
il,
recent
gas
to
execut
m

In [26]:
# Example of Removing Punctuation

# Before
defi_pain_list[0]

# Create translator
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# After removes "." period in "websites."
defi_pain_list[0][5].translate(translator)

# NOTE: This only works on indiviual strings/words, NOT on lists

'websites '

In [31]:
## Tokenizing

# Create a function to take a string, split into individual words, 
# Remove punctuation, stemming and tokenizing all in ONE function

# overlaps slightly with above

# 12 rows got removed with defi_pain_list

defi_pain_list

[['Learning', 'how', 'to', 'navigate', 'web3', 'websites.'],
 ['Gas', 'fees', 'are', 'fluctuating', 'each', 'second'],
 ['Terrible', 'UIUX', 'and', 'I', "don't", 'understand', 'it', 'all.'],
 ['Gas', 'fees'],
 ['gas',
  'fees',
  'on',
  'eth',
  'main',
  'net.',
  'resolved',
  'by',
  'doing',
  'most',
  'things',
  'on',
  'polygon'],
 ['Gas', 'fees'],
 ['Lack',
  'of',
  'user',
  'friendly',
  'interfaces',
  'or',
  'documentation',
  'that',
  'is',
  'not',
  'detailed',
  'enough'],
 ['gas', 'costs'],
 ['Ignoring',
  'the',
  'obvious',
  'gas',
  'fees',
  'answer,',
  "I'm",
  'bad',
  'at',
  'math,',
  'and',
  'doing',
  'the',
  'more',
  'advanced',
  'things',
  'like',
  'providing',
  'liquidity',
  'concern',
  'me.',
  'I',
  'want',
  'to',
  'strike',
  'a',
  'more',
  'conservative',
  'risk/reward',
  'balance.'],
 ['Gas', 'fees.', 'Also', 'not', 'understand', 'liquidation', 'or', 'IL.'],
 ['Not',
  'having',
  'an',
  'easy',
  'way',
  'to',
  'play',
  'w

In [32]:
# Tokenize Function

def tokenize(text):
    translator=str.maketrans(string.punctuation, ' '*len(string.punctuation)) # translator replace punct w empty space
    return [stemmer.stem(i) for i in text.translate(translator).split()]



In [38]:
# Loop through list of list (defi_pain_list) 
# Apply tokenize() function
# save output to new list
# output needs to be a vector of individual words

# NOTE: Because tokenize() function returns a list, each word will be put into it's own list

defi_pain_tokenize = []

for list in defi_pain_list:
    for word in list:
        defi_pain_tokenize.append(tokenize(word))  # This ia a "Bag of Words" - a list
        
defi_pain_tokenize

# Last step need to FLATTEN a list of lists into one list/vector of words - "Bag of Words"
# Bag of word, a list cleaned of punctuation, stemmed, now a vector of individual words

defi_pain_tokenize_flat = [item for sublist in defi_pain_tokenize for item in sublist]

defi_pain_tokenize_flat

['learn',
 'how',
 'to',
 'navig',
 'web3',
 'websit',
 'gas',
 'fee',
 'are',
 'fluctuat',
 'each',
 'second',
 'terribl',
 'uiux',
 'and',
 'i',
 'don',
 't',
 'understand',
 'it',
 'all',
 'gas',
 'fee',
 'gas',
 'fee',
 'on',
 'eth',
 'main',
 'net',
 'resolv',
 'by',
 'do',
 'most',
 'thing',
 'on',
 'polygon',
 'gas',
 'fee',
 'lack',
 'of',
 'user',
 'friend',
 'interfac',
 'or',
 'document',
 'that',
 'is',
 'not',
 'detail',
 'enough',
 'gas',
 'cost',
 'ignor',
 'the',
 'obvious',
 'gas',
 'fee',
 'answer',
 'i',
 'm',
 'bad',
 'at',
 'math',
 'and',
 'do',
 'the',
 'more',
 'advanc',
 'thing',
 'like',
 'provid',
 'liquid',
 'concern',
 'me',
 'i',
 'want',
 'to',
 'strike',
 'a',
 'more',
 'conserv',
 'risk',
 'reward',
 'balanc',
 'gas',
 'fee',
 'also',
 'not',
 'understand',
 'liquid',
 'or',
 'il',
 'not',
 'have',
 'an',
 'easi',
 'way',
 'to',
 'play',
 'with',
 'defi',
 'app',
 'best',
 'thing',
 'i',
 'can',
 'do',
 'is',
 'go',
 'to',
 'an',
 'l2',
 'network',
 'an

In [None]:
# CountVectorizer, a library imported from sklearn, that will tokenize, 
# but also count duplicates of words and create a matrix that contains the frequency of each word
# This is large matrix, so the output is a sparse matrix

# Process: (similar to fitting models in sklearn), we create the vectorizer object
# then fit each word to give an overall corpus bag of words and list of features (unique words)

In [48]:
vectorizer = CountVectorizer(analyzer= "word",
                            tokenizer=tokenize,
                            ngram_range=(0,1),
                            strip_accents='unicode',
                            min_df = 0.0,
                            max_df = 1)        # got an error to lower min_df and raise max_df

In [51]:
defi_pain_bag_of_words = vectorizer.fit_transform(defi_pain_tokenize_flat) # transform our corpus into a bag of words
defi_pain_features = vectorizer.get_feature_names()


defi_pain_features

['0',
 '1',
 '10',
 '1001',
 '1559',
 '2',
 '2020',
 '2nd',
 '37',
 '422',
 '5',
 'abov',
 'accomplish',
 'account',
 'achiev',
 'across',
 'action',
 'add',
 'address',
 'advanc',
 'afraid',
 'again',
 'against',
 'alchemix',
 'almost',
 'along',
 'alreadi',
 'anxious',
 'anyth',
 'appear',
 'applic',
 'approv',
 'area',
 'argent',
 'asid',
 'assum',
 'attend',
 'audit',
 'authent',
 'averag',
 'awar',
 'background',
 'badger',
 'bankless',
 'bare',
 'barrier',
 'basic',
 'behind',
 'beng',
 'biggest',
 'biuy',
 'blockcash',
 'blog',
 'break',
 'bridg',
 'bsc',
 'bug',
 'buggi',
 'build',
 'bullish',
 'case',
 'cefi',
 'central',
 'certain',
 'chain',
 'changer',
 'chore',
 'combo',
 'come',
 'communiti',
 'compat',
 'confid',
 'consequ',
 'conserv',
 'consid',
 'convolut',
 'costum',
 'crazi',
 'creat',
 'cross',
 'damocl',
 'dead',
 'decid',
 'delight',
 'depend',
 'deposit',
 'deriv',
 'did',
 'didn',
 'differenti',
 'difficulti',
 'dig',
 'direct',
 'disadvantag',
 'doubt',
 'drop

In [54]:
#print(defi_pain_bag_of_words)
#print(defi_pain_features)

#defi_pain_features[0:10]

['0', '1', '10', '1001', '1559', '2', '2020', '2nd', '37', '422']

In [55]:
# Latent Dirichlet Allocation

lda = LatentDirichletAllocation(learning_method='online') 

doctopic = lda.fit_transform( defi_pain_bag_of_words )

doctopic

array([[0.1 , 0.1 , 0.1 , ..., 0.1 , 0.1 , 0.1 ],
       [0.1 , 0.1 , 0.1 , ..., 0.1 , 0.1 , 0.1 ],
       [0.1 , 0.1 , 0.1 , ..., 0.1 , 0.1 , 0.1 ],
       ...,
       [0.1 , 0.1 , 0.1 , ..., 0.1 , 0.1 , 0.1 ],
       [0.1 , 0.1 , 0.1 , ..., 0.1 , 0.1 , 0.1 ],
       [0.05, 0.55, 0.05, ..., 0.05, 0.05, 0.05]])

In [57]:
# NOTE: This might not look helpful at first

defi_pain_keywords_list = []

for i, topic in enumerate(lda.components_):
    word_idx = np.argsort(topic)[::-1][:5]
    defi_pain_keywords = ', '.join(defi_pain_features[i] for i in word_idx)
    defi_pain_keywords_list.append(defi_pain_keywords)
    print(i, defi_pain_keywords)
    


0 won, rural, platform, thank, crazi
1 volatil, prohibit, teach, certain, simpl
2 recent, initi, gaug, project, 2
3 faith, area, fair, v3, simplic
4 almost, ground, 5, riski, sketchi
5 uni, featur, huge, dead, howev
6 exposur, haven, lower, legitimaci, ip
7 found, trial, select, faint, person
8 function, emiss, rate, leap, rare
9 your, due, sinc, error, wrong


In [None]:
## NOTE: The above does not appear to contain stop words


In [None]:
## N-Grams: Adding context by creating N-Grams

# instead of treating each word as an individual unit
# treat each group of 2 words or 3 words or n-words as a unit
# "Bag of n-grams", where n is the number of words in each chunk