In [1]:
# data manipulation
import pandas as pd
import numpy as np

import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing

from nltk.corpus import stopwords
from nltk import SnowballStemmer
import string

In [None]:
df = pd.read_csv("../raw/Onboard_Survey.csv")

df.head()

In [None]:
# skip first seven columns df.iloc[:, 0:7].head()

# selecting only open-ended responses 
df.iloc[:, 6:14].head()

open_ended = df.iloc[:, 6:14]

open_ended.head()

In [4]:
# rename columns to better manage columns
open_ended.columns = ['walletwhat_walletwhy', 'wallet_pain', 'defi_when', 'defiwhat_defiwhy', 'defi_pain', 'defi_outcome', 'defi_interest', 'defi_endgame']


open_ended.head()

Unnamed: 0,walletwhat_walletwhy,wallet_pain,defi_when,defiwhat_defiwhy,defi_pain,defi_outcome,defi_interest,defi_endgame
0,Trezor - needed cold storage.,keeping up with all the security parameters,Within the last year,uniswap - seems to have a stellar reputation.,Learning how to navigate web3 websites.,Discovered new financial products and revenue ...,Alchemix,Passive income through DeFi
1,"Trustwallet, was not knowing much,","still not coming to terms, which wallet to use...",I have never used DeFi,,,,,
2,"Coinbase, ease of transactions",,Within the last year,,,,AAVE,Move my traditional investments over
3,"trezor, it just works and its secure","setting up is painful, and dealing with the se...",Within the last year,"Uniswap, i had to trade between assets",Gas fees are fluctuating each second,lost money from weird protocols,Options,Become a DeFi native and have more DeFi assets...
4,Coinbase bc it was a whileee ago,"Feees, centralization etc",3-5 years ago,,,,,


In [None]:
# This is Part 2 of Onboard Survey Exploratory Analysis

# For Part 1 see onboard_survey_open_ended.ipynb
# For Part 1 https://forum.bankless.community/t/onboard-survey-exploratory-analysis/1048

# Part 2 Open-Ended questions to address include:

# What has been painful about using DeFi apps or what has or is an obstacle in your way to using a DeFi app? [column: defi_pain]
# Tell us about one positive or unexpected outcome you had from using a DeFi app? [column: defi_outcome]
# What DeFi app are you most interested in using? [column: defi_interest]
# What is your DeFi endgame? [column: defi_endgame]

In [None]:
# actual columns of interest
open_ended.iloc[:, 4:].head()

In [None]:
# focus on 1 column first, defi_pain
# What has been painful about using DeFi apps or what has or is an obstacle in your way to using a DeFi app? [column: defi_pain]

open_ended['defi_pain']

In [None]:
# Topic Modeling

# Preparing Text Data for NLP
# Goal: Turn text data in to matrix (row = document, column = feature)

# Steps: 

# forming a corpus of text
# stemming and lemmatization
# tokenization
# removing stop-words
# finding words co-located together (N-grams)

In [10]:
# Example of how a Stemmer works

stemmer = SnowballStemmer('english')
print(stemmer.stem('lies'))
print(stemmer.stem('lying'))
print(stemmer.stem('systematic'))
print(stemmer.stem('running'))

lie
lie
systemat
run


In [None]:
# Apply Stemming & Lemmatization to defi_pain

# take entire column in open_ended df
# split sentences (each row) into words
# store in empty list

defi_pain_list = []

# 12 Rows Removed
for row in open_ended['defi_pain']:
    try:
        defi_pain_list.append(row.split())
    except:
        continue
        
defi_pain_list  # this is a Nested list - list of list; 

In [None]:
# Loop through defi_pain_list[0] and apply stemming

for word in defi_pain_list[0]:
    print(stemmer.stem(word))

In [None]:
# Loop through defi_pain_list (list of list) and apply stemming

for list in defi_pain_list:
    for word in list:
        print(stemmer.stem(word))

In [26]:
# Example of Removing Punctuation

# Before
defi_pain_list[0]

# Create translator
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# After removes "." period in "websites."
defi_pain_list[0][5].translate(translator)

# NOTE: This only works on indiviual strings/words, NOT on lists

'websites '

In [None]:
## Tokenizing

# Create a function to take a string, split into individual words, 
# Remove punctuation, stemming and tokenizing all in ONE function

# overlaps slightly with above

# 12 rows got removed with defi_pain_list

defi_pain_list

In [14]:
# Tokenize Function

def tokenize(text):
    translator=str.maketrans(string.punctuation, ' '*len(string.punctuation)) # translator replace punct w empty space
    return [stemmer.stem(i) for i in text.translate(translator).split()]



In [None]:
# Loop through list of list (defi_pain_list) 
# Apply tokenize() function
# save output to new list
# output needs to be a vector of individual words

# NOTE: Because tokenize() function returns a list, each word will be put into it's own list

defi_pain_tokenize = []

for list in defi_pain_list:
    for word in list:
        defi_pain_tokenize.append(tokenize(word))  # This ia a "Bag of Words" - a list
        
defi_pain_tokenize

# Last step need to FLATTEN a list of lists into one list/vector of words - "Bag of Words"
# Bag of word, a list cleaned of punctuation, stemmed, now a vector of individual words

defi_pain_tokenize_flat = [item for sublist in defi_pain_tokenize for item in sublist]

defi_pain_tokenize_flat

In [None]:
# CountVectorizer, a library imported from sklearn, that will tokenize, 
# but also count duplicates of words and create a matrix that contains the frequency of each word
# This is large matrix, so the output is a sparse matrix

# Process: (similar to fitting models in sklearn), we create the vectorizer object
# then fit each word to give an overall corpus bag of words and list of features (unique words)

In [48]:
vectorizer = CountVectorizer(analyzer= "word",
                            tokenizer=tokenize,
                            ngram_range=(0,1),
                            strip_accents='unicode',
                            min_df = 0.0,
                            max_df = 1)        # got an error to lower min_df and raise max_df

In [None]:
defi_pain_bag_of_words = vectorizer.fit_transform(defi_pain_tokenize_flat) # transform our corpus into a bag of words
defi_pain_features = vectorizer.get_feature_names()


defi_pain_features

In [None]:
#print(defi_pain_bag_of_words)
#print(defi_pain_features)

#defi_pain_features[0:10]

In [None]:
# Latent Dirichlet Allocation

lda = LatentDirichletAllocation(learning_method='online') 

doctopic = lda.fit_transform( defi_pain_bag_of_words )

doctopic

In [None]:
# NOTE: This might not look helpful at first

defi_pain_keywords_list = []

for i, topic in enumerate(lda.components_):
    word_idx = np.argsort(topic)[::-1][:5]      # NOTE: 5
    defi_pain_keywords = ', '.join(defi_pain_features[i] for i in word_idx)
    defi_pain_keywords_list.append(defi_pain_keywords)
    print(i, defi_pain_keywords)
    


In [None]:
## NOTE: The above does not appear to contain stop words


In [None]:
## N-Grams: Adding context by creating N-Grams

# instead of treating each word as an individual unit
# treat each group of 2 words or 3 words or n-words as a unit
# "Bag of n-grams", where n is the number of words in each chunk

In [59]:
bi_vectorizer = CountVectorizer(analyzer= "word",
                                tokenizer=tokenize,
                                ngram_range=(0,2),          # Allow for bigrams
                                strip_accents='unicode',
                                min_df = 0.0,
                                max_df = 1)      # got an error to lower min_df and raise max_df

# Creating bag of words
bi_defi_pain_bag_of_words = bi_vectorizer.fit_transform(defi_pain_tokenize_flat) # transform our corpus into a bag of words
bi_defi_pain_features = bi_vectorizer.get_feature_names()

# Fitting LDA Model
bi_lda = LatentDirichletAllocation(n_components = 5, learning_method='online')   # NOTE: n_components = 5
bi_doctopic = bi_lda.fit_transform(bi_defi_pain_bag_of_words)

# Display the top keywords in each topic
bi_defi_pain_keywords_list = []

for i, topic in enumerate(bi_lda.components_):
    word_idx = np.argsort(topic)[::-1][:10]     # NOTE: 10 instead of 5
    bi_defi_pain_keywords = ', '.join(bi_defi_pain_features[i] for i in word_idx)
    bi_defi_pain_keywords_list.append(bi_defi_pain_keywords)
    print(i, bi_defi_pain_keywords)

0 uni, rural, v3, area, ground, huge, trial, wrong, action, known
1 won, faith, leap, found, howev, certain, error, gwei, move, incurr
2 rare, teach, riski, simplic, exact, recent, near, stablecoin, select, initi
3 function, almost, simpl, featur, fair, emiss, dead, exposur, prohibit, rate
4 your, volatil, platform, faint, thank, gaug, lps, heard, justifi, combo


In [None]:
## TF-IDF: Weighting terms based on frequency

# re-weights words to emphasize words that are unique to a document


In [17]:
# Example Stopwords
stop = stopwords.words('english') + ['invent', 'produce', 'method', 'use', 'first', 'second']
full_stopwords = [tokenize(s)[0] for s in stop]

full_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'our',
 'ourselv',
 'you',
 'you',
 'you',
 'you',
 'you',
 'your',
 'your',
 'yourself',
 'yourselv',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'she',
 'her',
 'her',
 'herself',
 'it',
 'it',
 'it',
 'itself',
 'they',
 'them',
 'their',
 'their',
 'themselv',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'be',
 'have',
 'has',
 'had',
 'have',
 'do',
 'doe',
 'did',
 'do',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'becaus',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'dure',
 'befor',
 'after',
 'abov',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'onc',
 'here',
 'there',
 'when',
 'where',
 'whi',
 'how',
 'all',
 'ani',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'som

In [18]:
tf_defi_pain_vectorizer = CountVectorizer(analyzer= 'word',  # unit of features are single words rather than phrases
                               tokenizer=tokenize, # function to create tokens
                               ngram_range=(0,2),   # Allow for bigrams
                               strip_accents='unicode',
                               stop_words=full_stopwords,  # see above Example Stopwords, other examples did NOT hv stop_words
                               min_df = 0.0,
                               max_df = 1)   # got an error to lower min_df and raise max_df

# Creating bag of words 
tf_defi_pain_bag_of_words = tf_defi_pain_vectorizer.fit_transform(defi_pain_tokenize_flat) # transform our corpus into a bag of words
tf_defi_pain_features = tf_defi_pain_vectorizer.get_feature_names()

# Use TfidfTransformer (see library import) to re-weight bag of words
tf_defi_pain_transformer = TfidfTransformer(norm = None, smooth_idf = True, sublinear_tf = True)
tf_defi_pain_tfidf = tf_defi_pain_transformer.fit_transform(tf_defi_pain_bag_of_words)

# Fitting LDA Model
tf_defi_pain_lda = LatentDirichletAllocation(n_components = 5, learning_method='online')  # NOTE: n_components = 5
tf_defi_pain_doctopic = tf_defi_pain_lda.fit_transform(tf_defi_pain_tfidf)

# Displaying the top keywords in each topic
tf_defi_pain_keywords_list = []


for i, topic in enumerate(tf_defi_pain_lda.components_):
    word_idx = np.argsort(topic)[::-1][:10]     # NOTE: 10 instead of 5
    tf_defi_pain_keywords = ', '.join(tf_defi_pain_features[i] for i in word_idx)
    tf_defi_pain_keywords_list.append(tf_defi_pain_keywords)
    print(i, tf_defi_pain_keywords)

0 faith, area, dead, incurr, cefi, error, simplic, stablecoin, trial, action
1 rural, emiss, howev, v3, almost, huge, due, exact, sinc, project
2 volatil, rare, featur, ground, initi, faint, goal, two, worri, ledger
3 teach, prohibit, found, simpl, platform, uni, certain, fair, exposur, recent
4 function, rate, leap, gwei, riski, 5, lower, move, known, wrong


In [19]:
tf_defi_pain_keywords_list

['faith, area, dead, incurr, cefi, error, simplic, stablecoin, trial, action',
 'rural, emiss, howev, v3, almost, huge, due, exact, sinc, project',
 'volatil, rare, featur, ground, initi, faint, goal, two, worri, ledger',
 'teach, prohibit, found, simpl, platform, uni, certain, fair, exposur, recent',
 'function, rate, leap, gwei, riski, 5, lower, move, known, wrong']

In [20]:
tf_defi_pain_doctopic

array([[0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       ...,
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.02195422, 0.02195423, 0.9121831 , 0.02195422, 0.02195422]])

In [21]:
defi_pain_df = pd.DataFrame(tf_defi_pain_doctopic, columns = tf_defi_pain_keywords_list)

defi_pain_df.head()

Unnamed: 0,"faith, area, dead, incurr, cefi, error, simplic, stablecoin, trial, action","rural, emiss, howev, v3, almost, huge, due, exact, sinc, project","volatil, rare, featur, ground, initi, faint, goal, two, worri, ledger","teach, prohibit, found, simpl, platform, uni, certain, fair, exposur, recent","function, rate, leap, gwei, riski, 5, lower, move, known, wrong"
0,0.2,0.2,0.2,0.2,0.2
1,0.2,0.2,0.2,0.2,0.2
2,0.2,0.2,0.2,0.2,0.2
3,0.021955,0.021955,0.021955,0.021955,0.912181
4,0.021955,0.021955,0.021955,0.021955,0.912181
