In [1]:
# data manipulation
import pandas as pd
import numpy as np

import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing

from nltk.corpus import stopwords
from nltk import SnowballStemmer
import string

In [None]:
df = pd.read_csv("../raw/Onboard_Survey.csv")

df.head()

In [None]:

# skip first seven columns df.iloc[:, 0:7].head()

# selecting only open-ended responses 
df.iloc[:, 6:14].head()

open_ended = df.iloc[:, 6:14]

open_ended.head()

In [4]:

# rename columns to better manage columns
open_ended.columns = ['walletwhat_walletwhy', 'wallet_pain', 'defi_when', 'defiwhat_defiwhy', 'defi_pain', 'defi_outcome', 'defi_interest', 'defi_endgame']


open_ended.head()

Unnamed: 0,walletwhat_walletwhy,wallet_pain,defi_when,defiwhat_defiwhy,defi_pain,defi_outcome,defi_interest,defi_endgame
0,Trezor - needed cold storage.,keeping up with all the security parameters,Within the last year,uniswap - seems to have a stellar reputation.,Learning how to navigate web3 websites.,Discovered new financial products and revenue ...,Alchemix,Passive income through DeFi
1,"Trustwallet, was not knowing much,","still not coming to terms, which wallet to use...",I have never used DeFi,,,,,
2,"Coinbase, ease of transactions",,Within the last year,,,,AAVE,Move my traditional investments over
3,"trezor, it just works and its secure","setting up is painful, and dealing with the se...",Within the last year,"Uniswap, i had to trade between assets",Gas fees are fluctuating each second,lost money from weird protocols,Options,Become a DeFi native and have more DeFi assets...
4,Coinbase bc it was a whileee ago,"Feees, centralization etc",3-5 years ago,,,,,


In [None]:

# This is Part 2 of Onboard Survey Exploratory Analysis

# For Part 1 see onboard_survey_open_ended.ipynb
# For Part 1 https://forum.bankless.community/t/onboard-survey-exploratory-analysis/1048

# Part 2 Open-Ended questions to address include:

# What has been painful about using DeFi apps or what has or is an obstacle in your way to using a DeFi app? [column: defi_pain]

In [None]:
# Topic Modeling

# Preparing Text Data for NLP
# Goal: Turn text data in to matrix (row = document, column = feature)

# Steps: 

# forming a corpus of text
# stemming and lemmatization
# tokenization
# removing stop-words
# finding words co-located together (N-grams)

In [None]:
# Example of how a Stemmer works

stemmer = SnowballStemmer('english')
print(stemmer.stem('lies'))
print(stemmer.stem('lying'))
print(stemmer.stem('systematic'))
print(stemmer.stem('running'))

In [None]:
# Apply Stemming & Lemmatization to defi_pain

# take entire column in open_ended df
# split sentences (each row) into words
# store in empty list

defi_pain_list = []

# 12 Rows Removed
for row in open_ended['defi_pain']:
    try:
        defi_pain_list.append(row.split())
    except:
        continue
        
defi_pain_list  # this is a Nested list - list of list;

In [7]:
# Tokenize Function

def tokenize(text):
    translator=str.maketrans(string.punctuation, ' '*len(string.punctuation)) # translator replace punct w empty space
    return [stemmer.stem(i) for i in text.translate(translator).split()]

In [None]:
# Loop through list of list (defi_pain_list) 
# Apply tokenize() function
# save output to new list
# output needs to be a vector of individual words

# NOTE: Because tokenize() function returns a list, each word will be put into it's own list

defi_pain_tokenize = []

for list in defi_pain_list:
    for word in list:
        defi_pain_tokenize.append(tokenize(word))  # This ia a "Bag of Words" - a list
        
defi_pain_tokenize

# Last step need to FLATTEN a list of lists into one list/vector of words - "Bag of Words"
# Bag of word, a list cleaned of punctuation, stemmed, now a vector of individual words

defi_pain_tokenize_flat = [item for sublist in defi_pain_tokenize for item in sublist]

defi_pain_tokenize_flat

In [None]:

## TF-IDF: Weighting terms based on frequency

# re-weights words to emphasize words that are unique to a document

In [None]:
# Example Stopwords
stop = stopwords.words('english') + ['invent', 'produce', 'method', 'use', 'first', 'second']
full_stopwords = [tokenize(s)[0] for s in stop]

full_stopwords

In [10]:
tf_defi_pain_vectorizer = CountVectorizer(analyzer= 'word',  # unit of features are single words rather than phrases
                               tokenizer=tokenize, # function to create tokens
                               ngram_range=(0,2),   # Allow for bigrams
                               strip_accents='unicode',
                               stop_words=full_stopwords,  # see above Example Stopwords, other examples did NOT hv stop_words
                               min_df = 0.0,
                               max_df = 1)   # got an error to lower min_df and raise max_df

# Creating bag of words 
tf_defi_pain_bag_of_words = tf_defi_pain_vectorizer.fit_transform(defi_pain_tokenize_flat) # transform our corpus into a bag of words
tf_defi_pain_features = tf_defi_pain_vectorizer.get_feature_names()

# Use TfidfTransformer (see library import) to re-weight bag of words
tf_defi_pain_transformer = TfidfTransformer(norm = None, smooth_idf = True, sublinear_tf = True)
tf_defi_pain_tfidf = tf_defi_pain_transformer.fit_transform(tf_defi_pain_bag_of_words)

# Fitting LDA Model
tf_defi_pain_lda = LatentDirichletAllocation(n_components = 5, learning_method='online')  # NOTE: n_components = 5
tf_defi_pain_doctopic = tf_defi_pain_lda.fit_transform(tf_defi_pain_tfidf)

# Displaying the top keywords in each topic
tf_defi_pain_keywords_list = []


for i, topic in enumerate(tf_defi_pain_lda.components_):
    word_idx = np.argsort(topic)[::-1][:10]     # NOTE: 10 instead of 5
    tf_defi_pain_keywords = ', '.join(tf_defi_pain_features[i] for i in word_idx)
    tf_defi_pain_keywords_list.append(tf_defi_pain_keywords)
    print(i, tf_defi_pain_keywords)

0 uni, v3, area, dead, rural, certain, fair, leap, sinc, scare
1 teach, howev, emiss, almost, platform, prohibit, rate, simpl, known, 5
2 near, move, incurr, cefi, initi, recent, error, action, 2, legitimaci
3 found, simplic, exact, riski, due, select, trial, stablecoin, lower, lps
4 faith, volatil, function, rare, ground, exposur, huge, featur, gwei, thank


In [11]:
tf_defi_pain_keywords_list

['uni, v3, area, dead, rural, certain, fair, leap, sinc, scare',
 'teach, howev, emiss, almost, platform, prohibit, rate, simpl, known, 5',
 'near, move, incurr, cefi, initi, recent, error, action, 2, legitimaci',
 'found, simplic, exact, riski, due, select, trial, stablecoin, lower, lps',
 'faith, volatil, function, rare, ground, exposur, huge, featur, gwei, thank']

In [12]:
tf_defi_pain_doctopic

array([[0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       ...,
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.02195424, 0.02195424, 0.02195424, 0.02195424, 0.91218305]])

In [13]:
defi_pain_df = pd.DataFrame(tf_defi_pain_doctopic, columns = tf_defi_pain_keywords_list)

defi_pain_df.head()

Unnamed: 0,"uni, v3, area, dead, rural, certain, fair, leap, sinc, scare","teach, howev, emiss, almost, platform, prohibit, rate, simpl, known, 5","near, move, incurr, cefi, initi, recent, error, action, 2, legitimaci","found, simplic, exact, riski, due, select, trial, stablecoin, lower, lps","faith, volatil, function, rare, ground, exposur, huge, featur, gwei, thank"
0,0.2,0.2,0.2,0.2,0.2
1,0.2,0.2,0.2,0.2,0.2
2,0.2,0.2,0.2,0.2,0.2
3,0.021955,0.021955,0.021955,0.912181,0.021955
4,0.021955,0.021955,0.021955,0.021955,0.91218
