In [9]:
# data manipulation
import pandas as pd
import numpy as np

import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing

from nltk.corpus import stopwords
from nltk import SnowballStemmer
import string

from pprint import pprint  # PRETTY PRINT long dictionaries

In [None]:
df = pd.read_csv("../raw/Onboard_Survey.csv")

df.head()

In [None]:
# skip first seven columns df.iloc[:, 0:7].head()

# selecting only open-ended responses 
df.iloc[:, 6:14].head()

open_ended = df.iloc[:, 6:14]

open_ended.head()

In [None]:

# rename columns to better manage columns
open_ended.columns = ['walletwhat_walletwhy', 'wallet_pain', 'defi_when', 'defiwhat_defiwhy', 'defi_pain', 'defi_outcome', 'defi_interest', 'defi_endgame']


open_ended.head()

In [None]:
## SKIP stemming & tokenzation to see if yields more interpretable results ##

In [None]:

# This is Part 2 of Onboard Survey Exploratory Analysis

# For Part 1 see onboard_survey_open_ended.ipynb
# For Part 1 https://forum.bankless.community/t/onboard-survey-exploratory-analysis/1048

# Part 2 Open-Ended questions to address include:

# What has been painful about using DeFi apps or what has or is an obstacle in your way to using a DeFi app? [column: defi_pain]

In [None]:
# Topic Modeling

# Preparing Text Data for NLP
# Goal: Turn text data in to matrix (row = document, column = feature)

# Steps: 

# forming a corpus of text
# stemming and lemmatization --- SKIP
# tokenization               --- SKIP
# removing stop-words
# finding words co-located together (N-grams)

In [None]:
# Apply Stemming & Lemmatization to defi_pain

# take entire column in open_ended df
# split sentences (each row) into words
# store in empty list

defi_pain_list = []

# 12 Rows Removed
for row in open_ended['defi_pain']:
    try:
        defi_pain_list.append(row.split())
    except:
        continue
        
defi_pain_list  # this is a Nested list - list of list;

In [None]:
# Last step need to FLATTEN a list of lists into one list/vector of words - "Bag of Words"
# Bag of word, a list cleaned of punctuation, stemmed, now a vector of individual words

defi_pain_list_flat = [item for sublist in defi_pain_list for item in sublist]

defi_pain_list_flat

In [None]:
## TF-IDF: Weighting terms based on frequency

# re-weights words to emphasize words that are unique to a document

In [15]:
# Example of how a Stemmer works - - Only used for Stopwords in THIS Notebook

stemmer = SnowballStemmer('english')
print(stemmer.stem('lies'))
print(stemmer.stem('lying'))
print(stemmer.stem('systematic'))
print(stemmer.stem('running'))

lie
lie
systemat
run


In [16]:
# Tokenize Function - - Only used for Stopwords in THIS Notebook

def tokenize(text):
    translator=str.maketrans(string.punctuation, ' '*len(string.punctuation)) # translator replace punct w empty space
    return [stemmer.stem(i) for i in text.translate(translator).split()]

In [None]:
# Example Stopwords
stop = stopwords.words('english') + ['invent', 'produce', 'method', 'use', 'first', 'second']
full_stopwords = [tokenize(s)[0] for s in stop]

full_stopwords

In [None]:
# Try tokenize, then sort dictionary
# Conclusion: Tokenize, get rid of stop-words, then sort

defi_pain_tokenize = []

for list in defi_pain_list:
    for word in list:
        defi_pain_tokenize.append(tokenize(word))  # This ia a "Bag of Words" - a list
        
defi_pain_tokenize

# Last step need to FLATTEN a list of lists into one list/vector of words - "Bag of Words"
# Bag of word, a list cleaned of punctuation, stemmed, now a vector of individual words

defi_pain_tokenize_flat = [item for sublist in defi_pain_tokenize for item in sublist]

defi_pain_tokenize_flat



In [None]:
empty_dict = {}

# loop through all elements in list and store count
for word in defi_pain_tokenize_flat:
    if word not in empty_dict:
        empty_dict[word] = 1
    else:
        empty_dict[word] += 1

# sorted
sort_empty_dict = sorted(empty_dict.items(), key=lambda x:x[1])
sort_empty_dict2 = dict(sort_empty_dict)
pprint(sort_empty_dict2)



In [20]:
tf_defi_pain_vectorizer = CountVectorizer(analyzer= 'word',  # unit of features are single words rather than phrases
                               tokenizer=tokenize, # function to create tokens
                               ngram_range=(0,2),   # Allow for bigrams
                               strip_accents='unicode',
                               stop_words=full_stopwords,  # see above Example Stopwords, other examples did NOT hv stop_words
                               min_df = 0.0,
                               max_df = 1)   # got an error to lower min_df and raise max_df

# Creating bag of words 
tf_defi_pain_bag_of_words = tf_defi_pain_vectorizer.fit_transform(defi_pain_list_flat) # IMPORTANT transform our UN-Tokenized (no stemming) corpus into a bag of words
tf_defi_pain_features = tf_defi_pain_vectorizer.get_feature_names()

# Use TfidfTransformer (see library import) to re-weight bag of words
tf_defi_pain_transformer = TfidfTransformer(norm = None, smooth_idf = True, sublinear_tf = True)
tf_defi_pain_tfidf = tf_defi_pain_transformer.fit_transform(tf_defi_pain_bag_of_words)

# Fitting LDA Model
tf_defi_pain_lda = LatentDirichletAllocation(n_components = 5, learning_method='online')  # NOTE: n_components = 5
tf_defi_pain_doctopic = tf_defi_pain_lda.fit_transform(tf_defi_pain_tfidf)

# Displaying the top keywords in each topic
tf_defi_pain_keywords_list = []


for i, topic in enumerate(tf_defi_pain_lda.components_):
    word_idx = np.argsort(topic)[::-1][:10]     # NOTE: 10 instead of 5
    tf_defi_pain_keywords = ', '.join(tf_defi_pain_features[i] for i in word_idx)
    tf_defi_pain_keywords_list.append(tf_defi_pain_keywords)
    print(i, tf_defi_pain_keywords)

0 volatil, faith, almost, near, huge, riski, due, uni, sketchi, earli
1 rural, fair, v3, simplic, gwei, dead, select, found, howev, featur
2 area, ground, leap, prohibit, function, emiss, teach, rate, gas transact, made
3 exposur, exact, crazi, incurr, faint, sinc, recent, wrong, lower, lps
4 certain, rare, platform, 5, action, error, cefi, initi, move, legitimaci


In [13]:
tf_defi_pain_keywords_list

['gwei, select, simplic, action, riski, howev, huge, 5, earli, trial',
 'certain, platform, rate, dead, v3, found, featur, sketchi, lps, ledger',
 'faith, rare, emiss, leap, ground, teach, near, thank, lower, crazi',
 'exposur, area, volatil, function, fair, rural, exact, uni, simpl, due',
 'almost, prohibit, gas transact, stablecoin, sinc, faint, ok, cefi, incurr, known']

In [14]:
tf_defi_pain_doctopic

array([[0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       ...,
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.02200125, 0.02200123, 0.02200123, 0.91199506, 0.02200124]])

In [15]:
defi_pain_df = pd.DataFrame(tf_defi_pain_doctopic, columns = tf_defi_pain_keywords_list)

defi_pain_df.head()

Unnamed: 0,"gwei, select, simplic, action, riski, howev, huge, 5, earli, trial","certain, platform, rate, dead, v3, found, featur, sketchi, lps, ledger","faith, rare, emiss, leap, ground, teach, near, thank, lower, crazi","exposur, area, volatil, function, fair, rural, exact, uni, simpl, due","almost, prohibit, gas transact, stablecoin, sinc, faint, ok, cefi, incurr, known"
0,0.2,0.2,0.2,0.2,0.2
1,0.2,0.2,0.2,0.2,0.2
2,0.2,0.2,0.2,0.2,0.2
3,0.022002,0.022001,0.022001,0.911994,0.022001
4,0.911994,0.022001,0.022001,0.022001,0.022001


In [None]:
## Conclusion: Tokenize, get rid of stop-words, then sort & count values ##