In [133]:
import pandas as pd
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings('ignore')

In [112]:
def convert_currency(amount):

    number = 1000 * float(re.search(r'\s(\d+\.\d+)', amount).group(1))

    currency = amount[:3]
    if currency != "EUR":
        if currency == "BOB":
            conv = 8.3725
        elif currency == "DKK":
            conv = 7.4374
        elif currency == "GEL":
            conv = 3.8279
        elif currency == "INR":
            conv = 88.6972
        elif currency == "JOD":
            conv = 0.8585
        elif currency == "KES":
            conv = 130.5909
        elif currency == "LKR":
            conv = 239.5998
        elif currency == "NGN":
            conv = 498.8897
        elif currency == "NPR":
            conv = 142.5807
        elif currency == "TRY":
            conv = 10.1572
        elif currency == "USD":
            conv = 1.2108
        elif currency == "ZAR":
            conv = 16.6102
        else:
            print("warning, non-defined currency in dataset")
            conv = 1

        number = number / conv

    return number

def check_search_term(description, search_terms):
    for term in search_terms:
        if term in description:
            return 1
    return 0

In [113]:
#Reading the data
folder = Path("/Users/nielskreuk/Dropbox/DataScience/Exercises/NYCDSA/ScrapingProject/fmo/scrapeddata")
file_to_open = folder / "fmo.csv"
cols = pd.read_csv(file_to_open, sep=";", nrows=1).columns
df = pd.read_csv(file_to_open, sep=";", usecols=cols[:6]) #Leave out couple of empty columns at end of file

In [114]:
df.columns

Index(['amount', 'country', 'date', 'description', 'industry', 'project_name'], dtype='object')

In [115]:
#add a column with converted amounts in EUR as numbers
df["EUR_amounts"] = [convert_currency(amount) for amount in df["amount"]]

In [116]:
# Preprocessing

# Note: Text already made lowercase during scraping
# Remove numbers
df['description'] = df['description'].apply(lambda x: re.sub('[0-9]+', ' ', x))
# Remove links
df['description'] = df['description'].apply(lambda x: re.sub('http\S*', '', x))
# Remove double spaces
df['description'] = df['description'].apply(lambda x: re.sub('\s+', ' ', x))



In [117]:
stemmer = PorterStemmer()
df["description"] = [stemmer.stem(word) for word in df["description"]]

In [118]:
count_vec = CountVectorizer(min_df=5)
count = count_vec.fit_transform(df["description"])

In [119]:
count.shape

(954, 2661)

In [120]:
print(count_vec.get_feature_names()[:100])

['abidjan', 'ability', 'able', 'about', 'above', 'abroad', 'abundant', 'accelerate', 'acceptable', 'access', 'accessibility', 'accessible', 'accessing', 'accion', 'accordance', 'according', 'account', 'accounts', 'achieve', 'achieved', 'achieving', 'acquire', 'acquired', 'acquiring', 'acquisition', 'across', 'act', 'acted', 'acting', 'action', 'actions', 'active', 'actively', 'activities', 'activity', 'acts', 'adb', 'add', 'added', 'adding', 'addition', 'additional', 'additionality', 'additionally', 'address', 'addressed', 'addresses', 'addressing', 'adds', 'adequate', 'adequately', 'adhere', 'adherence', 'adjacent', 'adopted', 'advance', 'advanced', 'advantage', 'advantages', 'adverse', 'advice', 'advisor', 'advisors', 'advisory', 'aef', 'afc', 'affairs', 'affected', 'affordability', 'affordable', 'africa', 'african', 'africinvest', 'after', 'ag', 'against', 'agencies', 'agency', 'agenda', 'agent', 'aggregate', 'agnostic', 'agreed', 'agreement', 'agreements', 'agri', 'agribusiness', '

In [121]:
tf_idf = TfidfVectorizer(min_df = 5, max_df = 0.75, stop_words = "english")
count_tf = tf_idf.fit_transform(df["description"])
count_tf.shape

(954, 2451)

In [122]:
print(tf_idf.get_feature_names()[::100])

['abidjan', 'allowing', 'begin', 'categories', 'complements', 'corridor', 'development', 'eligible', 'expert', 'form', 'growth', 'inception', 'issue', 'lines', 'meter', 'norfund', 'participation', 'practices', 'psc', 'replacing', 'sector', 'source', 'supplies', 'transportation', 'volume']


In [123]:
dictionary = gensim.corpora.Dictionary(df["description"].map(lambda x: x.split(' ')))

In [124]:
bow_corpus = [dictionary.doc2bow(wrd) for wrd in df["description"].map(lambda x: x.split(' '))]

In [125]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary, passes=2)
lda_model.save('lda.model')

In [126]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.046*"the" + 0.032*"in" + 0.030*"and" + 0.019*"a" + 0.017*"to" + 0.017*"of" + 0.015*"is" + 0.010*"for" + 0.008*"with" + 0.008*"fund"

Topic: 1 Word: 0.052*"the" + 0.035*"and" + 0.034*"in" + 0.025*"to" + 0.024*"of" + 0.020*"a" + 0.017*"is" + 0.011*"will" + 0.008*"for" + 0.007*"by"

Topic: 2 Word: 0.038*"the" + 0.031*"in" + 0.024*"and" + 0.020*"of" + 0.020*"a" + 0.017*"to" + 0.015*"is" + 0.007*"with" + 0.006*"by" + 0.006*"will"

Topic: 3 Word: 0.067*"the" + 0.039*"and" + 0.033*"of" + 0.026*"to" + 0.024*"in" + 0.019*"is" + 0.017*"a" + 0.009*"with" + 0.009*"will" + 0.008*"project"

Topic: 4 Word: 0.048*"the" + 0.047*"and" + 0.035*"to" + 0.030*"in" + 0.028*"of" + 0.021*"a" + 0.018*"is" + 0.012*"with" + 0.011*"will" + 0.009*"for"

Topic: 5 Word: 0.039*"the" + 0.035*"and" + 0.032*"in" + 0.026*"to" + 0.024*"a" + 0.023*"of" + 0.022*"is" + 0.011*"with" + 0.010*"will" + 0.007*"fund"



In [127]:
tfidf = gensim.models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

In [128]:
lda_model_tfidf = gensim.models.LdaMulticore(tfidf_corpus, num_topics=6, id2word=dictionary, passes=2)
lda_model_tfidf.save('lda_tfidf.model')

In [129]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*"fund" + 0.001*"energy" + 0.001*"project" + 0.001*"water" + 0.001*"ps" + 0.001*"wind" + 0.001*"sme" + 0.001*"usd" + 0.001*"power" + 0.001*"company"
Topic: 1 Word: 0.001*"energy" + 0.001*"fund" + 0.001*"bank" + 0.001*"project" + 0.001*"ps" + 0.001*"solar" + 0.001*"power" + 0.001*"financial" + 0.001*"wind" + 0.001*"projects"
Topic: 2 Word: 0.001*"bank" + 0.001*"energy" + 0.001*"project" + 0.001*"power" + 0.001*"fund" + 0.001*"ps" + 0.001*"financial" + 0.001*"company" + 0.001*"wind" + 0.001*"development"
Topic: 3 Word: 0.001*"fund" + 0.001*"energy" + 0.001*"bank" + 0.001*"brac" + 0.001*"project" + 0.001*"." + 0.001*"investment" + 0.001*"access" + 0.001*"growth" + 0.001*"loan"
Topic: 4 Word: 0.001*"ps" + 0.001*":" + 0.001*"solar" + 0.001*"energy" + 0.001*"project" + 0.001*"fund" + 0.001*"impacts" + 0.001*"power" + 0.001*"-" + 0.001*"financial"
Topic: 5 Word: 0.001*"fund" + 0.001*"energy" + 0.001*"bank" + 0.001*"project" + 0.001*"usd" + 0.001*"microfinance" + 0.001*"coc

In [131]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)

In [1]:
pyLDAvis.gensim_models.prepare(lda_model_tfidf, tfidf_corpus, dictionary)

NameError: name 'pyLDAvis' is not defined