In [9]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

Collecting en-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.1.0/en_core_web_lg-3.1.0-py3-none-any.whl (777.1 MB)
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.1.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


2021-10-06 21:39:43.292844: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-10-06 21:39:43.292884: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [73]:
go = pd.read_csv('./in_progress.csv', sep='\t', index_col = 0)
go.dropna(subset = ['post_split'], inplace=True)

In [74]:
nlp = spacy.load('en_core_web_lg')

doc = nlp(go['post_split'][0])
spacy.displacy.render(doc, style='ent', jupyter=True)

In [109]:
parser = English()
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

def spacy_tokenizer(x):
    tokens = parser(x)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    return tokens

In [57]:
import langid # language identification (i.e. what language is this?)
from nltk.classify.textcat import TextCat

go['ids_langid'] = go['post'].apply(langid.classify)

# get just the language label
go['langs'] = go['ids_langid'].apply(lambda tuple: tuple[0])

In [61]:
go = go[go['langs'] =='en']
len(go)

948

In [64]:
def spacy_bigram_tokenizer(phrase):
    doc = parser(phrase) # create spacy object
    token_not_noun = []
    notnoun_noun_list = []
    noun = ""

    for item in doc:
        if item.pos_ != "NOUN": # separate nouns and not nouns
            token_not_noun.append(item.text)
        if item.pos_ == "NOUN":
            noun = item.text
        
        for notnoun in token_not_noun:
            notnoun_noun_list.append(notnoun + " " + noun)

    return " ".join([i for i in notnoun_noun_list])

In [113]:
go['spacy_token'] = go['post_split'].astype('str').apply(spacy_tokenizer)

In [118]:
#go['spacy_token'] = go['post_lemma'].apply(lambda x: word for word in x if word != '-PRON-')
go['post_lemma']

0      ['hi', 'name', 'be', 'kristina', 'webb', 'be',...
1      ['myan', 'be', 'in', 'a', 'bad', 'car', 'accid...
3      ['giulianna', 'zetterlund', 'be', 'organize', ...
4      ['hi', 'everyone', 'be', 'devastate', 'to', 'r...
5      ['hi', 'all', 'most', 'of', 'know', 'amazing',...
                             ...                        
985    ['yesterday', 'freddie', 'also', 'know', 'as',...
986    ['on', 'september', '27th', '2021', 'sister', ...
987    ['melissa', 'powarzynski', 'be', 'organize', '...
988    ['on', 'friday', 'september', '24', 'beloved',...
989    ['amber', 'lucente', 'be', 'organize', 'this',...
Name: post_lemma, Length: 989, dtype: object

In [121]:
bivectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, ngram_range=(1,2))
bigram_vectorized = bivectorizer.fit_transform(go["post_lemma"])

In [119]:
vectorizer = CountVectorizer(lowercase=True)
data_vectorized = vectorizer.fit_transform(go['post_lemma'])

In [122]:
NUM_TOPICS = 10

# Latent Dirichlet Allocation Model
lda_spacy = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda_spacy.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


## Another fucking try

In [89]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags, strip_punctuation, strip_numeric, stem_text
from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_non_alphanum, remove_stopwords, strip_short

custom = [
    lambda x: x.lower(),
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
    stem_text
]

go['post_gensim'] = go['post'].astype('str').apply(lambda x: preprocess_string(x, custom))
go['post_gensim']


0      [kristina, webb, organ, fundraiser.cr, dai, ag...
1      [christopoh, clark, organ, fundraiser.cr, dai,...
3      [giulianna, zetterlund, organ, fundrais, behal...
4      [rachael, robenolt, organ, fundraiser.cr, dai,...
5      [miranda, homrich, organ, fundraiser.cr, dai, ...
                             ...                        
985    [chantal, washington, organ, fundraiser.cr, da...
986    [andrea, silva, organ, fundraiser.cr, dai, ago...
987    [melissa, powarzynski, organ, fundraiser.hello...
988    [cindi, unes-cart, organ, fundraiser.cr, dai, ...
989    [amber, lucent, organ, fundraiser.saturday,, s...
Name: post_gensim, Length: 989, dtype: object

In [120]:
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary(go['post_lemma'])

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [91]:
corpus = [dictionary.doc2bow(text) for text in go['post_gensim']]

In [100]:
bag_model = models.LdaMulticore(corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)
bag_model.show_topics()

[(0,
  '0.013*"help" + 0.009*"organ" + 0.008*"famili" + 0.007*"need" + 0.005*"fundrais" + 0.004*"support" + 0.004*"dai" + 0.004*"work" + 0.004*"love" + 0.004*"donat"'),
 (1,
  '0.012*"help" + 0.008*"famili" + 0.008*"organ" + 0.005*"know" + 0.004*"dai" + 0.004*"fundrais" + 0.004*"thank" + 0.004*"work" + 0.004*"need" + 0.004*"donat"'),
 (2,
  '0.008*"famili" + 0.008*"help" + 0.007*"need" + 0.006*"organ" + 0.006*"dai" + 0.005*"time" + 0.004*"thank" + 0.004*"support" + 0.004*"donat" + 0.004*"septemb"'),
 (3,
  '0.012*"famili" + 0.012*"help" + 0.007*"organ" + 0.005*"need" + 0.005*"home" + 0.005*"dai" + 0.004*"lost" + 0.004*"fund" + 0.004*"thank" + 0.004*"fundrais"'),
 (4,
  '0.012*"help" + 0.008*"organ" + 0.008*"famili" + 0.007*"know" + 0.006*"need" + 0.006*"love" + 0.005*"work" + 0.005*"dai" + 0.005*"donat" + 0.004*"time"')]

In [92]:
tfidf = models.TfidfModel(corpus)
transformed_tfidf = tfidf[corpus]

lda = models.LdaMulticore(transformed_tfidf, num_topics=5, id2word = dictionary)
lda.show_topics()

[(0,
  '0.001*"know" + 0.001*"famili" + 0.000*"love" + 0.000*"home" + 0.000*"need" + 0.000*"donat" + 0.000*"agoaccid" + 0.000*"hous" + 0.000*"lost" + 0.000*"friend"'),
 (1,
  '0.000*"know" + 0.000*"dai" + 0.000*"need" + 0.000*"support" + 0.000*"lost" + 0.000*"hous" + 0.000*"work" + 0.000*"fundraiser.cr" + 0.000*"famili" + 0.000*"life"'),
 (2,
  '0.000*"home" + 0.000*"famili" + 0.000*"work" + 0.000*"live" + 0.000*"support" + 0.000*"dai" + 0.000*"ask" + 0.000*"know" + 0.000*"hous" + 0.000*"agoaccid"'),
 (3,
  '0.001*"famili" + 0.000*"love" + 0.000*"need" + 0.000*"car" + 0.000*"medic" + 0.000*"financi" + 0.000*"know" + 0.000*"son" + 0.000*"home" + 0.000*"care"'),
 (4,
  '0.000*"famili" + 0.000*"love" + 0.000*"car" + 0.000*"work" + 0.000*"accid" + 0.000*"time" + 0.000*"donat" + 0.000*"hospit" + 0.000*"friend" + 0.000*"need"')]

In [99]:
bow_vector = dictionary.doc2bow(go['post_gensim'][5])
for index, score in sorted(lda[bow_vector], key=lambda tup: -1*tup[1]):
    print(score, index)


0.9920701 1


In [102]:
#bigram
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [106]:

bigram = gensim.models.phrases(go['post_gensim'], min_count=5, threshold=100)

TypeError: 'module' object is not callable

## Spacy try again

In [130]:
go.head()

Unnamed: 0,update,title,post,fundraiser,image_labels,current_raised,goal_fund,Self-raised,goal_precentage,post_split,post_split_tokenized,post_lemma,post_gensim,trial
0,,Marley's Recovery,Kristina Webb is organizing this fundraiser.Cr...,Kristina Webb is organizing this fundraiser.,"['Nose', 'Cheek', 'Lip', 'Eyebrow', 'Eyelash',...",7620,5000,1,1.52,"hi, my name is kristina webb. i'm starting thi...","['hi', 'my', 'name', 'is', 'kristina', 'webb',...","['hi', 'name', 'be', 'kristina', 'webb', 'be',...","[kristina, webb, organ, fundraiser.cr, dai, ag...","[h, , m, y, , n, m, e, , s, , k, r, s, t, ..."
1,,Help myan get epidural stimulation surgery,Christopoher Clark is organizing this fundrais...,Christopoher Clark is organizing this fundraiser.,[],900,215000,1,0.0,myan was in a bad car accident a little over a...,"['myan', 'was', 'in', 'a', 'bad', 'car', 'acci...","['myan', 'be', 'in', 'a', 'bad', 'car', 'accid...","[christopoh, clark, organ, fundraiser.cr, dai,...","[m, y, n, , w, s, , n, , , b, d, , c, r, ..."
3,,Emergency Financial Aid to Support Salvatore,Giulianna Zetterlund is organizing this fundra...,Giulianna Zetterlund is organizing this fundra...,[],170761,200000,0,0.85,giulianna zetterlund is organizing this fundra...,"['giulianna', 'zetterlund', 'is', 'organizing'...","['giulianna', 'zetterlund', 'be', 'organize', ...","[giulianna, zetterlund, organ, fundrais, behal...","[g, u, l, n, n, , z, e, t, t, e, r, l, u, n, ..."
4,,360 Community Management- Help Eric's children,Rachael Robenolt is organizing this fundraiser...,Rachael Robenolt is organizing this fundraiser.,[],5585,5000,1,1.12,hi everyone. we are devastated to report that ...,"['hi', 'everyone', 'we', 'are', 'devastated', ...","['hi', 'everyone', 'be', 'devastate', 'to', 'r...","[rachael, robenolt, organ, fundraiser.cr, dai,...","[h, , e, v, e, r, y, o, n, e, , w, e, , r, ..."
5,,Help Support Lynda VanderBilt,Miranda Homrich is organizing this fundraiser....,Miranda Homrich is organizing this fundraiser.,[],4100,4500,1,0.91,"hi all,\n\n\n \n\n\nmost of you know my amazin...","['hi', 'all', 'most', 'of', 'you', 'know', 'my...","['hi', 'all', 'most', 'of', 'know', 'amazing',...","[miranda, homrich, organ, fundraiser.cr, dai, ...","[h, , l, l, , m, o, s, t, , o, f, , y, o, ..."


In [140]:
go['post_split'] = go['post_split'].apply(lambda x: x.replace('\n', ''))

In [147]:
go['trial'] = go['post_split'].apply(lambda text: " ".join(token.lemma_ for token in nlp(text) 
                                                   if not token.is_stop and not token.is_punct and not token.is_space and token.text.isalpha()))
go['trial']

0      hi kristina webb start fundraiser help support...
1      myan bad car accident little year ago leave qu...
3      giulianna zetterlund organize fundraiser behal...
4      hi devastate report maintenance employee suffe...
5      hi know amazing incredible aunt lynda integral...
                             ...                        
985    yesterday freddie know son pass away pm shoot ...
986    september sister maggi life flight boise hospi...
987    melissa powarzynski organize melissa start fun...
988    friday september beloved co worker friend fami...
989    amber lucente organize september evan sister a...
Name: trial, Length: 989, dtype: object

In [150]:
import nltk
go['trial_token'] = go['trial'].apply(nltk.word_tokenize)

In [151]:
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary(go['trial_token'])

In [152]:
corpus = [dictionary.doc2bow(text) for text in go['trial_token']]

In [178]:
bag_model = models.LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=2, workers=2, random_state=1, alpha=0.65)
bag_model.show_topics()

[(0,
  '0.017*"family" + 0.014*"help" + 0.009*"need" + 0.007*"love" + 0.006*"year" + 0.006*"time" + 0.006*"know" + 0.006*"home" + 0.006*"work" + 0.005*"life"'),
 (1,
  '0.021*"help" + 0.013*"family" + 0.009*"know" + 0.008*"need" + 0.008*"home" + 0.007*"time" + 0.006*"work" + 0.006*"life" + 0.006*"support" + 0.005*"friend"'),
 (2,
  '0.011*"help" + 0.010*"family" + 0.008*"de" + 0.007*"time" + 0.007*"need" + 0.005*"y" + 0.005*"friend" + 0.005*"hospital" + 0.005*"work" + 0.005*"life"')]

In [177]:
tfidf = models.TfidfModel(corpus)
transformed_tfidf = tfidf[corpus]

lda = models.LdaMulticore(transformed_tfidf, num_topics=3, id2word = dictionary, random_state=1, alpha=0.6)
lda.show_topics()

[(0,
  '0.001*"family" + 0.001*"love" + 0.001*"fire" + 0.001*"home" + 0.001*"need" + 0.001*"house" + 0.001*"year" + 0.001*"work" + 0.001*"know" + 0.001*"bill"'),
 (1,
  '0.001*"home" + 0.001*"know" + 0.001*"fire" + 0.001*"family" + 0.001*"love" + 0.001*"support" + 0.001*"work" + 0.001*"need" + 0.001*"lose" + 0.001*"hospital"'),
 (2,
  '0.001*"fire" + 0.001*"house" + 0.001*"time" + 0.001*"family" + 0.001*"home" + 0.001*"know" + 0.001*"friend" + 0.001*"accident" + 0.001*"lose" + 0.001*"work"')]

In [171]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
vis

In [188]:
import numpy as nt
import os, csv, nltk, lda
import pandas as pd
import numpy as np
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from nltk.tokenize import PunktSentenceTokenizer,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

ModuleNotFoundError: No module named 'lda'

In [191]:
go.to_csv('./gogogo.csv')