In [1]:
import gensim
from gensim import corpora
from pprint import pprint
from gensim.utils import simple_preprocess

import pandas as pd

df = pd.read_csv("for_gensim.csv")

df.head()


Unnamed: 0,Date,Time,formtype,Improve,Imp1,Imp2,ImpCrit,CommentCoderImp,TweetImp,Best,Best1,Best2,BestCrit,Division2,Directorate2,Location
0,2011-10-01,11,adult,get treatment i requested,50.0,,1.0,,,It helped.,40.0,,1.0,Forensic services,Offender health,Offender health
1,2011-10-01,11,adult,provide gym staff to get gym up and running,47.0,,1.0,,,Early stages waiting to see course contents an...,46.0,,1.0,Forensic services,Offender health,Offender health
2,2011-10-01,11,adult,to look at the problems properly and give you ...,44.0,,2.0,,,The best thing about it is getting treatment e...,61.0,,1.0,Forensic services,Offender health,Offender health
3,2011-10-01,11,adult,to be more involved,27.0,,2.0,,,Keep in on meds.,44.0,,1.0,Forensic services,Offender health,Offender health
4,2011-10-01,11,adult,my drug problem was with subutex i was given m...,44.0,,2.0,,,When I asked to see the doctor regarding my me...,61.0,,1.0,Forensic services,Offender health,Offender health


In [2]:
# filter by date

df['Date'] = pd.to_datetime(df['Date'])

mask = (df['Date'] > "2018-09-01") & (df['Date'] <= "2018-12-31")

df_2018 = df.loc[mask]
len(df_2018)

comments = df["Improve"]



In [3]:
description = comments.apply(lambda x: x.split()).tolist()

In [4]:
# Tokenize the docs
tokenized_list = [simple_preprocess(doc) for doc in comments]

# Create the Corpus
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
# pprint(mycorpus)


In [5]:
from gensim.models.word2vec import Word2Vec

model = Word2Vec(description, min_count = 0, workers=-1)

In [6]:
model.wv.most_similar(['staff'])

[('sound', 0.4000924229621887),
 ('diarrhea', 0.39616334438323975),
 ('reordered', 0.36979395151138306),
 ('repeatsa', 0.3678136169910431),
 ('chair', 0.3641447424888611),
 ('arbitrarily', 0.3474430441856384),
 ('investigating', 0.339516282081604),
 ('fron', 0.3334224224090576),
 ('forum', 0.3302011489868164),
 ('favorites', 0.32857388257980347)]

In [7]:
# fast-text

# from gensim.test.utils import common_texts
# from gensim.models import FastText
# model2 = FastText(description, size=4, window=3, min_count=1, iter=10)
# model2.wv.most_similar(['treatment'])

In [9]:
from sklearn.manifold import TSNE
import pandas as pd

# Two parts to the answer: how to get the word labels, and how to plot the labels on a scatterplot.

# Word labels in gensim's word2vec

# model.wv.vocab is a dict of {word: object of numeric vector}. To load the data into X for t-SNE, I made one change.

vocab = list(model.wv.vocab)
X = model[vocab]

# This accomplishes two things: (1) it gets you a standalone vocab list for the final dataframe to plot, and (2) when you index model, you can be sure that you know the order of the words.

# Proceed as before with

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

# Now let's put X_tsne together with the vocab list. This is easy with pandas, so import pandas as pd if you don't have that yet.

for_plot = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])

# The vocab words are the indices of the dataframe now.

  # This is added back by InteractiveShellApp.init_path()


In [10]:
# bokeh plots

from bokeh.plotting import show, figure, save, output_file

output_file("first_plot.html")

_ = for_plot.plot.scatter('x', 'y', figsize=(12,12), marker='.', s=10, alpha=0.2)

subset_for_plot = for_plot.sample(n=1000)

p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_for_plot.x, y=subset_for_plot.y, text=subset_for_plot.index)

show(p)

In [11]:
# I'm going to try to output some words and stuff for Shiny here

vocab = list(model.wv.vocab)

['get',
 'treatment',
 'i',
 'requested',
 'provide',
 'gym',
 'staff',
 'to',
 'up',
 'and',
 'running',
 'look',
 'at',
 'the',
 'problems',
 'properly',
 'give',
 'you',
 'right',
 'medication',
 'be',
 'more',
 'involved',
 'my',
 'drug',
 'problem',
 'was',
 'with',
 'subutex',
 'given',
 'methadone',
 'which',
 'has',
 'now',
 'made',
 'dependency',
 'a',
 'lot',
 'worse',
 'it',
 'would',
 'have',
 'been',
 'better',
 'for',
 'me',
 'prescribed',
 'dealt',
 'like',
 'that',
 'prisoner',
 'able',
 'choose',
 'what',
 'they',
 'were',
 'treated',
 'as',
 'results',
 'decide',
 'upon',
 'their',
 'own',
 'needstargetstreatments',
 'within',
 'system',
 'where',
 'prisoners',
 'take',
 'responsibility',
 'actions',
 'idts',
 'central',
 'issue',
 'most',
 'are',
 'very',
 'little',
 'autonomy',
 'should',
 'replaced',
 '1st',
 'line',
 'opiate',
 'misuse',
 'prison',
 'seems',
 'reluctant',
 'allow',
 'any',
 'power',
 'in',
 'deciding',
 'if',
 'people',
 'put',
 'on',
 'wellbeing'

In [14]:
len(vocab)

17222