In [1]:
%%capture
!pip install scattertext

In [2]:
import pandas as pd
import scattertext as st
from IPython.core.display import HTML

In [3]:
# Read the Dickens table from the hosted CSV file into a DataFrame.
dickens_url = 'https://raw.githubusercontent.com/msaxton/nlp-data/main/dickens.csv'
dickens_df = pd.read_csv(filepath_or_buffer=dickens_url)

In [4]:
# Quick look at the Dickens data: print (rows, columns), then show five rows drawn at random.
print(dickens_df.shape)
dickens_df.sample(n=5)

(24707, 6)


Unnamed: 0,author,title,text,nouns,adjectives,verbs
1048,dickens,cities,"“What is that?” he calmly asked, looking with ...",attention line stone colour,horizontal black,ask look
2548,dickens,expectations,"“You young dog,” said the man, licking his lip...",dog man lip fat,young,say lick cheek get
16547,dickens,bleak,"“I never heard of such a thing! Good gracious,...",thing man,good gracious,hear exclaim
23255,dickens,pickwick,"'I am afraid, sir,' said Mr. Pickwick, laying ...",hand place room friend,afraid afraid noisy crowded own quiet,say lay live pray consider want come see
24110,dickens,pickwick,"'Good,' said the stranger. 'Coachman, I get do...",stranger bag,good carpet-,say get


In [5]:
# Read the Eliot table from the hosted CSV file into a DataFrame.
eliot_url = 'https://raw.githubusercontent.com/msaxton/nlp-data/main/eliot.csv'
eliot_df = pd.read_csv(filepath_or_buffer=eliot_url)

In [6]:
# Quick look at the Eliot data: print (rows, columns), then show five rows drawn at random.
print(eliot_df.shape)
eliot_df.sample(n=5)

(18139, 6)


Unnamed: 0,author,title,text,nouns,adjectives,verbs
1917,eliot,middlemarch,Thus his intellectual ambition which seemed to...,ambition other security wound frame possibilit...,intellectual least embittering,seem absorb dry come begin
3818,eliot,silas,Dunsey perceived that he had overshot his mark...,mark point decision air,unconcern,perceive overshoot hesitate drive say
14483,eliot,romola,The muscles of Fra Girolamo’s face were eminen...,muscle face command case man personality speec...,powerful deliberate cautious strong mental liable,confide add control look answer consider hear ...
147,eliot,middlemarch,"“It is very painful,” said Dorothea, feeling s...",cottage eye tear,painful scourged more uncivil painful,say feel tell fill
146,eliot,middlemarch,"“Well, I am sorry for Sir James. I thought it ...",place way courage sister being speculation,sorry right wrong impossible plain unusual wide,think tell go look tread see see satisfy see ’...


In [7]:
# Draw an equal-sized random sample of rows from each author.
# A fixed seed makes the draw repeatable: the same rows are selected every time
# the notebook is re-run, so downstream results can be reproduced.
SAMPLE_SIZE = 10_000  # rows drawn per author
SEED = 42             # any fixed integer works; change it to draw a different sample
dickens_sample_df = dickens_df.sample(n=SAMPLE_SIZE, random_state=SEED)
eliot_sample_df = eliot_df.sample(n=SAMPLE_SIZE, random_state=SEED)

In [8]:
# Stack the two samples into a single DataFrame.
# Each sample keeps the row labels of the frame it was drawn from, so the same
# label can appear once per author. ignore_index=True assigns fresh 0..n-1 labels
# so every row stays uniquely addressable in any label-based selection downstream.
df = pd.concat([dickens_sample_df, eliot_sample_df], ignore_index=True)

In [9]:
# keep only the 'author' and 'adjectives' columns; every other column is dropped
adjectives_df = df[['author', 'adjectives']]

In [11]:
# Build the scattertext corpus: 'author' supplies each document's category and
# 'adjectives' supplies its text.
corpus = st.CorpusFromPandas(
    adjectives_df,
    category_col='author',
    text_col='adjectives',
).build()

In [12]:
# Turn the corpus into scattertext's HTML-based visualization.
explorer_options = {
    'category': 'eliot',             # category placed on the y-axis
    'category_name': 'Eliot',        # y-axis label
    'not_category_name': 'Dickens',  # x-axis label
    'minimum_term_frequency': 20,
    'width_in_pixels': 900,
}
html = st.produce_scattertext_explorer(corpus, **explorer_options)

In [13]:
# Render the generated markup inline as this cell's output.
HTML(data=html)

In [14]:
# Optional: the same markup can also be saved to disk as an .html file.
file_name = 'example.html'
with open(file_name, 'w', encoding='utf8') as out_file:
    out_file.write(html)