In [1]:
import pandas as pd
import altair as alt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import PorterStemmer

In [2]:
humanist_vols = pd.read_csv('web_scraped_humanist_listserv_volumes.csv')


In [8]:
# Early Internet Era: from the start of the dataset up to and including 1998
early_internet_era = humanist_vols[humanist_vols['inferred_start_year'] <= 1999]

# Web 2.0 Era: from 1999 onwards
web_2_0_era = humanist_vols[humanist_vols['inferred_start_year'] >= 2000]

In [12]:
earlydoc = early_internet_era.volume_text.tolist()
webdoc = web_2_0_era.volume_text.tolist()

#Create a vectorizer
vectorizer = TfidfVectorizer(max_df=.7, min_df=1)

In [19]:
# Fit the vectorizer to our documents
earlytransformed_documents = vectorizer.fit_transform(earlydoc)

# Now get the top features for each document
earlytransformed_documents_as_array = earlytransformed_documents.toarray()

dates = early_internet_era.inferred_start_year.tolist()
earlytfidf_results = []
for counter, doc in enumerate(earlytransformed_documents_as_array):
    # construct a dataframe
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['inferred_start_year'] = dates[counter]
    earlytfidf_results.append(one_doc_as_df)

In [20]:
# Fit the vectorizer to our documents
webtransformed_documents = vectorizer.fit_transform(webdoc)

# Now get the top features for each document
webtransformed_documents_as_array = webtransformed_documents.toarray()

dates = web_2_0_era.inferred_start_year.tolist()
webtfidf_results = []
for counter, doc in enumerate(webtransformed_documents_as_array):
    # construct a dataframe
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    one_doc_as_df['inferred_start_year'] = dates[counter]
    webtfidf_results.append(one_doc_as_df)

In [17]:
earlytfidf_df = pd.concat(earlytfidf_results)
earlytfidf_df = earlytfidf_df.sort_values(by=['score'], ascending=False)
earlytfidf_df.head(10)

Unnamed: 0,term,score,inferred_start_year
0,http,0.730132,1999
0,utorepas,0.726332,1987
0,http,0.721093,1998
0,http,0.699473,1997
0,http,0.695283,1996
0,http,0.603824,1995
1,www,0.581237,1998
1,www,0.579809,1997
1,www,0.570008,1999
1,www,0.551416,1995


In [21]:
webtfidf_df = pd.concat(webtfidf_results)
webtfidf_df = webtfidf_df.sort_values(by=['score'], ascending=False)
webtfidf_df.head(10)

Unnamed: 0,term,score,inferred_start_year
0,num,0.748285,2004
0,num,0.737968,2007
0,num,0.731689,2003
0,num,0.729739,2006
0,joyent,0.692576,2010
0,num,0.689697,2005
0,joyent,0.671967,2011
0,ninch,0.657385,2002
0,2016,0.643744,2017
0,joyent,0.617716,2012


In [23]:
selection = alt.selection_point(fields=['term'], bind='legend')
chart = alt.Chart(earlytfidf_df).mark_bar().encode(
    y='score',
    x='inferred_start_year:T',
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(earlytfidf_df['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'year(inferred_start_year)'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 10 Terms by TF-IDF Score in Humanist Volumes'
)
chart

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000).

Try enabling the VegaFusion data transformer which raises this limit by pre-evaluating data
transformations in Python.
    >> import altair as alt
    >> alt.data_transformers.enable("vegafusion")

Or, see https://altair-viz.github.io/user_guide/large_datasets.html for additional information
on how to plot large datasets.

alt.Chart(...)

In [25]:
top_terms['period'] = top_terms['period'].astype(str)
selection = alt.selection_point(fields=['term'], bind='legend')
chart = alt.Chart(top_terms).mark_bar().encode(
    y='score',
    x=alt.X('period', sort=['early_internet', 'web_2.0', 'contemporary'], axis=alt.Axis(title='Period')),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(top_terms['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'period'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 30 Terms by TF-IDF Score in Humanist Volumes by Period'
)
chart

NameError: name 'top_terms' is not defined

In [24]:
selection = alt.selection_point(fields=['term'], bind='legend')
chart = alt.Chart(webtfidf_df).mark_bar().encode(
    y='score',
    x='inferred_start_year:T',
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(webtfidf_df['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'year(inferred_start_year)'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 10 Terms by TF-IDF Score in Humanist Volumes'
)
chart

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000).

Try enabling the VegaFusion data transformer which raises this limit by pre-evaluating data
transformations in Python.
    >> import altair as alt
    >> alt.data_transformers.enable("vegafusion")

Or, see https://altair-viz.github.io/user_guide/large_datasets.html for additional information
on how to plot large datasets.

alt.Chart(...)