In [1]:
# Load the NER records from 'sorted_NER.json' into `sorted_data`.
import json

# The `with` block closes the file on exit, so no explicit close() is needed.
# `sorted_data` is deliberately not pre-initialised: if the file cannot be read,
# later cells should fail loudly instead of silently running on an empty list.
# JSON is decoded as UTF-8 explicitly rather than with the platform default.
with open('sorted_NER.json', 'r', encoding='utf-8') as f:
    sorted_data = json.load(f)

In [2]:
import pandas as pd
import numpy as np
import altair as alt

# Build the per-decade "top entities" table that feeds the chart.
# Reads `sorted_data`; the code below uses its 'year', 'value' and 'entity' fields.

# Convert to DataFrame
df = pd.DataFrame(sorted_data)

# Forward fill NaNs in 'year' column.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df['year'] = df['year'].ffill()

# Convert 'year' to the nullable integer dtype: 'Int64' (capital "I") keeps any
# remaining missing values (e.g. leading rows with nothing to fill from) as <NA>.
df['year'] = df['year'].astype('Int64')

# Group by decade: floor each year to the start of its decade (1987 -> 1980)
df['decade'] = (df['year'] // 10 * 10)

# Filter out rows whose 'value' is 'DATE'
df_entity = df[df['value'] != 'DATE']

# Count occurrences of every (decade, value, entity) combination
df_entity_value_counts = (
    df_entity
    .groupby(['decade', 'value', 'entity'])
    .size()
    .reset_index(name='counts')
)

# Get the top 3 entities in each decade for each value.
# Sorting and then taking the head of each group avoids groupby().apply(), whose
# handling of the grouping columns is deprecated; this form keeps 'decade' and
# 'value' as ordinary columns. 'entity' is the final sort key so that ties on
# 'counts' are broken deterministically (alphabetically).
top_entities_per_value = (
    df_entity_value_counts
    .sort_values(['decade', 'value', 'counts', 'entity'],
                 ascending=[True, True, False, True])
    .groupby(['decade', 'value'])
    .head(3)
    .reset_index(drop=True)
)

# Create a combined field for value and entity to use in color encoding
top_entities_per_value['value_entity'] = top_entities_per_value['value'] + ': ' + top_entities_per_value['entity']

# Centre-stacked area chart of the top entities per decade.

# x-axis: horizontal labels with light grey gridlines
decade_axis = alt.Axis(title='Decade', grid=True, gridColor='#ddd', labelAngle=0)

# y-axis: keep the title but hide labels, ticks and gridlines
counts_axis = alt.Axis(title='Counts', labels=False, ticks=False, grid=False)

area_chart = (
    alt.Chart(top_entities_per_value)
    .mark_area()
    .encode(
        x=alt.X('decade:O', axis=decade_axis),
        y=alt.Y('counts:Q', stack='center', axis=counts_axis),
        # One colour per "value: entity" pair; the legend is suppressed
        color=alt.Color(
            'value_entity:N',
            scale=alt.Scale(scheme='category20b'),
            legend=None,
        ),
        # Fields shown on hover
        tooltip=['entity', 'value', 'counts'],
    )
    .properties(
        width='container',  # stretch to the width of the enclosing container
        height=400,
        title='Top Entity Trends Per Decade',
    )
    .interactive()  # make the axis scales interactive
)

area_chart.display()



In [3]:
# Write the chart to an HTML file in the current working directory.
chart_output_path = 'area_chart.html'
area_chart.save(chart_output_path)
