In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [50]:
def null_pct(df):
    return dict(zip(df.columns, [round(100*len(df.loc[df[c].isnull()])/len(df),2) for c in df.columns]))

In [51]:
pen_df = pd.read_csv("pen america/PEN.csv", index_col=0)
pen_df["Dataset"] = "PEN America"
pen_df["DateBan"] = pd.to_datetime(pen_df["DateBan"]).dt.to_period('D')
pen_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5894 entries, 0 to 5893
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype    
---  ------       --------------  -----    
 0   Author       5893 non-null   object   
 1   Title        5893 non-null   object   
 2   State        5894 non-null   object   
 3   District     5894 non-null   object   
 4   DateBan      5894 non-null   period[D]
 5   Country      5894 non-null   object   
 6   Description  5894 non-null   object   
 7   Dataset      5894 non-null   object   
dtypes: object(7), period[D](1)
memory usage: 414.4+ KB


In [52]:
lidrekon_df = pd.read_csv('lidrekon/lidrekon.csv').rename(columns={'Date':'DateBan'})
lidrekon_df = lidrekon_df.drop([lidrekon_df.columns[0], 'TitleCandidates'], axis=1)\
                .drop_duplicates()
lidrekon_df['Country'] = 'RUS'
lidrekon_df['DateBan'] = pd.to_datetime(lidrekon_df['DateBan']).dt.to_period('D')
lidrekon_df['Dataset'] = "lidrekon"
print(lidrekon_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 5285 entries, 0 to 5342
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype    
---  ------       --------------  -----    
 0   DateBan      5285 non-null   period[D]
 1   Description  5285 non-null   object   
 2   Title        5281 non-null   object   
 3   Country      5285 non-null   object   
 4   Dataset      5285 non-null   object   
dtypes: object(4), period[D](1)
memory usage: 247.7+ KB
None


In [53]:
marshall_df = pd.read_csv('marshall/marshall.csv', index_col=0)
marshall_df['DateBan'] = pd.to_datetime(marshall_df['DateBan']).dt.to_period('D')
marshall_df['Dataset'] = "The Marshall Project"
marshall_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55278 entries, 0 to 55277
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype    
---  ------       --------------  -----    
 0   Title        55278 non-null  object   
 1   Author       23003 non-null  object   
 2   Description  30229 non-null  object   
 3   State        55278 non-null  object   
 4   Country      55278 non-null  object   
 5   DateBan      47070 non-null  period[D]
 6   Dataset      55278 non-null  object   
dtypes: object(6), period[D](1)
memory usage: 3.4+ MB


In [54]:
import math
def to_period(year):
    if math.isnan(year):
        return None
    return pd.Period(year=int(year), freq="D")
kasseler_df = pd.read_csv('kasseler/kasseler.csv', index_col=0)
kasseler_df['DateBan'] = kasseler_df['DateBan'].apply(to_period)
kasseler_df['Dataset'] = "Die Kasseler Liste (Parthenon of Books)"
kasseler_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121573 entries, 0 to 121572
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype    
---  ------       --------------   -----    
 0   Author       103096 non-null  object   
 1   Title        121556 non-null  object   
 2   District     34126 non-null   object   
 3   Country      121544 non-null  object   
 4   Description  121552 non-null  object   
 5   DateBan      85748 non-null   period[D]
 6   Dataset      121573 non-null  object   
dtypes: object(6), period[D](1)
memory usage: 7.4+ MB


In [55]:
df = pd.concat((pen_df, marshall_df, lidrekon_df, kasseler_df), axis=0, ignore_index=True)
print(df.info())
print(null_pct(df))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188030 entries, 0 to 188029
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype    
---  ------       --------------   -----    
 0   Author       131992 non-null  object   
 1   Title        188008 non-null  object   
 2   State        61172 non-null   object   
 3   District     40020 non-null   object   
 4   DateBan      143997 non-null  period[D]
 5   Country      188001 non-null  object   
 6   Description  162960 non-null  object   
 7   Dataset      188030 non-null  object   
dtypes: object(7), period[D](1)
memory usage: 11.5+ MB
None
{'Author': 29.8, 'Title': 0.01, 'State': 67.47, 'District': 78.72, 'DateBan': 23.42, 'Country': 0.02, 'Description': 13.33, 'Dataset': 0.0}


In [56]:
df['Year'] = df['DateBan'].dt.year
grouped_df = df.groupby(['Year', 'Country']).size().to_frame().reset_index()
grouped_df['Count'] = grouped_df[0]
grouped_df = grouped_df.sort_values(['Year', 'Count'])
print(grouped_df.info())
grouped_df

<class 'pandas.core.frame.DataFrame'>
Index: 1292 entries, 2 to 1291
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Year     1292 non-null   int64 
 1   Country  1292 non-null   object
 2   0        1292 non-null   int64 
 3   Count    1292 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 50.5+ KB
None


Unnamed: 0,Year,Country,0,Count
2,-1,BLR,1,1
3,-1,BRD,1,1
4,-1,CHL,1,1
11,-1,"DEU, ITA, YO",1,1
12,-1,"DEU, Kaiserreich",1,1
...,...,...,...,...
1287,2022,USA,4239,4239
1288,2023,CHN,5,5
1289,2023,RUS,80,80
1290,2023,USA,1379,1379


In [66]:
filtered_df = grouped_df[(grouped_df['Count']>10) & (grouped_df['Year']>0)]
fig = px.bar(filtered_df, x='Year', y='Count', color='Country')
fig.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
fig.show()

In [65]:
print(df.loc[(df['Country']=='GC' )& (df['Dataset']=='Die Kasseler Liste (Parthenon of Books)'),:])

                     Author  \
66507    AALL Herman Harris   
66508   AALL Hermann Harris   
66538            ABB Gustav   
66551    ABBETMEYER Theodor   
66578       ABEGG Friedrich   
...                     ...   
171604                  NaN   
171605                  NaN   
171606                  NaN   
171607                  NaN   
171608                  NaN   

                                                    Title State  \
66507   *Das Schicksal des Nordens eine europäische Frage   NaN   
66508    *Weltherrschaft und die Rechtlosigkeit der Meere   NaN   
66538                 *Der wissenschaftliche Bibliothekar   NaN   
66551                      *Über moderne Theater-Unkultur   NaN   
66578              *Fahrt ins Leben Worte an Konfirmierte   NaN   
...                                                   ...   ...   
171604            *Zukunftsaufgaben der Wiener Wirtschaft   NaN   
171605  *Zum Gedächtnis an Admiral von Trotha 1. März ...   NaN   
171606              *Zur

In [68]:
import pycountry as pc
def get_country_name(x):
    if type(x)==str and len(x)==3:
        country = pc.countries.get(alpha_3=x)
        if country is not None:
            return country.name
        else:
            return None
    else:
        return None
grouped_df['CountryName'] = grouped_df['Country'].apply(get_country_name)

In [69]:
print(grouped_df.groupby('CountryName').size().sort_values(ascending=False))
print(grouped_df['CountryName'].isna().sum())

CountryName
Holy See (Vatican City State)     332
Türkiye                            84
Austria                            83
Australia                          68
United States                      53
New Zealand                        45
China                              45
Portugal                           42
South Africa                       35
Cameroon                           22
Italy                              18
Russian Federation                 16
Germany                            14
Iran, Islamic Republic of          10
Bangladesh                          8
Qatar                               8
Algeria                             5
Netherlands                         3
Poland                              3
France                              3
Chile                               2
Denmark                             2
Czechia                             2
United Kingdom                      2
Lebanon                             2
Spain                               2


In [28]:
fig = px.bar(df_country_ct.sort_values('Year'), x="CountryName", y="size",
  animation_frame="Year")
fig.show()

In [73]:
df_iso_ct = grouped_df.loc[grouped_df['Country'].str.len() ==3, :]
df_iso_ct["Count"] = df_iso_ct["Count"]*100
print(df_iso_ct.shape)
fig = px.scatter_geo(df_iso_ct, locations="Country", size="Count",
                     animation_frame="Year",
                     projection="natural earth")
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 30
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 5
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(929, 5)
