# 0. libs and data import

In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
import  folium
from folium.plugins import HeatMap

In [None]:
from matplotlib import pyplot as plt
plt.style.use('ggplot')

In [None]:
import seaborn as sns

### coords for all locations

In [None]:
geodata = pd.read_json('../geodata/geo-coordinates-clean.json')

In [None]:
geodata = geodata.T

In [None]:
geodata.columns = ['lat', 'long']

In [None]:
geodata

### freqs for romanticism

In [None]:
romfreq = pd.read_csv('../geodata/romantism-locations-freqs.tsv', sep='\t', 
                      header=None, 
                      names=('location', 'frequency'))
romfreq.head(10)

In [None]:
romfreq = romfreq.set_index('location', drop=False)


### freqs for realism

In [None]:
realfreq = pd.read_csv('../geodata/realism-locations-freqs.tsv', sep='\t', 
                      header=None, 
                      names=('location', 'frequency'))

In [None]:
realfreq = realfreq.set_index('location', drop=False)

## 1. Combine GeoData with freqs

### romanticism

In [None]:
geodata_w_rom_freq = pd.merge(geodata, romfreq, left_index=True, right_index=True)
geodata_w_rom_freq.head(15)

### realism

In [None]:
geodata_w_real_freq = pd.merge(geodata, realfreq, left_index=True, right_index=True)
geodata_w_real_freq.head(15)

## 2. Merge data by coordinates 

If we look at this data:

In [None]:
geodata_w_rom_freq

There is an issue with duplicate toponyms, e.g:

In [None]:
geodata_w_rom_freq.query('location.str.contains("Гатч")')

We'll want to merge them by identical coordinates for bar plots, aggregated map bubbles and such.

In [None]:
## to group them by identical coords lets create a unified field:

geodata_w_rom_freq['lat_long'] = geodata_w_rom_freq['lat'].astype(str) + '_' + geodata_w_rom_freq['long'].astype(str)
geodata_w_rom_freq['lat_long']

In [None]:
geodata_w_rom_freq.query('lat_long=="59.56841_30.122892"')

it works

In [None]:
# Collapse spelling variants that share coordinates into one row per lat_long key,
# adding up their frequencies. sum() is applied to every column of the frame, not
# only to 'frequency', so the lat/long columns of this result are not usable as-is;
# later cells rebuild them from the lat_long key.
rom_grouped_with_freq_sums = geodata_w_rom_freq.groupby('lat_long').sum()
rom_grouped_with_freq_sums

In [None]:
rom_grouped_with_freq_sums['lat_long'] = rom_grouped_with_freq_sums.index

In [None]:
rom_grouped_with_freq_sums

If everything worked, this should be four:

In [None]:
rom_grouped_with_freq_sums.loc["59.56841_30.122892", :]['frequency']

Yess

Now let us get back the names

In [None]:
def get_loc_names_by_latlon(latlon, refdf):
    """Glue together every location name stored under one ``lat_long`` key.

    Args:
        latlon: Coordinate key of the form ``"<lat>_<long>"``.
        refdf: Frame with a ``lat_long`` and a ``location`` column.

    Returns:
        The matching ``location`` strings summed up, i.e. concatenated
        without a separator (e.g. "ГатчинуГатчине").
    """
    wanted_key = f'{latlon}'
    same_place = refdf['lat_long'] == wanted_key
    return refdf.loc[same_place, 'location'].sum()

In [None]:
get_loc_names_by_latlon("59.56841_30.122892",refdf=geodata_w_rom_freq)

In [None]:
## chose this option
def get_loc_most_frequent_name_by_latlon(latlon, refdf):
    """Return the most frequent spelling recorded for one coordinate key.

    Args:
        latlon: Coordinate key of the form ``"<lat>_<long>"``.
        refdf: Frame with ``lat_long``, ``location`` and ``frequency`` columns.

    Returns:
        The ``location`` value of the matching row with the highest
        ``frequency``.

    Raises:
        IndexError: If no row of ``refdf`` carries the given key.
    """
    res = refdf.query(f'lat_long=="{latlon}"')
    res = res.sort_values(by='frequency', ascending=False)
    # Take the first row *by position*. Plain ``[0]`` is a label lookup: on a
    # string index it only works through a deprecated positional fallback, and
    # on an integer index it returns the row labelled 0, not the top row.
    return res['location'].iloc[0]

In [None]:
get_loc_most_frequent_name_by_latlon("59.56841_30.122892",refdf=geodata_w_rom_freq)

In [None]:
rom_grouped_with_freq_sums['label'] = rom_grouped_with_freq_sums['lat_long'].apply(get_loc_most_frequent_name_by_latlon, 
                                                                                   refdf=geodata_w_rom_freq)

In [None]:
rom_grouped_with_freq_sums

The coordinates in the grouped df broke because `.sum` was used as the aggregation function (we had to use it to add up the frequencies of different variants of the same location). So let's fix them:

In [None]:
def get_true_lat_by_latlon(latlon):
    """Return the latitude part (text before the first ``_``) of a key.

    The value is returned as a string, exactly as it appears in the key.
    """
    lat_text = latlon.split('_', 1)[0]
    return lat_text

In [None]:
def get_true_lon_by_latlon(latlon):
    """Return the longitude part (second ``_``-separated field) of a key.

    The value is returned as a string, exactly as it appears in the key.
    """
    fields = latlon.split('_', 2)
    return fields[1]

In [None]:
rom_grouped_with_freq_sums['lat'] = rom_grouped_with_freq_sums['lat_long'].apply(get_true_lat_by_latlon)

In [None]:
rom_grouped_with_freq_sums['long'] = rom_grouped_with_freq_sums['lat_long'].apply(get_true_lon_by_latlon)

In [None]:
rom_grouped_with_freq_sums

In [None]:
rom_grouped_with_freq_sums.to_csv('aggregated_rom_locs.csv')

#### now same for realism

In [None]:
geodata_w_real_freq

In [None]:
geodata_w_real_freq.query('location.str.contains("Гатч")').sort_values(by='frequency', ascending=False)

In [None]:

geodata_w_real_freq['lat_long'] = geodata_w_real_freq['lat'].astype(str) + '_' + geodata_w_real_freq['long'].astype(str)
geodata_w_real_freq['lat_long']

In [None]:
real_grouped_with_freq_sums = geodata_w_real_freq.groupby('lat_long').sum()
real_grouped_with_freq_sums

In [None]:
real_grouped_with_freq_sums['lat_long'] = real_grouped_with_freq_sums.index

This should be much more than in the romanticism data, as Gatchina is mentioned far more often in realism:

In [None]:
real_grouped_with_freq_sums.loc["59.56841_30.122892", :]['frequency']

In [None]:
real_grouped_with_freq_sums['label'] = real_grouped_with_freq_sums['lat_long'].apply(get_loc_most_frequent_name_by_latlon, 
                                                                                   refdf=geodata_w_real_freq)

In [None]:
real_grouped_with_freq_sums['lat'] = real_grouped_with_freq_sums['lat_long'].apply(get_true_lat_by_latlon)

In [None]:
real_grouped_with_freq_sums['long'] = real_grouped_with_freq_sums['lat_long'].apply(get_true_lon_by_latlon)

In [None]:
real_grouped_with_freq_sums

In [None]:
real_grouped_with_freq_sums.to_csv('aggregated_real_locs.csv')

In [None]:
real_grouped_with_freq_sums

### 3. Plot most frequent merged locations

In [None]:
rom_grouped_with_freq_sums.sort_values(by='frequency')[-30:].plot.barh(y='frequency', x='label', 
                                                                        title='top romanticist locations')
ax = plt.gca()
ax.figure.tight_layout()
plt.savefig('romanticist_raw_freqs.png', dpi=300)


In [None]:
real_grouped_with_freq_sums.sort_values(by='frequency')[-30:].plot.barh(y='frequency', x='label', 
                                                                        title='top realist locations')
ax = plt.gca()
ax.figure.tight_layout()
plt.savefig('realist_raw_freqs.png', dpi=300)

### Filter

In [None]:
rom_grouped_with_freq_sums

In [None]:
rom_grouped_with_freq_sums.query('label=="Ростов"')

In [None]:
rom_grouped_with_freq_sums = rom_grouped_with_freq_sums.drop('57.18333_39.41667') ## ростов

In [None]:
real_grouped_with_freq_sums = real_grouped_with_freq_sums.drop('57.18333_39.41667') ## ростов

In [None]:
real_grouped_with_freq_sums = real_grouped_with_freq_sums.drop('56.32694_44.0075')# нижний

In [None]:
#real_grouped_with_freq_sums = real_grouped_with_freq_sums.drop('59.76_30.55') # ижорского
rom_grouped_with_freq_sums = rom_grouped_with_freq_sums.drop('59.76_30.55') # ижорского

### Add relative freqs

#### romanticism

In [None]:
rom_grouped_with_freq_sums

In [None]:
rom_grouped_with_freq_sums['rel_freq'] = rom_grouped_with_freq_sums['frequency']/rom_grouped_with_freq_sums['frequency'].sum()

#### realism

In [None]:
real_grouped_with_freq_sums['rel_freq'] = real_grouped_with_freq_sums['frequency']/real_grouped_with_freq_sums['frequency'].sum()

In [None]:
real_grouped_with_freq_sums

### Add Eng Labels

In [None]:
rom_to_plot = rom_grouped_with_freq_sums.sort_values(by='frequency')[-30:]

In [None]:
real_to_plot = real_grouped_with_freq_sums.sort_values(by='frequency')[-30:]

In [None]:
real_to_plot

In [None]:
list(rom_to_plot['label'])

In [None]:
# English names for the 30 rows of rom_to_plot, written in the same order as
# list(rom_to_plot['label']) (ascending frequency). The list is attached to the
# frame by position, so re-check it whenever the data or the filters change.
rom_eng_labels = [
    "Red Square",
    "Krakow",
    "Zaporizhzhia",
    "Nizhny Novgorod",
    "Constantinople",
    "Moscow River",
    "Germany",
    "Derbent",
    "Uglich",
    "Poltava",
    "Volga",
    "Sweden",
    "Tver",
    "Lithuania",
    "Orenburg",
    "Rome",
    "France",
    "Smolensk",
    "Neva",
    "Siberia",
    "Italy",
    "Paris",
    "Ukraine",
    "Dnipro",
    "Novgorod",
    "Kyiv",
    "Poland",
    "Petersburg",
    "Russia",
    "Moscow",
]

In [None]:
rom_to_plot['label eng'] = rom_eng_labels

In [None]:
rom_to_plot

In [None]:
rom_to_plot['label bilingual'] = rom_to_plot['label eng'] + ' (' + rom_to_plot['label'] + ')'

In [None]:
rom_to_plot['label bilingual']

In [None]:
def add_bilingual_labels(df, eng_labels, rus_labels=None):
    """Add ``'label eng'`` and ``'label bilingual'`` columns to ``df`` in place.

    The bilingual label has the form ``"<english> (<original>)"``.

    Args:
        df: Frame to modify; it is mutated, nothing is returned.
        eng_labels: English names, one per row of ``df`` and in row order
            (they are assigned positionally when given as a list).
        rus_labels: Optional original-language names. When given they are
            written to ``df['label']`` first; when omitted the existing
            ``'label'`` column is used. The parameter is optional so that both
            call styles used in this notebook (with and without it) work.
    """
    df['label eng'] = eng_labels
    if rus_labels is not None:
        df['label'] = rus_labels
    df['label bilingual'] = df['label eng'] + ' (' + df['label'] + ')'

In [None]:
list(real_to_plot['label'])

In [None]:
# English names for the 30 rows of real_to_plot, written in the same order as
# list(real_to_plot['label']) (ascending frequency). The list is attached to the
# frame by position, so re-check it whenever the data or the filters change.
real_eng_labels = [
    "Switzerland",
    "Sweden",
    "Dnipro",
    "Berlin",
    "Riga",
    "London",
    "Constantinople",
    "Türkiye",
    "Vienna",
    "Crimea",
    "Smolensk",
    "Astrakhan",
    "Neva",
    "Germany",
    "Sevastopol",
    "England",
    "Lithuania",
    "Kyiv",
    "Novgorod",
    "Italy",
    "Kazan",
    "France",
    "Volga",
    "Rome",
    "Poland",
    "Siberia",
    "Paris",
    "Russia",
    "Petersburg",
    "Moscow",
]

In [None]:
add_bilingual_labels(real_to_plot, real_eng_labels)

In [None]:
real_to_plot

### Produce final bar plots for the paper

In [None]:
real_to_plot['raw_frequency'] = real_to_plot['frequency']

In [None]:
rom_to_plot['raw_frequency'] = rom_to_plot['frequency']

In [None]:
def plot_barh_viz(df, x='label', title='title', filename='someviz', y='raw_frequency'):
    """Draw a horizontal bar chart of ``df`` and save it as ``<filename>.png``.

    Note: a later cell of this notebook redefines ``plot_barh_viz`` for
    relative frequencies; whichever definition ran last is the one in effect.

    Args:
        df: Frame holding the columns named by ``x`` and ``y``.
        x: Column used for the bar labels.
        title: Plot title.
        filename: Output path without the ``.png`` extension.
        y: Column used for the bar lengths (defaults to the raw counts).
    """
    # Work on the Axes returned by pandas instead of asking pyplot for the
    # "current" axes, so the function does not depend on global pyplot state.
    ax = df.plot.barh(y=y, x=x, title=title, figsize=(8, 6))
    ax.figure.tight_layout()
    ax.figure.savefig(f'{filename}.png', dpi=300)
    
    

In [None]:
plot_barh_viz(real_to_plot, x='label bilingual', title = 'top realist locations', 
              filename = 'top realist locations translated chr')

In [None]:
plot_barh_viz(rom_to_plot, x='label bilingual', title = 'top romanticist locations', 
              filename = 'top romanticist locations translated chr')

### relative freqs

In [None]:
def plot_barh_viz(df, x='label', title='title', filename='someviz', y='rel_freq'):
    """Draw a horizontal bar chart of ``df`` and save it as ``<filename>.png``.

    This definition replaces the earlier ``plot_barh_viz`` of this notebook:
    it plots relative frequencies by default and removes the axis label.

    Args:
        df: Frame holding the columns named by ``x`` and ``y``.
        x: Column used for the bar labels.
        title: Plot title.
        filename: Output path without the ``.png`` extension.
        y: Column used for the bar lengths (defaults to the relative share).
    """
    # Work on the Axes returned by pandas instead of asking pyplot for the
    # "current" axes, so the function does not depend on global pyplot state.
    ax = df.plot.barh(y=y, x=x, title=title, figsize=(8, 6))
    # Clear the label before tightening the layout so that the layout is
    # computed for the figure as it will actually be saved.
    ax.set_ylabel("")
    ax.figure.tight_layout()
    ax.figure.savefig(f'{filename}.png', dpi=300)
    
    

In [None]:
plot_barh_viz(real_to_plot[-20:], x='label bilingual', title = '20 most frequent locations of realism', 
              filename = 'top realist locations translated chr 20')

In [None]:
plot_barh_viz(rom_to_plot[-20:], x='label bilingual', title = '20 most frequent locations of romanticism', 
              filename = 'top romanticist locations translated chr 20')

## 4. Heatmap of relative frequencies

In [None]:
rom_grouped_with_freq_sums

In [None]:
rom_grouped_with_freq_sums['lat'] = rom_grouped_with_freq_sums['lat'].astype(float)

In [None]:
rom_grouped_with_freq_sums['long'] = rom_grouped_with_freq_sums['long'].astype(float)

In [None]:
# Create a base map centered at the mean location
m = folium.Map(location=[rom_grouped_with_freq_sums["lat"].mean(),
                         rom_grouped_with_freq_sums["long"].mean()], zoom_start=10)

In [None]:
# Prepare heatmap data: [[lat, lon, weight], ...]
heat_data = rom_grouped_with_freq_sums[["lat", "long", "rel_freq"]].values.tolist()

In [None]:
# Add heatmap layer
HeatMap(heat_data).add_to(m)

In [None]:
# Save map as HTML file
m.save("romanticism_heatmap.html")

#### realist heatmap

In [None]:
real_grouped_with_freq_sums

In [None]:
real_grouped_with_freq_sums['long'] = real_grouped_with_freq_sums['long'].astype(float)

In [None]:
real_grouped_with_freq_sums['lat'] = real_grouped_with_freq_sums['lat'].astype(float)

### subset

In [None]:
real_heat = real_grouped_with_freq_sums[real_grouped_with_freq_sums['frequency']>10]

In [None]:
# Create a base map centered at the mean location
m = folium.Map(location=[real_heat["lat"].mean(),
                         real_heat["long"].mean()], zoom_start=10)

In [None]:
# Prepare heatmap data: [[lat, lon, weight], ...]
heat_data = real_heat[["lat", "long", "rel_freq"]].values.tolist()

In [None]:
# Add heatmap layer
HeatMap(heat_data, radius=17).add_to(m) # 

In [None]:
# Save map as HTML file
m.save("realism_heatmap.html")

In [None]:
rom_heat = rom_grouped_with_freq_sums[rom_grouped_with_freq_sums['frequency']>10]

In [None]:
# Create a base map centered at the mean location
m = folium.Map(location=[rom_heat["lat"].mean(),
                         rom_heat["long"].mean()], zoom_start=10)

In [None]:
# Prepare heatmap data: [[lat, lon, weight], ...]
heat_data = rom_heat[["lat", "long", "rel_freq"]].values.tolist()

In [None]:
# Add heatmap layer
HeatMap(heat_data, radius=20).add_to(m) # 

In [None]:
# Save map as HTML file
m.save("romanticism_heatmap.html")

## 5. Measure delta from romanticism to realism for each location

### merge both datasets:

In [None]:
real_rom_freqs = pd.merge(real_grouped_with_freq_sums, rom_grouped_with_freq_sums, how='outer', 
                          suffixes=('_real', '_rom'), 
                          left_index=True, right_index=True)

In [None]:
real_rom_freqs

In [None]:
# After the outer merge, a location found in only one period has NaN in all of
# the other period's columns. fillna(0) is applied to the whole frame, so it
# zeroes not only the missing frequencies but also the missing label/lat/long.
real_rom_freqs = real_rom_freqs.fillna(0)

In [None]:
# Romanticism share minus realism share: positive values mean the location is
# relatively more frequent in the romanticism data.
real_rom_freqs['delta_rom_to_real'] =  real_rom_freqs['rel_freq_rom'] - real_rom_freqs['rel_freq_real']

In [None]:
# Realism share minus romanticism share (the exact negation of delta_rom_to_real):
# positive values mean the location is relatively more frequent in the realism data,
# negative values mean it is relatively more frequent in the romanticism data.
real_rom_freqs['delta_real_to_rom'] =  real_rom_freqs['rel_freq_real'] - real_rom_freqs['rel_freq_rom']

In [None]:
real_rom_freqs.sort_values(by='delta_real_to_rom')

In [None]:
real_rom_freqs.sort_values(by='delta_real_to_rom')[:30]

In [None]:
### Filter out Ижорский (surname in our contexts)  
real_rom_freqs = real_rom_freqs.drop('59.76_30.55')

### romantic delta champions

In [None]:
real_rom_freqs.sort_values(by='delta_real_to_rom')[:30][['delta_real_to_rom', 
                                                                          'rel_freq_rom', 
                                                                          'rel_freq_real', 'label_rom']]

In [None]:
top_rom_delta = real_rom_freqs.sort_values(by='delta_real_to_rom', ascending=False)[-30:][['delta_real_to_rom', 
                                                                          'frequency_rom', 
                                                                          'frequency_real', 'label_rom']]

In [None]:
top_rom_delta

In [None]:
top_rom_delta

In [None]:
list(top_rom_delta['label_rom'])

In [None]:
# English names for the 30 rows of top_rom_delta, written in the same order as
# list(top_rom_delta['label_rom']). The list is attached to the frame by
# position, so re-check it whenever the data or the filters change.
rom_delta_eng_labels = [
    "Leipzig",
    "Ringen",
    "Zimogorye",
    "Kolomenskoye",
    "Pereyaslavl",
    "Neuhausen",
    "Swallow's Nest",
    "Red Square",
    "Pochayny",
    "Krakow",
    "Narva",
    "Moscow River",
    "Sweden",
    "Baturin",
    "Tver",
    "Zaporizhzhia",
    "Orenburg",
    "Neva",
    "Poltava",
    "Smolensk",
    "Nizhny Novgorod",
    "Uglich",
    "Derbent",
    "Ukraine",
    "Novgorod",
    "Moscow",
    "Russia",
    "Dnipro",
    "Poland",
    "Kyiv",
]

In [None]:
def add_bilingual_labels(df, eng_labels, rus_labels=None):
    """Add ``'label eng'``, ``'label'`` and ``'label bilingual'`` to ``df`` in place.

    The bilingual label has the form ``"<english> (<original>)"``.

    Args:
        df: Frame to modify; it is mutated, nothing is returned.
        eng_labels: English names, one per row of ``df`` and in row order
            (they are assigned positionally when given as a list).
        rus_labels: Original-language names to store in ``df['label']``.
            Optional: when omitted, the existing ``'label'`` column is used,
            which keeps the two-argument calls made earlier in this notebook
            working after this redefinition has been executed.
    """
    df['label eng'] = eng_labels
    if rus_labels is not None:
        df['label'] = rus_labels
    df['label bilingual'] = df['label eng'] + ' (' + df['label'] + ')'

In [None]:
add_bilingual_labels(top_rom_delta, rom_delta_eng_labels, top_rom_delta['label_rom'])

In [None]:
top_rom_delta

In [None]:
top_rom_delta

In [None]:
top_rom_delta[-20:].plot.barh(y='delta_real_to_rom',x='label bilingual',  
                        title='romanticism to realism biggest loss',
                       figsize=(8,6))
                                                                       
ax = plt.gca()
ax.figure.tight_layout()
ax.set_ylabel("")
plt.savefig('romanticism to realism biggest loss translated chr 20.png', dpi=300)

### realist delta champions

In [None]:
real_rom_freqs.sort_values(by='delta_real_to_rom', ascending=False)[:30][['delta_real_to_rom', 
                                                                          'rel_freq_rom', 
                                                                          'rel_freq_real',
                                                                         'label_real']]

In [None]:
real_rom_freqs = real_rom_freqs.query('not label_real.isin(["Ростов"])')

In [None]:
top_real_delta = real_rom_freqs.sort_values(by='delta_real_to_rom')[-30:][['delta_real_to_rom', 
                                                                          'frequency_rom', 
                                                                          'frequency_real',
                                                                         'label_real']]

In [None]:
top_real_delta

In [None]:
top_real_delta.loc['59.56841_30.122892', 'label_real']

In [None]:
top_real_delta.loc['59.56841_30.122892', 'label_real'] = 'Гатчина'

#### Add eng labels (4th time)

In [None]:
list(top_real_delta['label_real'])

In [None]:
# English names for the 30 rows of top_real_delta, written in the same order as
# list(top_real_delta['label_real']). The list is attached to the frame by
# position, so re-check it whenever the data or the filters change.
real_delta_eng_labels = [
    "Oranienbaum",
    "Ufa",
    "Geneva",
    "Vasilievsky Island",
    "Bulgaria",
    "Switzerland",
    "Naples",
    "Winter Palace",
    "Saratov",
    "Ural",
    "Crimea",
    "Gatchina",
    "Riga",
    "Odesa",
    "Peterhof",
    "Oka",
    "Austria",
    "Türkiye",
    "Vienna",
    "Berlin",
    "England",
    "Astrakhan",
    "France",
    "Sevastopol",
    "Rome",
    "Kazan",
    "Volga",
    "Siberia",
    "Paris",
    "Petersburg",
]

In [None]:
add_bilingual_labels(top_real_delta, real_delta_eng_labels, top_real_delta['label_real'])

In [None]:
top_real_delta

#### Plot

In [None]:
top_real_delta.plot.barh(y='delta_real_to_rom',x='label bilingual', 
                         title='romanticism to realism: biggest gain',
                        figsize=(8,6))
                                                                       
ax = plt.gca()
ax.figure.tight_layout()
plt.savefig('romanticism to realism biggest gain translated chr.png', dpi=300)

In [None]:
top_real_delta[-20:].plot.barh(y='delta_real_to_rom',x='label bilingual', 
                         title='older to newer:biggest gain',
                        figsize=(6,4))
                                                                       
ax = plt.gca()
ax.figure.tight_layout()
plt.savefig('romanticism to realism biggest gain translated chr 25.png', dpi=300)

In [None]:
# Compact version of the "biggest loss" chart: the last 20 rows of top_rom_delta.
# The title used to read "biggest gain" (copied from the realist chart) although
# this cell plots top_rom_delta and saves to a "biggest loss" file.
# NOTE(review): the file name ends in "25" but 20 rows are plotted — confirm
# which of the two is intended.
top_rom_delta[-20:].plot.barh(y='delta_real_to_rom',x='label bilingual', 
                         title='older to newer:biggest loss',
                        figsize=(6,4))

ax = plt.gca()
ax.figure.tight_layout()
plt.savefig('romanticism to realism biggest loss translated chr 25.png', dpi=300)

In [None]:
top_real_delta[-20:].plot.barh(y='delta_real_to_rom',x='label bilingual',  
                        title='romanticism to realism biggest gain',
                       figsize=(8,6))
                                                                       
ax = plt.gca()
ax.figure.tight_layout()
ax.set_ylabel("")
plt.savefig('romanticism to realism biggest gain translated chr 20.png', dpi=300)

### 6. Plot bubble map with top 20 winners and losers (plotting figure 4)

In [None]:
top_real_delta[-20:]

In [None]:
real_rom_freqs.sort_values(by='delta_real_to_rom')[:20]

In [None]:
forbubble = real_rom_freqs.sort_values(by='delta_real_to_rom')[:21]

In [None]:
# drop Russia as a whole, makes no sense to put it in the Geo center of Russia
forbubble = forbubble.drop("66.0_94.0") 

In [None]:
forbubble_real = real_rom_freqs.sort_values(by='delta_real_to_rom', ascending=False)[:20]

In [None]:
forbubble_real

In [None]:
# Centre the map on the mean of the coordinates that are actually drawn:
# romanticism coordinates for the blue bubbles, realism ones for the red.
# (Previously the latitude column was passed for both latitude and longitude.)
center_lat = pd.concat([forbubble["lat_rom"], forbubble_real["lat_real"]]).mean()
center_long = pd.concat([forbubble["long_rom"], forbubble_real["long_real"]]).mean()
m = folium.Map(location=[center_lat, center_long], zoom_start=10)

# One pass per period: (rows to draw, column suffix, fill colour).
# Blue = locations from `forbubble`, red = locations from `forbubble_real`.
bubble_layers = (
    (forbubble, "rom", "blue"),
    (forbubble_real, "real", "red"),
)

for frame, period, fill in bubble_layers:
    for _, row in frame.iterrows():
        folium.CircleMarker(
            location=[row[f"lat_{period}"], row[f"long_{period}"]],
            radius=17,  # Fixed bubble size (not scaled by frequency)
            color="black",  # Outline color
            fill=True,
            fill_color=fill,  # Inside color
            fill_opacity=0.7,  # Transparency
            popup=f"Place: {row[f'label_{period}']}"  # Show the place name on click
        ).add_to(m)

# Save map as HTML file
m.save("bubble_map.html")