In [None]:
import json
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from folium.plugins import HeatMap
import folium as fl
from datetime import datetime
import math
from matplotlib.transforms import Bbox

sns.set(style="whitegrid")
%matplotlib inline

In [None]:
def densplot(columns, xlabel, title, axo):
    """Draw one seaborn distribution plot per entry of ``columns`` on ``axo``.

    Parameters
    ----------
    columns : iterable
        Data sets to plot; each one is handed to ``sns.distplot`` unchanged.
    xlabel : str
        Text of the x-axis label (set with font size 12).
    title : str
        Title of the axes.
    axo : matplotlib axes
        Axes that receives every plot, the title and the x label.
    """
    for position, values in enumerate(columns):
        # Each KDE curve is labelled with the position of its data set.
        sns.distplot(values, ax=axo, kde_kws=dict(label=position))
    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    
def full_extent(ax, pad=0.0):
    """Get the full extent of an axes, including axes labels, tick labels, and
    titles.

    Parameters
    ----------
    ax : matplotlib axes
        Axes whose surrounding box is wanted.
    pad : float, optional
        Relative padding; the box is scaled by ``1.0 + pad`` in both
        directions (0.0 leaves it unchanged).

    Returns
    -------
    matplotlib.transforms.Bbox
        Union of the window extents of the axes, its title, both axis labels
        and all tick labels, expanded by ``pad``.
    """
    # For text objects, we need to draw the figure first, otherwise the extents
    # are undefined.
    ax.figure.canvas.draw()
    items = ax.get_xticklabels() + ax.get_yticklabels()
    # Every artist is listed exactly once: a union is unaffected by repeated
    # members, so listing the axes and title twice only costs extra work.
    items += [ax, ax.title, ax.xaxis.label, ax.yaxis.label]
    bbox = Bbox.union([item.get_window_extent() for item in items])

    return bbox.expanded(1.0 + pad, 1.0 + pad)

In [None]:
# Output directory for exported figures. The trailing slash is required
# because file names are appended by plain string concatenation.
FOLDER = "../report/figures/"

In [None]:
# Load the battle records; the file holds one JSON object per line. The
# context manager closes the file handle once every line has been parsed.
with open("../datasets/battle-features-1.json") as dataset_file:
    battles = pd.DataFrame([json.loads(line) for line in dataset_file])
# Drop battles without a start date, then turn every remaining missing value
# into the string "None". fillna() returns a new frame here, so nothing is
# modified in place on a filtered slice.
battles = battles[battles.start_date.notnull()].fillna(value="None")
def get_year(date):
    """Return the year of a ``YYYY-MM-DD`` date string.

    A falsy ``date`` (``None`` or an empty string) yields ``None``. Any other
    value that does not match the format makes ``strptime`` raise ValueError.
    """
    if not date:
        return None
    parsed = datetime.strptime(date, "%Y-%m-%d")
    return parsed.year

def get_duration(battle):
    """Return the length of a battle in days.

    ``battle`` must expose ``start_date`` and ``end_date`` attributes holding
    ``YYYY-MM-DD`` strings. When ``end_date`` is the string "None" or equals
    ``start_date`` the duration is 1; otherwise it is the number of days
    between the two dates.
    """
    end = battle.end_date
    if end != "None" and end != battle.start_date:
        date_format = "%Y-%m-%d"
        finished = datetime.strptime(end, date_format)
        started = datetime.strptime(battle.start_date, date_format)
        return (finished - started).days
    return 1

# assign() returns a new frame, so the year column is added without writing
# into a frame that may itself be a filtered slice.
battles = battles.assign(year=battles["start_date"].apply(get_year))

# Keep battles from 1000 through 2018. copy() makes the result an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning.
battles = battles[(battles.year >= 1000) & (battles.year <= 2018)].copy()

# Duration in days (see get_duration) and its base-10 logarithm.
battles["duration"] = [get_duration(battle) for _, battle in battles.iterrows()]

battles["log_duration"] = np.log10(battles.duration)

# Bin edges start on a round decade / century at or below the earliest year
# and always extend past the latest year, so every battle falls into a bin.
first_year = int(battles.year.min())
last_year = int(battles.year.max())
decades = list(range(first_year - first_year % 10, last_year + 11, 10))
centuries = list(range(first_year - first_year % 100, last_year + 101, 100))

# right=False makes each bin [start, start + width), so a battle is labelled
# with the decade / century that contains its year. With the default
# right-closed bins the earliest year would fall outside every bin and a year
# such as 1940 would be labelled 1930.
battles["decade"] = pd.cut(battles["year"], decades, right=False, labels=decades[:-1])
battles["century"] = pd.cut(battles["year"], centuries, right=False, labels=centuries[:-1])

In [None]:
# Per-battle totals: sum of the four numbered killed_* / casualties_* columns.
battles["killed"] = battles.killed_1 + battles.killed_2 + battles.killed_3 + battles.killed_4
casualties_total = battles.casualties_1 + battles.casualties_2 + battles.casualties_3 + battles.casualties_4
# A total of 0 becomes NaN so that log10 below produces NaN rather than -inf.
# The replaced Series is assigned back explicitly: calling replace(inplace=True)
# on `battles.casualties` is not guaranteed to update the frame itself.
battles["casualties"] = casualties_total.replace(to_replace=0, value=np.nan)
# Sanity check: no zero totals should remain after the replacement.
print((battles.casualties == 0).sum())
battles["log_casualties"] = np.log10(battles.casualties)
battles.log_casualties.plot.hist()
plt.title('Frequency of Number of Casualties on a base 10 logarithm scale on the entire dataset')

In [None]:
# Horizontal bar chart of the number of battles recorded in each decade.
fig, ax = plt.subplots(figsize=(10, 16))
decade_counts = battles["decade"].value_counts()
sns.barplot(x=decade_counts, y=decade_counts.index, ax=ax, color="blue")

In [None]:
#battles.duration.value_counts().head(100).plot(kind="bar")

#sns.distplot(battles.duration)  
# Distribution plot of the log10 battle durations.
fig, ax = plt.subplots()
sns.distplot(battles["log_duration"], ax=ax)
# NOTE(review): the plotted values are log10(duration), while the x label
# says only 'duration' - confirm that this wording is intended.
ax.set_ylabel('frequency', fontsize=12)
ax.set_xlabel('duration', fontsize=12)
plt.show()
# The file name has no extension, so matplotlib falls back to its default
# output format when writing the figure.
fig.savefig('log duration of battles')

In [None]:
#NOT USED to plot

# Three stacked panels of the mean log10 casualties: per century (years after
# 1000), per decade (years after 1900) and per year (years after 1995).
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 20), sharey=True)

# `b` is a short alias of the full battles frame.
b = battles

cascent = b[b.year > 1000].groupby('century')['log_casualties'].mean().dropna()
casdec = b[b.year > 1900].groupby('decade')['log_casualties'].mean().dropna()
casyear = b[b.year > 1995].groupby('year')['log_casualties'].mean().dropna()
sns.set_style("whitegrid")
for panel, means in zip((ax1, ax2, ax3), (cascent, casdec, casyear)):
    sns.pointplot(x=np.array(means.index), y=means, ax=panel)

# The first panel is the only one exported, so it gets the larger fonts.
for panel, label_size in zip((ax1, ax2, ax3), (20, 12, 12)):
    panel.set_ylabel('Number of Casualties (log)', fontsize=label_size)
    panel.set_xlabel('Year', fontsize=label_size)
ax1.tick_params(labelsize=15)

# full_extent() works in display units; the inverted dpi transform converts
# the box to inches, which is what bbox_inches expects.
test1 = full_extent(ax1).transformed(fig.dpi_scale_trans.inverted())
fig.savefig('casThByCent.eps', bbox_inches=test1)

In [None]:
#NOT USED to plot
# Draft of the duration figure: mean log10 battle duration per century,
# written as EPS into the current working directory.
fig, ax1 = plt.subplots(figsize=(10, 3), sharey=True)
sns.set(color_codes=True)

# Mean log10 duration for several groupings; only `bdec` (per century) is
# drawn below, the other series are computed but not plotted in this cell.
since_1900 = battles[battles.year >= 1900]
since_1996 = battles[battles.year >= 1996]
around_ww2 = battles[(battles.year >= 1935) & (battles.year <= 1945)]

bdec = battles.groupby('century')['log_duration'].mean().dropna()
bcent = since_1900.groupby('decade')['log_duration'].mean().dropna()
byearnow = since_1996.groupby('year')['log_duration'].mean().dropna()
byearww2 = around_ww2.groupby('year')['log_duration'].mean().dropna()

sns.set_style("whitegrid")
sns.pointplot(ax=ax1, x=np.array(bdec.index), y=np.array(bdec))

ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Duration Mean (log)', fontsize=20)
ax1.set_ylim(0, 1)
ax1.tick_params(labelsize=15)

fig.tight_layout()
fig.savefig('durThByCent.eps')

In [None]:

# Mean log10 battle duration per century, exported as EPS into FOLDER.
fig, ax1 = plt.subplots(figsize=(10, 3), sharey=True)
sns.set(color_codes=True)

# Row masks for the period-restricted series below.
from_1900 = battles.year >= 1900
from_1996 = battles.year >= 1996
ww2_years = (battles.year >= 1935) & (battles.year <= 1945)

# Only `bdec` (per century) is plotted; the remaining series are computed
# but not drawn in this cell.
bdec = battles.groupby('century')['log_duration'].mean().dropna()
bcent = battles[from_1900].groupby('decade')['log_duration'].mean().dropna()
byearnow = battles[from_1996].groupby('year')['log_duration'].mean().dropna()
byearww2 = battles[ww2_years].groupby('year')['log_duration'].mean().dropna()

sns.set_style("whitegrid")
century_labels = np.array(bdec.index)
century_means = np.array(bdec)
sns.pointplot(x=century_labels, y=century_means, ax=ax1)

ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Duration Mean (log)', fontsize=20)
ax1.tick_params(labelsize=15)
ax1.set_ylim(0, 1)

fig.tight_layout()
fig.savefig(FOLDER + 'durThByCent.eps')

In [None]:
#NOT used to plot
# Draft figure with three stacked panels: share of indecisive battles per
# century, per decade (years after 1900) and per year (years after 1996).
fig, ((ax1),(ax2), (ax3)) = plt.subplots(3, 1, figsize=(10,20), sharey=True)


# A battle is selected when both result fields are empty strings or when its
# `indecisive` column is true (assumed to be a boolean column - TODO confirm).
battle_inc = battles[((battles.result_combatant_1 == "") & (battles.result_combatant_2 == "")) | battles.indecisive]

# Each series is (selected battles) / (all battles) per period, so the values
# are fractions between 0 and 1. Periods without any battle give 0/0 = NaN
# and are dropped for the decade and year series.
inccent = battle_inc.groupby("century").title.count()/battles.groupby("century").title.count()
incdec = battle_inc[battle_inc.year > 1900].groupby("decade").title.count()/battles[battles.year > 1900].groupby("decade").title.count()
incdec = incdec.dropna()

incyear = battle_inc[battle_inc.year > 1996].groupby("year").title.count()/battles[battles.year > 1996].groupby("year").title.count()
incyear = incyear.dropna()
sns.set_style("whitegrid")
sns.pointplot(x=np.array(inccent.index), y=inccent, ax=ax1)
sns.pointplot(x=np.array(incdec.index), y=incdec, ax=ax2)
sns.pointplot(x=np.array(incyear.index), y=incyear, ax=ax3)
# The plotted quantity is a fraction of battles, not a log-scaled count, so
# the y labels name it as a share.
ax1.set_ylabel('Share of Indecisive Battles', fontsize=20)
ax2.set_ylabel('Share of Indecisive Battles', fontsize=12)
ax3.set_ylabel('Share of Indecisive Battles', fontsize=12)
ax1.set_xlabel('Year', fontsize=20)
ax2.set_xlabel('Year', fontsize=12)
ax3.set_xlabel('Year', fontsize=12)
ax1.tick_params(labelsize=15)
# Bounding boxes in inches for each panel; only the first one is exported.
test1 = full_extent(ax1).transformed(fig.dpi_scale_trans.inverted())
test2 = full_extent(ax2).transformed(fig.dpi_scale_trans.inverted())
test3 = full_extent(ax3).transformed(fig.dpi_scale_trans.inverted())
fig.savefig('indThByCent.png', bbox_inches=test1)
#fig.savefig('indCentBydec.png', bbox_inches=test2)
#fig.savefig('indTwByYr.png', bbox_inches=test3)

In [None]:
# Percentage of indecisive battles per century.
fig, ax1 = plt.subplots(1, 1, figsize=(10, 3))

# Selected rows: both result fields are empty strings, or the `indecisive`
# column is true.
both_results_empty = (battles.result_combatant_1 == "") & (battles.result_combatant_2 == "")
battle_inc = battles[both_results_empty | battles.indecisive]

# Fraction of selected battles among all battles of each century.
selected_per_century = battle_inc.groupby("century")["title"].count()
total_per_century = battles.groupby("century")["title"].count()
inccent = selected_per_century / total_per_century

sns.set_style("whitegrid")
# Multiplying by 100 turns the fraction into the percentage named on the axis.
sns.pointplot(ax=ax1, x=np.array(inccent.index), y=inccent * 100)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylabel('Indeci. Battles (%)', fontsize=20)
ax1.tick_params(labelsize=15)
fig.tight_layout()
# Export is currently disabled:
#fig.savefig(FOLDER+'indThByCent.eps')

In [None]:
# Mean battle duration (days) for years strictly between 1900 and 2000.
# Reads `battles` directly instead of the alias `b`, which an earlier cell
# binds to a value-counts Series and only a later cell rebinds to `battles`.
battles[(battles.year > 1900) & (battles.year < 2000)].duration.mean()

In [None]:
# Keep battles where, for sides 1 and 2, both strength and casualties exceed 1
# and casualties are smaller than strength. CasualtiesPercent is the average
# of the two sides' casualties-to-strength percentages. assign() adds the
# column on a new frame, which avoids the SettingWithCopyWarning raised when
# a column is set on a query() result.
battlesCasuStren = (
    battles
    .query('(casualties_1 >1 and strength_1 >1) and strength_1 > casualties_1')
    .query('(casualties_2 >1 and strength_2 >1) and strength_2 > casualties_2')
    .assign(
        CasualtiesPercent=lambda df: (
            (df['casualties_1'] * 100 / df['strength_1'])
            + (df['casualties_2'] * 100 / df['strength_2'])
        ) / 2
    )
)
# Mean casualties percentage per century; centuries without data are dropped.
bdec = battlesCasuStren.groupby('century').CasualtiesPercent.mean().dropna()

fig, ((ax1)) = plt.subplots(1, 1, figsize=(10,3), sharey=False)
sns.set_style("whitegrid")
sns.pointplot(x=np.array(bdec.index), y=np.array(bdec), ax=ax1)
ax1.set_ylabel('Casualties (%)', fontsize=20)
ax1.set_xlabel('Year', fontsize=20)
ax1.set_ylim(0,45)
ax1.tick_params(labelsize=15)
fig.tight_layout()
#fig.savefig(FOLDER+'CasuPerCent.eps')