In [None]:
import json
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from folium.plugins import HeatMap
import folium as fl
from datetime import datetime

# Apply seaborn's "whitegrid" style as the plotting default.
sns.set(style="whitegrid")
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the battle records. The file holds one JSON object per line; the
# context manager closes the file handle once every line has been parsed.
with open("../datasets/battle-features-1.json") as battle_file:
    battles = pd.DataFrame([json.loads(line) for line in battle_file])

# Drop battles without a start date, then replace every remaining missing
# value with the string sentinel "None" (get_duration compares end_date
# against it). fillna returns a new frame here, so nothing is written in place
# into a filtered slice of the raw data.
battles = battles[battles.start_date.notnull()].fillna(value="None")
def get_year(date):
    """Return the year of a ``YYYY-MM-DD`` date string.

    Falsy input (``None``, empty string) yields ``None`` instead of being
    parsed. Any other value that does not match the format makes
    ``datetime.strptime`` raise.
    """
    if not date:
        return None
    parsed = datetime.strptime(date, "%Y-%m-%d")
    return parsed.year

def get_duration(battle):
    """Return the length of a battle in days.

    Parameters
    ----------
    battle : object with ``start_date`` and ``end_date`` attributes
        Typically one row of the battles frame. Dates are ``YYYY-MM-DD``
        strings.

    Returns
    -------
    int
        ``1`` when the end date is missing or equal to the start date;
        otherwise the difference ``end_date - start_date`` in whole days
        (negative if the end date precedes the start date).

    Raises
    ------
    ValueError
        If a present date does not match ``YYYY-MM-DD``.
    """
    start, end = battle.start_date, battle.end_date

    # A missing end date may arrive as None/NaN, as an empty string, or as the
    # "None" sentinel written by fillna; all of them mean "single-day battle".
    if pd.isna(end) or end in ("", "None") or end == start:
        return 1

    date_format = "%Y-%m-%d"
    elapsed = datetime.strptime(end, date_format) - datetime.strptime(start, date_format)
    return elapsed.days

# Year of each battle's start date. Parsing goes through datetime.strptime
# (get_year) rather than pd.to_datetime because pandas' nanosecond timestamps
# cannot represent dates before 1677. assign() returns a new frame instead of
# writing a column into a filtered slice.
battles = battles.assign(year=battles["start_date"].apply(get_year))

# Keep battles that started between 1000 and 2018 inclusive. The explicit
# copy makes the column assignments below operate on an independent frame.
battles = battles[(battles.year >= 1000) & (battles.year <= 2018)].copy()

# Duration in days (row-wise via get_duration) and its base-10 logarithm.
battles["duration"] = battles.apply(get_duration, axis=1)

battles["log_duration"] = np.log10(battles.duration)

# Bin edges start at the earliest year present and step by 10 / 100 years;
# the "+10" / "+100" guarantees the last edge is not below the latest year.
first_year = int(battles.year.min())
last_year = int(battles.year.max())
decades = range(first_year, last_year + 10, 10)
centuries = range(first_year, last_year + 100, 100)

# Each bin is labelled with its left edge. Bins are right-closed, e.g. the
# bin labelled 1900 covers 1901-1910. include_lowest=True also puts the
# earliest year itself into the first bin; without it those battles would get
# a NaN decade/century and vanish from every grouped plot.
battles["decade"] = pd.cut(
    battles["year"], decades, labels=decades[:-1], include_lowest=True
)
battles["century"] = pd.cut(
    battles["year"], centuries, labels=centuries[:-1], include_lowest=True
)

In [None]:
# Totals across the four numbered killed_* / casualties_* columns.
battles["killed"] = (
    battles["killed_1"] + battles["killed_2"] + battles["killed_3"] + battles["killed_4"]
)
casualties_total = (
    battles["casualties_1"]
    + battles["casualties_2"]
    + battles["casualties_3"]
    + battles["casualties_4"]
)

# A total of 0 is stored as NaN so that log10 below gives NaN instead of -inf.
# The replaced Series is assigned back to the frame: an in-place replace on
# `battles.casualties` acts on a column selection and is not guaranteed to
# update `battles` itself (it does not under pandas copy-on-write).
battles["casualties"] = casualties_total.replace(to_replace=0, value=np.nan)

# Sanity check: number of zero totals left after the replacement (expected 0).
print((battles["casualties"] == 0).sum())

battles["log_casualties"] = np.log10(battles["casualties"])
battles["log_casualties"].plot.hist()

In [None]:
# Bar chart of the number of battles per decade: decades on the y axis,
# counts on the x axis, drawn on a tall figure.
fig, ax = plt.subplots(figsize=(10, 16))
b = battles["decade"].value_counts()
sns.barplot(x=b, y=b.index, ax=ax, color="blue")

In [None]:
# Distribution of battle durations on the log10 scale.
# Disabled alternatives that plot the raw (un-logged) durations:
#battles.duration.value_counts().head(100).plot(kind="bar")
#sns.distplot(battles.duration)
# NOTE(review): distplot is deprecated in recent seaborn releases; confirm the
# installed version before relying on it.
sns.distplot(battles["log_duration"])

In [None]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 10), sharey=True)
# NOTE(review): sns.set() called without a style argument re-applies seaborn's
# default theme, which presumably replaces the "whitegrid" style configured at
# the top of the notebook for all later figures - confirm this is intended.
sns.set(color_codes=True)

# Mean log10 duration at four time resolutions. Groups without any value
# produce NaN means and are dropped before plotting.
by_century = battles.groupby('century').log_duration.mean().dropna()
by_decade_since_1900 = (
    battles[battles.year >= 1900].groupby('decade').log_duration.mean().dropna()
)
by_year_since_1997 = (
    battles[battles.year >= 1997].groupby('year').log_duration.mean().dropna()
)
by_year_1935_1945 = (
    battles[(battles.year >= 1935) & (battles.year <= 1945)]
    .groupby('year')
    .log_duration.mean()
    .dropna()
)

# One point plot per panel; all four share the y axis (sharey=True above).
panels = [
    (ax1, by_century, "Mean log10 duration by century", "century"),
    (ax2, by_decade_since_1900, "Mean log10 duration by decade, from 1900", "decade"),
    (ax3, by_year_since_1997, "Mean log10 duration by year, from 1997", "year"),
    (ax4, by_year_1935_1945, "Mean log10 duration by year, 1935-1945", "year"),
]
for panel_ax, means, title, x_label in panels:
    # Plain arrays for both coordinates so every panel is built the same way.
    sns.pointplot(x=np.array(means.index), y=np.array(means), ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("log_duration")
#battles[["title", "start_date", "end_date", "duration"]].sort_values(by="duration", ascending=False)

In [None]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 10), sharey=True)

# Short alias for the battles frame.
b = battles

# Mean log10 casualties per century, per decade (years after 1900) and per
# year (years after 1995). Groups whose mean is NaN are dropped.
cascent = b[b.year > 1000].groupby('century').log_casualties.mean().dropna()
casdec = b[b.year > 1900].groupby('decade').log_casualties.mean().dropna()
casyear = b[b.year > 1995].groupby('year').log_casualties.mean().dropna()

panels = [
    (ax1, cascent, "Mean log10 casualties by century"),
    (ax2, casdec, "Mean log10 casualties by decade, after 1900"),
    (ax3, casyear, "Mean log10 casualties by year, after 1995"),
]
for panel_ax, means, title in panels:
    sns.pointplot(x=np.array(means.index), y=means, ax=panel_ax)
    panel_ax.set_title(title)

# Only three of the four panels carry a plot; hide the unused one so the
# figure does not show an empty frame.
ax4.set_visible(False)

In [None]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 10), sharey=True)


def _indecisive_share(indecisive, everything, key):
    """Return, per value of `key`, the share of `everything` rows also in `indecisive`.

    Both arguments are battle frames; rows are counted through their `title`
    column. Groups that exist in `everything` but not in `indecisive` count as
    0 indecisive battles (share 0.0) instead of producing NaN. Groups with no
    battles at all (0 / 0) are dropped.
    """
    totals = everything.groupby(key).title.count()
    hits = indecisive.groupby(key).title.count().reindex(totals.index, fill_value=0)
    return (hits / totals).dropna()


# A battle counts as indecisive when both result fields are empty strings or
# when its `indecisive` flag is set.
# NOTE(review): missing values were replaced by the string "None" when the
# data was loaded, so the == "" tests only match results stored as empty
# strings - confirm that is how "no result" is encoded.
battle_inc = battles[((battles.result_combatant_1 == "") & (battles.result_combatant_2 == "")) | battles.indecisive]

# Share of indecisive battles per century, per decade (years after 1900) and
# per year (years after 2000). Numerator and denominator use the same filter.
inccent = _indecisive_share(battle_inc, battles, "century")
incdec = _indecisive_share(
    battle_inc[battle_inc.year > 1900], battles[battles.year > 1900], "decade"
)
incyear = _indecisive_share(
    battle_inc[battle_inc.year > 2000], battles[battles.year > 2000], "year"
)

panels = [
    (ax1, inccent, "Share of indecisive battles by century"),
    (ax2, incdec, "Share of indecisive battles by decade, after 1900"),
    (ax3, incyear, "Share of indecisive battles by year, after 2000"),
]
for panel_ax, share, title in panels:
    sns.pointplot(x=np.array(share.index), y=np.array(share), ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_ylabel("indecisive share")

# Only three of the four panels carry a plot; hide the unused one.
ax4.set_visible(False)