In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
import mwparserfromhell as mw
import seaborn as sns
import matplotlib.pyplot as plt
import re
sns.set(style="whitegrid")
%matplotlib inline

%load_ext autoreload
%autoreload 2
import processing # this makes the "processing" folder available as a python module

from features.casualties import get_features
#from features.casualtiesOLD import get_casualties, get_features

In [None]:
def densplot(columns, xlabel, title, axo):
    """Overlay the distribution of each entry of ``columns`` on a single axes.

    Args:
        columns: iterable of array-likes; every entry is drawn with
            ``sns.distplot`` and its position in the iterable is used as the
            KDE label.
        xlabel: text of the x-axis label.
        title: text of the axes title.
        axo: axes object the distributions are drawn on.
    """
    for position, values in enumerate(columns):
        kde_options = {"label": position}
        sns.distplot(values, ax=axo, kde_kws=kde_options)
    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    
def scatplot(xelem, yelem, xlabel, ylabel, title, axo, polyfit=None):
    """Draw a scatter plot on ``axo``, optionally with a polynomial trend line.

    Args:
        xelem: x coordinates of the points.
        yelem: y coordinates of the points.
        xlabel: text of the x-axis label.
        ylabel: text of the y-axis label.
        title: text of the axes title.
        axo: axes object to draw on.
        polyfit: degree passed to ``np.polyfit``; when falsy (``None`` or
            ``0``) no trend line is drawn.
    """
    axo.scatter(xelem, yelem)
    if polyfit:
        fit_x = np.unique(xelem)
        trend = np.poly1d(np.polyfit(xelem, yelem, polyfit))
        # Draw on the axes that was passed in; pyplot's implicit "current
        # axes" may be a different subplot of the figure.
        axo.plot(fit_x, trend(fit_x), 'C2')
    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    axo.set_ylabel(ylabel, fontsize=12)

In [None]:
# Load the battle records; the context manager closes the file handle as soon
# as the JSON has been parsed.
with open("../datasets/battle-fields-0.json") as battles_file:
    battles = json.load(battles_file)

# One row per infobox, skipping infoboxes that carry a truthy "error" entry.
df = pd.DataFrame([b["infobox"] for b in battles if not b["infobox"].get("error")])

# Keep only the columns whose name starts with "casualties" followed by a digit.
pattern = re.compile("^casualties[0-9]")
df = df[[column for column in df.columns if pattern.match(column)]]

In [None]:
# Parse the raw `casualties1` text. `missed1` is reported further down as the
# number of rows that contain a number but were not parsed.
# NOTE(review): `get_casualties` is not imported by the visible import cell
# (only `get_features` is) - confirm where it is supposed to come from.
# Presumably it also adds a `casualties_1` column to `df` in place, since that
# column is read later - verify against its implementation.
missed1 = get_casualties(df, df["casualties1"], "1")

In [None]:
# Same parsing step for the raw `casualties2` column.
missed2 = get_casualties(df, df["casualties2"], "2")

In [None]:
# Same parsing step for the raw `casualties3` column.
missed3 = get_casualties(df, df["casualties3"], "3")

In [None]:
# Same parsing step for the raw `casualties4` column.
missed4 = get_casualties(df, df["casualties4"], "4")

In [None]:
# Write the current frame as CSV to a file named "casualties" (no extension),
# relative to the working directory.
df.to_csv(path_or_buf="casualties")

In [None]:
# Report, for the four raw casualties columns in order, the counts returned by
# the parsing step.
print(
    "For each casualties columns we have ",
    missed1, " ", missed2, " ", missed3, " ", missed4,
    " lines that contain a number but are not parsed.",
)


This means that almost 100% of the values in each column have been parsed:

In [None]:
# Percentage of rows per raw casualties column that were parsed:
# 100 minus the share of rows counted in `missedN`.
n_rows = len(df)
toPlot = {
    'casualties1': 100 - 100 * missed1 / n_rows,
    'casualties2': 100 - 100 * missed2 / n_rows,
    'casualties3': 100 - 100 * missed3 / n_rows,
    'casualties4': 100 - 100 * missed4 / n_rows,
}

# Dict views are materialised as lists: they are not sequences, and some
# matplotlib versions do not accept them as bar heights / tick labels.
plt.bar(range(len(toPlot)), list(toPlot.values()))
plt.title("Percent of parsed rows")
plt.xticks(range(len(toPlot)), list(toPlot.keys()))
plt.show()

In [None]:
# Allow up to 200 characters per cell when frames are displayed.
pd.set_option("display.max_colwidth", 200)

# Each raw `casualtiesN` column next to its `casualties_N` counterpart,
# for N = 1..4.
comparison_columns = []
for side in range(1, 5):
    comparison_columns.extend(["casualties{}".format(side), "casualties_{}".format(side)])
df[comparison_columns].head(10)

However, we also observe that for many battles the casualties are not given numerically:

In [None]:
print("Out of ", len(df), " battles")
# A value of 0 is treated as "no numeric value". `np.nan` is used because the
# `np.NaN` alias no longer exists in NumPy 2.0.
df = df.replace(0, np.nan)

# Number of battles without a numeric value, per parsed column. These names are
# kept at module level so later cells can reuse them.
c1_sum_null = df['casualties_1'].isnull().sum()
c2_sum_null = df['casualties_2'].isnull().sum()
c3_sum_null = df['casualties_3'].isnull().sum()
c4_sum_null = df['casualties_4'].isnull().sum()
print("Casualties1 has ", c1_sum_null, "null values")
print("Casualties2 has ", c2_sum_null, "null values")
print("Casualties3 has ", c3_sum_null, "null values")
print("Casualties4 has ", c4_sum_null, "null values")

# Percentage of battles that do have a numeric value, reusing the counts above.
n_battles = len(df)
toPlot = {
    'casualties1': 100 - 100 * c1_sum_null / n_battles,
    'casualties2': 100 - 100 * c2_sum_null / n_battles,
    'casualties3': 100 - 100 * c3_sum_null / n_battles,
    'casualties4': 100 - 100 * c4_sum_null / n_battles,
}

# Dict views are materialised as lists: they are not sequences, and some
# matplotlib versions do not accept them as bar heights / tick labels.
plt.bar(range(len(toPlot)), list(toPlot.values()))
plt.title("Percent of battles with numeric values for casualties")
plt.xticks(range(len(toPlot)), list(toPlot.keys()))
plt.show()

We also observe that almost 60% of the battles have numeric values for two combatants.

In [None]:
# Average of the non-null values of each parsed column.
# Each column is divided by its OWN number of non-null entries
# (`Series.count()`); dividing every column by the non-null count of
# `casualties_1` gives wrong averages for columns 2-4.
print("averages: ")
for column in ('casualties_1', 'casualties_2', 'casualties_3', 'casualties_4'):
    print(column + ": ", df[column].sum() / df[column].count())

In [None]:
# One row of panels per parsed column: the full distribution on the left and
# the values strictly between 1 and 2000 on the right.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(
    nrows=4, ncols=2, sharey=False, figsize=(15, 20)
)

# (column, title of the full panel, title of the zoomed panel)
panels = [
    ("casualties_1", "casualties_1 ", "casualties_1 under 2000 "),
    ("casualties_2", "casualties_2 ", "casualties_2 under 2000 "),
    ("casualties_3", "casualties_3", "casualties_3 under 2000"),
    ("casualties_4", "casualties_4", "casualties_4 under 2000"),
]
axis_pairs = [(ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)]

for (column, full_title, zoom_title), (full_ax, zoom_ax) in zip(panels, axis_pairs):
    densplot([df[column].dropna()], column, full_title, full_ax)
    zoom = df.query("{0} >1 and {0} <2000".format(column))[column]
    densplot([zoom], column, zoom_title, zoom_ax)

fig.tight_layout()
plt.show()



In [None]:
# Battles where both sides have a parsed value above 1; each filter is
# evaluated once and both columns are taken from the same filtered frame.
both_sides = df.query('casualties_1 >1 and casualties_2>1')
c1 = both_sides['casualties_1']
c2 = both_sides['casualties_2']

# "Zoom" subset: additionally both sides below 25000.
both_sides_zoom = both_sides.query('casualties_1 <25000 and casualties_2 <25000')
c1_zoom = both_sides_zoom['casualties_1']
c2_zoom = both_sides_zoom['casualties_2']

# Absolute difference between the two sides, per battle.
diff = abs(c1 - c2)
diff_zoom = abs(c1_zoom - c2_zoom)

fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(nrows=3, ncols=2, sharey=False, figsize=(10,10))
scatplot(c1, c2, 'casualties_1', 'casualties_2', "Casualties 1 vs. 2", ax1)
scatplot(c1_zoom, c2_zoom, 'casualties_1', 'casualties_2', "Casualties 1 vs. 2 Zoom", ax2)

# Axis labels name the quantity that is actually plotted (the difference),
# not one of the input columns.
densplot([diff], '|c1 - c2|', "Difference between c1 and c2", ax3)
densplot([diff_zoom], '|c1 - c2|', "Difference between c1 and c2 Zoom", ax4)
ax4.set_xlim(0,10000)

# x is only the position of the battle within the filtered subset.
scatplot(range(len(c1)), diff, 'battle (position in subset)', '|c1 - c2|', "Difference between c1 and c2", ax5)

# Difference expressed as a percentage of c1.
# NOTE(review): an x-limit of 10000 on a percentage axis looks very wide - confirm it is intended.
densplot([100*diff_zoom/c1_zoom], '|c1 - c2| as % of c1', "Difference between c1 and c2, distribution", ax6)
ax6.set_xlim(0,10000)
fig.tight_layout()
plt.show()

In [None]:

# Reload the raw battles and show one record next to the output of
# `get_features` for it. The context manager closes the file handle once the
# JSON has been parsed.
with open("../datasets/battle-fields-0.json") as battles_file:
    battles = json.load(battles_file)
print(battles[2])
print(get_features(battles[2]))