In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
import mwparserfromhell as mw
import seaborn as sns
import matplotlib.pyplot as plt
import re
sns.set(style="whitegrid")
%matplotlib inline

%load_ext autoreload
%autoreload 2
import processing # this makes the "processing" folder available as a python module

from features.casualties import get_casualties, get_casualties_line, get_number
from utils import remove_ref, get_templates

In [None]:
# Load the scraped battle records. The context manager closes the file handle
# as soon as the JSON has been read, and the encoding is pinned to UTF-8 so the
# result does not depend on the machine's locale.
with open("../datasets/battle-fields-0.json", encoding="utf-8") as battles_file:
    battles = json.load(battles_file)

# Keep only the infoboxes that do not carry an "error" entry.
df = pd.DataFrame([b["infobox"] for b in battles if not b["infobox"].get("error")])

# Restrict the frame to the columns whose name starts with "casualties" + digit.
pattern = re.compile("^casualties[0-9]")
# .copy() makes df an independent frame rather than a selection of the full
# infobox table, in case later cells write new columns into it.
df = df[[b for b in df.columns if pattern.match(b)]].copy()

In [None]:
# Parse the raw "casualties1" column. The returned value is reported further
# down as the number of lines that contain a number but were not parsed.
# NOTE(review): presumably get_casualties also writes the parsed totals into df
# (a "total_1" column is read later) — confirm in features.casualties.
missed1 = get_casualties(
    df,
    df["casualties1"],
    "1",
)

In [None]:
# Parse the raw "casualties2" column. The returned value is reported further
# down as the number of lines that contain a number but were not parsed.
# NOTE(review): presumably get_casualties also writes the parsed totals into df
# (a "total_2" column is read later) — confirm in features.casualties.
missed2 = get_casualties(
    df,
    df["casualties2"],
    "2",
)

In [None]:
# Parse the raw "casualties3" column. The returned value is reported further
# down as the number of lines that contain a number but were not parsed.
# NOTE(review): presumably get_casualties also writes the parsed totals into df
# (a "total_3" column is read later) — confirm in features.casualties.
missed3 = get_casualties(
    df,
    df["casualties3"],
    "3",
)

In [None]:
# Parse the raw "casualties4" column. The returned value is reported further
# down as the number of lines that contain a number but were not parsed.
# NOTE(review): presumably get_casualties also writes the parsed totals into df
# (a "total_4" column is read later) — confirm in features.casualties.
missed4 = get_casualties(
    df,
    df["casualties4"],
    "4",
)

In [None]:
# Write the current casualties frame to disk as CSV. The target is a file
# literally named "casualties" (no extension) in the working directory.
df.to_csv(path_or_buf="casualties")

In [None]:
# Report, for the four casualties columns, the counts returned by the parser.
print(
    "For each casualties columns we have ",
    missed1, " ",
    missed2, " ",
    missed3, " ",
    missed4,
    " lines that contain a number but are not parsed.",
)


This means that almost 100% of the values in each column have been parsed:

In [None]:
# Percentage of rows, per casualties column, that the parser handled:
# 100 minus the percentage of rows counted in missed1..missed4.
# The labels are generated from one template so all four bars are spelled
# consistently ("casualties1" .. "casualties4").
toPlot = {
    "casualties{}".format(i): 100 - 100 * missed / len(df)
    for i, missed in enumerate((missed1, missed2, missed3, missed4), start=1)
}

# Materialise the dict views as lists so matplotlib receives plain sequences.
positions = range(len(toPlot))
plt.bar(positions, list(toPlot.values()))
plt.title("Percent of parsed rows")
plt.xticks(positions, list(toPlot.keys()))
plt.show()

In [None]:
# Cap the displayed width of text cells at 20 characters for the preview below.
pd.set_option("display.max_colwidth", 20)

# Show the first ten rows with each "casualtiesN" column next to "total_N".
preview_columns = [
    name
    for side in "1234"
    for name in ("casualties" + side, "total_" + side)
]
df[preview_columns].head(10)

However, we also observe that for many battles the casualties are not given numerically:

In [None]:
print("Out of ", len(df), " battles")

# Treat a value of 0 as missing. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0 and raises AttributeError there.
df = df.replace(0, np.nan)

# Count the missing totals once per column and reuse the counts for both the
# printed report and the chart, instead of recomputing them.
null_counts = [df["total_{}".format(i)].isnull().sum() for i in range(1, 5)]
for i, n_null in enumerate(null_counts, start=1):
    print("Casualties{} has ".format(i), n_null, "null values")

# Percentage of battles with a non-null total, per column. The labels are
# generated from one template so they are spelled consistently.
toPlot = {
    "casualties{}".format(i): 100 - 100 * n_null / len(df)
    for i, n_null in enumerate(null_counts, start=1)
}

# Materialise the dict views as lists so matplotlib receives plain sequences.
positions = range(len(toPlot))
plt.bar(positions, list(toPlot.values()))
plt.title("Percent of battles with numeric values for casualties")
plt.xticks(positions, list(toPlot.keys()))
plt.show()

We also observe that almost 60% of the battles have numeric values for two combatants.

In [None]:
# Scatter the "total_1" and "total_2" values against the row position so the
# two series can be compared on a single set of axes.
fig, ax1 = plt.subplots()

row_positions = range(len(df))
ax1.scatter(
    row_positions, df["total_1"],
    c="b", marker="s", label="casualties_1",
)
ax1.scatter(
    row_positions, df["total_2"],
    c="r", marker="o", label="casualties_2",
)

ax1.set_title("Comparison between casualties of different combatants")
ax1.legend(loc="upper left")
plt.show()

In [None]:
# NOTE(review): this cell reads "combatant_1".."combatant_3" from df, while the
# loading cell of this notebook keeps only columns matching "^casualties[0-9]".
# Confirm where these columns come from before relying on a fresh run.
# Each non-null entry is presumably a list of combatant names (entries are
# iterated element by element below) — verify against the upstream extraction.
cbt_1, cbt_2, cbt_3 = (
    np.array(df[column].dropna())
    for column in ("combatant_1", "combatant_2", "combatant_3")
)
print(len(cbt_1), len(cbt_2), len(cbt_3))

# Stack the three sides into one array of per-battle entries.
cbt_all = np.concatenate((cbt_1, cbt_2, cbt_3))

# Flatten the entries into a single list of individual items.
all_cbt_names = [item for entry in cbt_all for item in entry]
names = pd.Series(all_cbt_names)
print(len(set(all_cbt_names)))

# Horizontal bar chart of the 50 most frequent items.
f, ax = plt.subplots(figsize=(6, 15))
counts = names.value_counts().sort_values(ascending=False).head(50)
sns.barplot(x=counts, y=counts.index, ax=ax)