In [2]:
import json
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline

In [3]:
def densplot(columns, xlabel, title, axo):
    """Overlay one seaborn distribution plot per data series on a single axis.

    Parameters
    ----------
    columns : iterable
        The data series to draw; each one is passed to ``sns.distplot``.
        The KDE curve of every series is labelled with the position of the
        series inside ``columns`` (0, 1, 2, ...).
    xlabel : str
        Text for the x-axis label (drawn with font size 12).
    title : str
        Title of the axis.
    axo : axis object
        Axis on which everything is drawn.
    """
    for position, series in enumerate(columns):
        kde_options = {"label": position}
        sns.distplot(series, ax=axo, kde_kws=kde_options)

    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    
def scatplot(xelem, yelem, xlabel, ylabel, title, axo, polyfit=None, xlim=None, ylim=None):
    """Draw a labelled scatter plot, optionally with a polynomial trend line.

    Parameters
    ----------
    xelem, yelem : sequence of numbers
        Coordinates of the points to scatter.
    xlabel, ylabel : str
        Axis labels (drawn with font size 12).
    title : str
        Title of the axis.
    axo : axis object
        Axis on which everything is drawn.
    polyfit : int, optional
        Degree of the polynomial fitted to the points. When truthy, the
        fitted curve is evaluated at the distinct x values and plotted with
        the ``'C2'`` format string. ``None`` (or 0) draws no curve.
    xlim, ylim : number, optional
        When truthy, the corresponding axis is limited to ``[0, value]``.
    """
    axo.scatter(xelem, yelem)

    if polyfit:
        distinct_x = np.unique(xelem)
        coefficients = np.polyfit(xelem, yelem, polyfit)
        trend = np.poly1d(coefficients)
        axo.plot(distinct_x, trend(distinct_x), 'C2')

    if xlim:
        axo.set_xlim(0, xlim)
    if ylim:
        axo.set_ylim(0, ylim)

    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    axo.set_ylabel(ylabel, fontsize=12)

In [6]:
# Load the battle records. The dataset holds one JSON object per line, so each
# line is parsed on its own; the `with` block closes the file handle as soon as
# the records have been read instead of leaving it open.
with open("../datasets/battle-features-0.json") as battles_file:
    battles = pd.DataFrame([json.loads(line) for line in battles_file])

# Columns relevant to the casualties vs. result comparison.
interest_col = ["casualties_1", "casualties_2", "result_combatant_1", "result_combatant_2"]
battles[interest_col].head()

Unnamed: 0,casualties_1,casualties_2,result_combatant_1,result_combatant_2
0,230,10500,decisive victory,
1,0,0,victory,
2,0,4500,decisive victory,
3,1050,250,,decisive victory
4,10786,10897,victory,


In [102]:
# Keep the battles where both sides report more than one casualty and at least
# one of the two result fields is not an empty string.
casualty_filter = (
    "casualties_1 > 1 and casualties_2 > 1 "
    "and (result_combatant_1 != '' or result_combatant_2 != '')"
)
casuresu = battles.query(casualty_filter)
casuresu[interest_col].head()

Unnamed: 0,casualties_1,casualties_2,result_combatant_1,result_combatant_2
0,230,10500,decisive victory,
3,1050,250,,decisive victory
4,10786,10897,victory,
5,3663,19000,victory,
6,2000,5450,,decisive victory
7,12484,27190,decisive victory,
9,600,2010,victory,
10,1999,100,,decisive victory
11,30000,1000,,decisive victory
12,2500,5000,,decisive victory


In [101]:
# A side counts as the winner when its result text contains 'icto'
# (e.g. "victory", "decisive victory").
first_side_won = casuresu["result_combatant_1"].str.contains('icto')
second_side_won = casuresu["result_combatant_2"].str.contains('icto')

# Which side suffered strictly more casualties.
first_side_bled_more = casuresu["casualties_1"] > casuresu["casualties_2"]
second_side_bled_more = casuresu["casualties_2"] > casuresu["casualties_1"]

# Battles won by the side with the higher casualties.
costly_victory = (first_side_won & first_side_bled_more) | (second_side_won & second_side_bled_more)
painfulwin = len(casuresu.loc[costly_victory])

# Expressed as a percentage of all the battles kept by the casualty filter.
painfulwinpercent = painfulwin*100/len(casuresu)
print("The combatants that had more casualties only won in ", painfulwinpercent, "% of the cases")

The combatant that had more casualties only won in  26.056338028169016 % of the cases


We follow a similar process for the relationship between strength and results.

In [104]:
# Columns relevant to the strength vs. result comparison.
interest_col = ["strength_1", "strength_2", "result_combatant_1", "result_combatant_2"]

# Keep the battles where both sides report a strength above one and at least
# one of the two result fields is not an empty string.
strength_filter = (
    "strength_1 > 1 and strength_2 > 1 "
    "and (result_combatant_1 != '' or result_combatant_2 != '')"
)
strenresu = battles.query(strength_filter)
strenresu[interest_col].head()

Unnamed: 0,strength_1,strength_2,result_combatant_1,result_combatant_2
0,22000,42500,decisive victory,
2,6000,11000,decisive victory,
3,2700,2000,,decisive victory
4,47561,10900,victory,
5,62000,60000,victory,


In [105]:
# A side counts as the winner when its result text contains 'icto'
# (e.g. "victory", "decisive victory").
first_army_won = strenresu["result_combatant_1"].str.contains('icto')
second_army_won = strenresu["result_combatant_2"].str.contains('icto')

# Which side fielded strictly fewer soldiers.
first_army_smaller = strenresu["strength_1"] < strenresu["strength_2"]
second_army_smaller = strenresu["strength_2"] < strenresu["strength_1"]

# Battles won by the side with the smaller army.
underdog_victory = (first_army_won & first_army_smaller) | (second_army_won & second_army_smaller)
hardwin = len(strenresu.loc[underdog_victory])

# Expressed as a percentage of all the battles kept by the strength filter.
hardwinpercent = hardwin*100/len(strenresu)
print("The combatants that had more soldiers lost in ", hardwinpercent, "% of the cases")

The combatants that had more soldiers only lost in  47.25495111556781 % of the cases


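**We observe that the number of casualties seems to be more strongly related to the outcome of a battle than the number of soldiers: the combatant with more casualties won in only about 26% of the cases, whereas the combatant with more soldiers lost in about 47% of the cases, i.e. it won in at most about 53% of them.**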