In [None]:
import json
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="whitegrid")
%matplotlib inline
from datetime import datetime

In [None]:
# Load the battle records: the file is read line by line and every non-blank
# line is parsed as one JSON document. The `with` block closes the file handle
# as soon as parsing is done (the bare open() call used to leak it), and the
# explicit encoding keeps decoding independent of the platform default.
with open("../datasets/battle-features-0.json", encoding="utf-8") as battle_file:
    battles = pd.DataFrame(
        [json.loads(line) for line in battle_file if line.strip()]
    )

In [None]:
# Working copy of the raw records in which every 0 is replaced by NaN, so the
# notnull()/fillna() steps used later in the notebook treat those cells as missing.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
df = pd.DataFrame(battles).replace(0, np.nan)

# Widen the column display so long cell values are not truncated in the preview.
pd.options.display.max_colwidth = 200
df[['combatant_list_1','combatant_list_2']].head()

In [None]:
def find_subtext(df, txt):
    contains = df.stack().str.contains(txt).unstack()
    return contains[contains.any(1)].idxmax(1)

def get_duration(battle):
    """Return the length of a battle in days.

    Parameters
    ----------
    battle : object
        Anything exposing ``start_date`` and ``end_date`` attributes (for
        example a DataFrame row). Dates are ``"%Y-%m-%d"`` strings.

    Returns
    -------
    int
        ``end_date - start_date`` in whole days, or ``1`` when the end date is
        missing. A missing end date may be the ``"None"`` placeholder string,
        an empty string, ``None`` or NaN.

    Raises
    ------
    ValueError
        If a present date does not follow the ``"%Y-%m-%d"`` format.
    """
    end_date = battle.end_date
    if isinstance(end_date, str):
        end_missing = end_date in ("", "None")
    else:
        # Real None / NaN values used to fall through to strptime and crash.
        end_missing = end_date is None or pd.isna(end_date)
    if end_missing:
        return 1

    date_format = "%Y-%m-%d"
    started = datetime.strptime(battle.start_date, date_format)
    ended = datetime.strptime(end_date, date_format)
    return (ended - started).days


def get_year(date):
    """Extract the calendar year from a ``"%Y-%m-%d"`` date string.

    Parameters
    ----------
    date : str or None
        Date string. Missing values may be given as ``None``, NaN, an empty
        string, any other falsy value, or the ``"None"`` placeholder string.

    Returns
    -------
    int or None
        The year, or ``None`` when the date is missing.

    Raises
    ------
    ValueError
        If a present date does not follow the ``"%Y-%m-%d"`` format.
    """
    if isinstance(date, str):
        missing = date in ("", "None")
    else:
        # NaN is truthy, so a plain truthiness test lets it reach strptime.
        missing = date is None or pd.isna(date) or not date
    if missing:
        return None
    return datetime.strptime(date, "%Y-%m-%d").year

In [None]:
cols_comba = ['combatant_first_1', 'combatant_first_2', 'combatant_list_1', 'combatant_list_2', 'combatant_list_3']

# --- Select the battles to analyse -------------------------------------------
# Each step rebinds `df` to a new frame instead of mutating a filtered slice in
# place, which avoids SettingWithCopyWarning.
# The mask is built from df itself (not from `battles`) so it always shares
# df's index, including when this cell is executed more than once.
df = df[df.start_date.notnull()]
# isUSA: label of the first column whose text mentions the United States;
# NaN for rows where no column matches.
df = df.assign(isUSA=find_subtext(df, 'United States'))
df = df[df.start_date.notnull() & df.isUSA.notnull()]
# From here on, missing cells are represented by the string "None".
df = df.fillna(value="None")

df = df.assign(year=df["start_date"].apply(get_year))
df = df[(df.year >= 1000) & (df.year <= 2018)]

# --- Derived columns ----------------------------------------------------------
df = df.assign(duration=[get_duration(battle) for _, battle in df.iterrows()])

first_year, last_year = int(df.year.min()), int(df.year.max())
decades = range(first_year, last_year + 10, 10)
centuries = range(first_year, last_year + 100, 100)
# include_lowest=True closes the first bin on the left; without it the earliest
# year sits exactly on the open edge and its decade/century would be NaN.
df = df.assign(
    decade=pd.cut(df["year"], list(decades), include_lowest=True),
    century=pd.cut(df["year"], list(centuries), include_lowest=True),
)

# Keep only battles that lasted at most 100 days.
df = df[df.duration <= 100]

# --- Timeline data ------------------------------------------------------------
# For every battle, `duration` extra rows are added that carry only a `year`
# value (the battle's year + 1). All extra rows are built at once and
# concatenated in a single step: DataFrame.append no longer exists in
# pandas 2.x, and growing a frame row by row is quadratic.
# NOTE(review): every extra row uses year + 1 regardless of its position within
# the duration, so the extra rows of one battle all land on the same x value.
# This mirrors the previous loop exactly - confirm that it is intended.
repeat_counts = df["duration"].astype(int).clip(lower=0).to_numpy()
extra_years = np.repeat(df["year"].astype(int).to_numpy() + 1, repeat_counts)
df_USAFights = pd.concat(
    [df, pd.DataFrame({"year": extra_years})],
    ignore_index=True,
)

# --- Timeline plot ------------------------------------------------------------
X = df_USAFights["year"]
fig, ax = plt.subplots(figsize=(20, 1))
# One vertical tick per row, all drawn on the same horizontal line.
ax.scatter(X, [1] * len(X), marker='|', linewidth=10, s=100)

# Strip everything but the bottom (year) axis.
ax.yaxis.set_visible(False)
for side in ('right', 'left', 'top'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_ticks_position('bottom')
ax.get_yaxis().set_ticklabels([])

# Numeric input is required here: current pandas rejects a string combined
# with an explicit `unit`.
day = pd.to_timedelta(1, unit='D')
ax.set_xlim(1750, 2018)
plt.show()

In [None]:
import operator

# Total number of battle days per combatant: every name found in one of the
# three combatant-list columns is credited with the duration of that battle.
couts = dict()
combatant_columns = ['combatant_list_1', 'combatant_list_2', 'combatant_list_3']
rows = zip(*(df[column] for column in combatant_columns), df['duration'])
for *combatant_lists, duration in rows:
    days = int(duration)
    for combatants in combatant_lists:
        if isinstance(combatants, str):
            # Missing lists were replaced by the string "None" earlier in the
            # notebook; iterating over a string would count its individual
            # characters as combatants.
            if combatants == "None":
                continue
            # presumably a bare string is a single combatant name - TODO confirm
            combatants = [combatants]
        elif not isinstance(combatants, (list, tuple)):
            # Anything else (e.g. NaN) carries no combatant names.
            continue
        for name in combatants:
            couts[name] = couts.get(name, 0) + days

# The 20 combatants with the most accumulated battle days, largest first.
sorted_couts = sorted(couts.items(), key=operator.itemgetter(1), reverse=True)
if sorted_couts:
    key, value = zip(*sorted_couts[0:20])
    sns.barplot(y=list(key), x=list(value), color="blue")
else:
    # Nothing to plot; keep the names defined so later cells do not fail.
    key, value = (), ()