In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
import mwparserfromhell as mw
import seaborn as sns
import matplotlib.pyplot as plt
import re
sns.set(style="whitegrid")
%matplotlib inline

%load_ext autoreload
%autoreload 2
import processing # this makes the "processing" folder available as a python module

# get_strengths is called by the strength-parsing cells further down, so it has to be
# imported here for the notebook to survive a fresh "Restart & Run All".
# NOTE(review): assumed to live next to get_features in features.strengths — confirm.
from features.strengths import get_features, get_strengths
#from utils import remove_ref, get_templates

In [None]:
# Load the battle records; the context manager closes the file handle as soon as
# parsing is done, and the explicit encoding avoids depending on the platform default.
with open("../datasets/battle-fields-0.json", encoding="utf-8") as battles_file:
    battles = json.load(battles_file)

# One row per infobox, skipping infoboxes that carry a truthy "error" entry.
df = pd.DataFrame(
    [battle["infobox"] for battle in battles if not battle["infobox"].get("error")]
)

# Keep only the columns whose name starts with "strength" followed by a digit.
pattern = re.compile("^strength[0-9]")
df = df[[column for column in df.columns if pattern.match(column)]]

In [None]:
# Parse the raw "strength1" column. The returned value is reported further down as the
# number of rows that contain a number but were not parsed.
# NOTE(review): presumably get_strengths also writes a "strength_1" column into df — confirm.
missed1 = get_strengths(df, df["strength1"], "1")

In [None]:
# Parse the raw "strength2" column. The returned value is reported further down as the
# number of rows that contain a number but were not parsed.
# NOTE(review): presumably get_strengths also writes a "strength_2" column into df — confirm.
missed2 = get_strengths(df, df["strength2"], "2")

In [None]:
# Parse the raw "strength3" column. The returned value is reported further down as the
# number of rows that contain a number but were not parsed.
# NOTE(review): presumably get_strengths also writes a "strength_3" column into df — confirm.
missed3 = get_strengths(df, df["strength3"], "3")

In [None]:
# Persist the current frame as CSV in the working directory.
# NOTE(review): the file name is spelled "stengths" and has no extension; it is kept
# as-is because other code may read this exact path — confirm before renaming.
df.to_csv(path_or_buf="stengths")

In [None]:
# Report, for each strength column, how many rows held a number that was not parsed.
print(
    "For each strength column we have ",
    missed1, " ", missed2, " ", missed3,
    " lines that contain a number but are not parsed.",
)

This means that almost 100% of the values have been parsed in each column:

In [None]:
# Percentage of rows parsed per column: 100 minus the share of rows counted as missed.
total_rows = len(df)
toPlot = {
    "strength1": 100 - 100 * missed1 / total_rows,
    "strength2": 100 - 100 * missed2 / total_rows,
    "strength3": 100 - 100 * missed3 / total_rows,
}

# Materialise the dict views as lists: some matplotlib releases reject raw
# dict_values / dict_keys objects under Python 3.
bar_positions = range(len(toPlot))
plt.bar(bar_positions, list(toPlot.values()))
plt.title("Percent of parsed rows")
plt.ylabel("Parsed rows (%)")
plt.xticks(bar_positions, list(toPlot.keys()))
plt.show()

In [None]:
# Widen the column display so long raw strength strings are not truncated.
pd.set_option("display.max_colwidth", 200)

# Show each raw column next to its "strength_N" counterpart for the first ten rows.
df.loc[
    :,
    [
        "strength1", "strength_1",
        "strength2", "strength_2",
        "strength3", "strength_3",
    ],
].head(10)

However, we also observe that for many battles the strengths are not given numerically:

In [None]:
print("Out of ", len(df), " battles")

# Zeros are converted to NaN so they are counted as missing below.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(0, np.nan)

# Count the missing values once per "strength_N" column and reuse the counts
# for both the printed report and the chart.
null_counts = {
    side: df["strength_" + side].isnull().sum() for side in ("1", "2", "3")
}
for side, null_count in null_counts.items():
    print("Strength" + side + " has ", null_count, "null values")

# Percentage of battles that do have a value in each "strength_N" column.
total_rows = len(df)
toPlot = {
    "strength" + side: 100 - 100 * null_count / total_rows
    for side, null_count in null_counts.items()
}

# Materialise the dict views as lists: some matplotlib releases reject raw
# dict_values / dict_keys objects under Python 3.
bar_positions = range(len(toPlot))
plt.bar(bar_positions, list(toPlot.values()))
plt.title("Percent of battles with numeric values for strength")
plt.ylabel("Battles with a numeric strength (%)")
plt.xticks(bar_positions, list(toPlot.keys()))
plt.show()

We also observe that almost 70% of the battles have numeric values for two combatants.

In [None]:
# Reload the raw battle records; the context manager closes the file handle once
# parsing is done, and the explicit encoding avoids depending on the platform default.
with open("../datasets/battle-fields-0.json", encoding="utf-8") as battles_file:
    battles = json.load(battles_file)

# Show one raw record followed by what get_features returns for it.
sample_battle = battles[2]
print(sample_battle)
print(get_features(sample_battle))