In [None]:
import json
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.feature_extraction.text import CountVectorizer
# Apply seaborn's "whitegrid" style to the figures drawn in this notebook.
sns.set(style="whitegrid")
%matplotlib inline

In [None]:
def densplot(columns, xlabel, title, axo):
    """Overlay one seaborn distribution plot per entry of ``columns`` on ``axo``.

    Parameters
    ----------
    columns : iterable
        Data series to plot; each one is passed to ``sns.distplot``.
    xlabel : str
        Label for the x axis.
    title : str
        Title of the axes.
    axo : axes object
        Target axes; every series is drawn on this same axes.
    """
    for position, series in enumerate(columns):
        # The KDE curve is labelled with the series' position in `columns`.
        kde_options = {"label": position}
        sns.distplot(series, ax=axo, kde_kws=kde_options)
    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    
def scatplot(xelem, yelem, xlabel, ylabel, title, axo, polyfit=None, xlim=None, ylim=None):
    """Draw a scatter plot on ``axo``, optionally overlaying a polynomial fit.

    Parameters
    ----------
    xelem, yelem : array-like
        x and y coordinates of the points, passed to ``axo.scatter``.
    xlabel, ylabel, title : str
        Axis labels and axes title.
    axo : axes object
        Target axes.
    polyfit : int or None, optional
        Degree of the least-squares polynomial to overlay. ``None`` (or
        ``False``) draws no fit; ``0`` draws the constant best-fit line.
    xlim, ylim : number or None, optional
        Upper bound of the respective axis; the axis then spans ``0..limit``.
        A falsy value leaves the axis limits untouched.
    """
    axo.scatter(xelem, yelem)
    # Test against None/False explicitly: a plain truthiness check would
    # silently ignore a legitimate degree-0 fit.
    if polyfit is not None and polyfit is not False:
        fit_x = np.unique(xelem)  # sorted, de-duplicated x values for a clean line
        fitted_poly = np.poly1d(np.polyfit(xelem, yelem, polyfit))
        axo.plot(fit_x, fitted_poly(fit_x), 'C2')
    # A limit of 0 would give an empty 0..0 range, so falsy limits are skipped.
    if xlim:
        axo.set_xlim(0, xlim)
    if ylim:
        axo.set_ylim(0, ylim)
    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    axo.set_ylabel(ylabel, fontsize=12)

In [None]:
# The dataset is stored as JSON Lines: one JSON-encoded battle record per line.
# The context manager closes the file handle once the records are parsed, and
# blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open("../datasets/battle-features-0.json") as battles_file:
    battles = pd.DataFrame([json.loads(line) for line in battles_file if line.strip()])
print(battles.columns)
# Columns used as model inputs.
features = ["casualties_1", "casualties_2",'strength_1', 'strength_2']
# Columns holding the recorded outcome for each combatant.
results_features = ['result_combatant_1', 'result_combatant_2'] 
battles[features].head()

We create one column `win`, which will be 1 if combatant_1 won (i.e. its result contains "icto", as in "Victory") and 0 otherwise.

In [None]:
# Keep only the battles for which at least one combatant has a recorded result.
has_result = (battles['result_combatant_1'] != '') | (battles['result_combatant_2'] != '')
# .copy() makes this an independent frame, so adding the 'win' column below
# does not write into a slice of `battles` (SettingWithCopyWarning).
battlesWithResult = battles.loc[has_result].copy()
battlesWithResult["win"] = 0
# Guarded so that re-running this cell does not append 'win' a second time.
if 'win' not in results_features:
    results_features.append('win')
y = ['win']
# 'icto' matches the result text regardless of the case of the leading letter.
# na=False treats a missing result as "no match"; without it the mask would
# contain NaN and could not be used for boolean indexing.
combatant_1_won = battlesWithResult["result_combatant_1"].str.contains('icto', na=False)
battlesWithResult.loc[combatant_1_won, 'win'] = 1
battlesWithResult.head()[results_features]


In [None]:
# One-hot encode the feature columns (only has an effect on categorical columns).
X = pd.get_dummies(data=battlesWithResult.loc[:, features])
X.head(n=5)

In [None]:
# Separate the training set from the rest of the data.
# For inputs (features, target), train_test_split returns
# (features_train, features_test, target_train, target_test), so the feature
# matrices ("vectors") have to be unpacked before the 'win' targets ("labels").
# random_state=None makes the split use np.random, so the training set is
# picked randomly and differs from run to run.
# NOTE(review): the one-hot encoded frame `X` built above is not used here;
# the raw feature columns are passed instead -- confirm this is intended.
(vectors_training,
 vectors_tmp,
 labels_training,
 labels_tmp) = train_test_split(battlesWithResult[features], battlesWithResult['win'], test_size=0.2, random_state=None)

# Split the held-out 20% again, with its labels, into the testing and
# validation sets. test_size=0.1 puts 10% of the held-out rows into the
# validation set and the remaining 90% into the testing set.
# NOTE(review): the original comment described this as splitting "in half";
# test_size=0.5 would do that -- confirm which is intended.
(vectors_testing,
 vectors_validation,
 labels_testing,
 labels_validation) = train_test_split(vectors_tmp, labels_tmp, test_size=0.1, random_state=None)

In [None]:
# Random forest with scikit-learn's default hyper-parameters.
# NOTE(review): `clf` is not referenced by the cells visible below (grid_search
# builds its own classifiers) -- confirm whether it is still needed.
clf = RandomForestClassifier()

In [None]:
def grid_search(n_estimators_list, max_depth_list):
    """Exhaustively search random-forest hyper-parameters by validation accuracy.

    For every (max_depth, n_estimators) combination a RandomForestClassifier
    is fitted on the notebook-level ``vectors_training`` / ``labels_training``
    and scored with accuracy on ``vectors_validation`` / ``labels_validation``.

    Parameters
    ----------
    n_estimators_list : list
        Candidate numbers of trees.
    max_depth_list : list
        Candidate maximum tree depths.

    Returns
    -------
    tuple
        ``(best_accuracy, best_max_depth, best_n_estimators)``. On ties the
        first combination encountered wins; ``(0, 0, 0)`` is returned when no
        combination scores above 0.
    """
    best_accuracy, best_depth, best_n_trees = 0, 0, 0
    for candidate_depth in max_depth_list:
        for candidate_trees in n_estimators_list:
            forest = RandomForestClassifier(
                max_depth=candidate_depth,
                n_estimators=candidate_trees,
                n_jobs=-1,
                random_state=None,
            )
            forest.fit(vectors_training, labels_training)
            predicted = forest.predict(vectors_validation)
            accuracy = metrics.accuracy_score(labels_validation, predicted)
            # Strict comparison: an equal score never replaces the current best.
            if accuracy > best_accuracy:
                best_accuracy, best_depth, best_n_trees = accuracy, candidate_depth, candidate_trees
    return (best_accuracy, best_depth, best_n_trees)

In [None]:
# Evaluate all 8 x 4 = 32 combinations of n_estimators and max_depth; the cell
# output is the tuple (best accuracy, best max_depth, best n_estimators).
grid_search([50,100,200,500,1000,1500,2000,2500], [1,10,20,30])

**We observe that the number of casualties seems to be more important for the outcome of the battle than the number of soldiers, since the opponent with more soldiers wins in only 53% of the cases.**