In [None]:
import json
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.feature_extraction.text import CountVectorizer
# Apply seaborn's "whitegrid" style to the figures drawn below.
sns.set(style="whitegrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
def densplot(columns, xlabel, title, axo):
    """Overlay one distribution plot per entry of ``columns`` on a single axes.

    Parameters
    ----------
    columns : iterable
        Samples to plot; each entry is handed to ``sns.distplot`` on its own.
    xlabel : str
        Text of the x axis label (drawn at font size 12).
    title : str
        Title of the axes.
    axo : matplotlib axes
        Axes that every curve is drawn on.

    The KDE curve of each entry is labelled with the entry's position in
    ``columns``.
    """
    for position, sample in enumerate(columns):
        kde_options = dict(label=position)
        sns.distplot(sample, ax=axo, kde_kws=kde_options)
    axo.set_title(title)
    axo.set_xlabel(xlabel, fontsize=12)
    
def scatplot(xelem, yelem, xlabel, ylabel, title, axo, polyfit=None, xlim=None, ylim=None):
    """Scatter ``yelem`` against ``xelem`` on ``axo``, optionally with a polynomial fit.

    Parameters
    ----------
    xelem, yelem : array-like
        Coordinates of the points.
    xlabel, ylabel, title : str
        Axis labels (font size 12) and axes title.
    axo : matplotlib axes
        Axes to draw on.
    polyfit : int, optional
        Degree of a least-squares polynomial to overlay in colour 'C2'.
        Any falsy value (None, 0, False) skips the fit.
    xlim, ylim : number, optional
        Upper bound of the respective axis; the lower bound is fixed at 0.
        Any falsy value leaves the axis limits untouched.
    """
    axo.scatter(xelem, yelem)
    if polyfit:
        # Evaluate the fitted polynomial at the distinct, sorted x values.
        distinct_x = np.unique(xelem)
        fitted = np.poly1d(np.polyfit(xelem, yelem, polyfit))
        axo.plot(distinct_x, fitted(distinct_x), 'C2')
    if xlim:
        axo.set_xlim(0, xlim)
    if ylim:
        axo.set_ylim(0, ylim)
    axo.set_title(title)
    for set_label, text in ((axo.set_xlabel, xlabel), (axo.set_ylabel, ylabel)):
        set_label(text, fontsize=12)

In [None]:
# The dataset holds one JSON document per line; each line becomes one row.
# The `with` block closes the file handle once the rows are parsed, and blank
# lines (e.g. a trailing newline at the end of the file) are skipped instead of
# crashing json.loads.
with open("../datasets/battle-features-0.json") as battle_file:
    battles = pd.DataFrame([json.loads(line) for line in battle_file if line.strip()])
print(battles.columns)
# Numeric inputs of the models below.
features = ["casualties_1", "casualties_2", 'strength_1', 'strength_2']
# Raw outcome columns, one per combatant.
results_features = ['result_combatant_1', 'result_combatant_2']
battles[features].head()

We create a column `win` that is 1 if combatant_1's result mentions a victory (its text contains "icto", e.g. "Victory") and 0 otherwise.

In [None]:
# Keep the battles for which at least one combatant has a non-empty result.
has_result = (battles['result_combatant_1'] != '') | (battles['result_combatant_2'] != '')
# .copy() makes this an independent frame, so adding the `win` column below
# does not write into a slice of `battles` (SettingWithCopyWarning).
battlesWithResult = battles.loc[has_result].copy()
battlesWithResult["win"] = 0
# Guarded so that re-running the cell does not append 'win' a second time.
if 'win' not in results_features:
    results_features.append('win')
y = ['win']
# 'icto' matches "Victory" as well as "victory"; na=False treats a missing
# result as "no victory" instead of producing an NA mask that .loc rejects.
combatant_1_won = battlesWithResult["result_combatant_1"].str.contains('icto', na=False)
battlesWithResult.loc[combatant_1_won, 'win'] = 1
battlesWithResult.head()[results_features]


In [None]:
# Keep only the battles where all four counts are at least 1.
count_columns = ['casualties_1', 'casualties_2', 'strength_1', 'strength_2']
all_counts_positive = battlesWithResult[count_columns].ge(1).all(axis=1)
battlesWithResult = battlesWithResult.loc[all_counts_positive]
len(battlesWithResult)

In [None]:
# One-hot encoding only matters once categorical columns are added to `features`.
X = pd.get_dummies(battlesWithResult.loc[:, features])
X.head()

In [None]:
#We separate the training dataset from the rest; as a result we obtain the TF-IDF vectors belonging
#to the training set as well as their corresponding labels, namely the newsgroups targets.
#We choose to use random_state=None as it means it will use np.random, thus the training set is picked randomly.
#labels_training,\
#labels_tmp,\
#vectors_training,\
#vectors_tmp= train_test_split(battlesWithResult[features], battlesWithResult['win'], test_size=0.2, random_state=None)

#We now separate the tmp set with its corresponding labels randomly in half to obtain the testing and validation sets
#labels_testing,\
#labels_validation,\
#vectors_testing,\
#vectors_validation = train_test_split(labels_tmp, vectors_tmp, test_size=0.1, random_state=None)

In [None]:
# A fixed seed makes both splits (and every score computed on them) identical on
# each Restart & Run All; change the value to draw a different partition.
SPLIT_SEED = 0
# 80% of the battles are used for fitting, 20% are held out.
X_train, X_test, y_train, y_test = train_test_split(
    battlesWithResult[features], battlesWithResult['win'],
    test_size=0.2, random_state=SPLIT_SEED)
# The held-out part is split once more: 90% of it (X_train_v / y_train_v) is the
# validation set scored by grid_search, the remaining 10% (X_test_v / y_test_v)
# is the final test set used for the confusion matrix.
X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(
    X_test, y_test, test_size=0.1, random_state=SPLIT_SEED)

In [None]:
def grid_search(n_estimators_list, max_depth_list, random_state=None):
    """Search random-forest sizes and depths, scored by validation accuracy.

    For every (max_depth, n_estimators) pair a RandomForestClassifier is fitted
    on the notebook-level ``X_train`` / ``y_train`` and its accuracy is measured
    on ``X_train_v`` / ``y_train_v``.

    Parameters
    ----------
    n_estimators_list : iterable of int
        Candidate numbers of trees.
    max_depth_list : iterable of int
        Candidate maximum tree depths.
    random_state : int, RandomState instance or None, default None
        Forwarded to every RandomForestClassifier. ``None`` keeps the original
        unseeded behaviour; pass an int to make the search repeatable.

    Returns
    -------
    tuple
        ``(best_accuracy, best_max_depth, best_n_estimators)``. Ties are won by
        the first combination tried (depths vary slowest). ``(0, 0, 0)`` is
        returned when no combination scores above 0, which includes the case
        where either list is empty.
    """
    best = (0, 0, 0)
    # product() walks the grid in the same order as a depth-outer / size-inner
    # nested loop and also copes with one-shot iterables.
    for depth, n_estim in itertools.product(max_depth_list, n_estimators_list):
        forest = RandomForestClassifier(max_depth=depth, n_estimators=n_estim,
                                        n_jobs=-1, random_state=random_state)
        forest.fit(X_train, y_train)
        accuracy = metrics.accuracy_score(y_train_v, forest.predict(X_train_v))
        # Strict comparison: an equal score never replaces an earlier winner.
        if accuracy > best[0]:
            best = (accuracy, depth, n_estim)
    return best

In [None]:
# 20 forest sizes (1-10, then 100-1000 in steps of 100) x 199 depths (1-199)
# = 3980 fits.
candidate_sizes = list(range(1, 11)) + list(range(100, 1001, 100))
candidate_depths = list(range(1, 200))
grid_search(candidate_sizes, candidate_depths)

In [None]:
# Final model. random_state is fixed so that refitting on the same training data
# reproduces the same forest, and therefore the same predictions and feature
# importances in the cells below.
classifier = RandomForestClassifier(max_depth=100, n_estimators=100, n_jobs=-1, random_state=0)
classifier.fit(X_train, y_train)

In [None]:
# Predicted `win` label for every row of the final test split.
prediction = classifier.predict(X_test_v)

In [None]:
# Size of the test split, followed by the number of misclassified rows
# (labels are 0/1, so each wrong prediction contributes exactly 1).
print(len(y_test_v))
np.abs(prediction - y_test_v).sum()

In [None]:
# Helper adapted from the scikit-learn example gallery to plot a confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line banner and draw ``cm`` as an annotated heat map.

    cm        -- 2-D array of counts, indexed as cm[true, predicted].
    classes   -- tick labels, used for both axes.
    normalize -- when true, every row is divided by its sum and the cells
                 are printed with two decimals instead of as integers.
    title     -- title of the plot.
    cmap      -- colour map handed to ``plt.imshow``.

    The drawing goes to the current pyplot figure; nothing is returned.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
        cell_format = 'd'
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
        cell_format = '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells darker than half of the maximum get white text so it stays readable.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_colour = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Compute confusion matrix
# labels=[0, 1] pins the matrix to 2x2 (rows: true `win`, columns: predicted
# `win`) even when the small test split happens to contain a single class.
confusion = confusion_matrix(y_test_v, prediction, labels=[0, 1])
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=(20,10))
# The ticks are the two values of the target `win`, not the input feature names.
plot_confusion_matrix(confusion, classes=['win = 0', 'win = 1'],
                      title='Confusion matrix')
plt.show()

In [None]:
# feature_importances_ follows the column order the forest was fitted on, so the
# names are taken from X_train rather than from `features`: this cell rebinds
# `features` below, and indexing the already reordered value on a second run
# would attach the wrong name to each importance.
feature_names = np.array(X_train.columns)
sorted_index = np.argsort(classifier.feature_importances_)
# Indices of the (up to) 10 largest importances, largest first.
best10 = sorted_index[-1:-11:-1]
# The name `features` is kept because the bar-chart cell reads it.
features = feature_names[best10]
importances = classifier.feature_importances_[best10]

In [None]:
# Horizontal bar chart of the selected importances, one bar per feature name.
fig, ax = plt.subplots(figsize=(15, 10))
y_pos = np.arange(len(features))
ax.barh(y_pos, importances, color='blue', align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(features, size=15)
# Flip the axis so that the first (largest) importance is drawn at the top.
ax.invert_yaxis()
ax.set(xlabel='Feature Importances', title='Top 10 important features')
plt.show()

Next, we fit a logistic regression to see whether it performs better than the random forest.

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the same
# training split as the random forest.
logistic = linear_model.LogisticRegression()
propensity = logistic.fit(X_train, y_train)

# predict_proba returns one column per class; keep the second one.
class_probabilities = propensity.predict_proba(X_test)
prediction_logistic = class_probabilities[:, 1]

In [None]:
# Average of the second predict_proba column (presumably P(win = 1)) over X_test.
# NOTE(review): this is not an accuracy figure; a mean near 0.5 does not by
# itself show that the model is guessing. Compare against y_test to confirm.
prediction_logistic.mean()

Result: the logistic regression predictions are almost random.