In [1]:
import pandas as pd

In [2]:
def get_action_df(raw_df, action):
    return raw_df.loc[raw_df["actionType"] == action].drop("actionType", 1)

def get_action_dfs(csvpath, dropna=True):
    raw_df = pd.read_csv(csvpath)
    if dropna:
        raw_df = raw_df.dropna()
    
    attack_df = get_action_df(raw_df, "ATTACK")
    move_df = get_action_df(raw_df, "MOVE")
    cast_df = get_action_df(raw_df, "CAST")

    return [attack_df, move_df, cast_df]

In [3]:
import math

def get_df(file0, file1, splits):
    dfs0 = [get_stats(df, 0, splits) for df in get_action_dfs(file0)]
    dfs1 = [get_stats(df, 1, splits) for df in get_action_dfs(file1)]
    
    dfs = []
    for i in range(len(dfs0)):
        df0 = dfs0[i]
        df1 = dfs1[i]
        
        df0["tmp"] = 1
        df1["tmp"] = 1

        dfs.append(pd.merge(df0, df1, how="inner").drop("tmp", 1))
    
    return dfs
    
    
def get_stats(raw_df, fid, splits):
    i = 0
    data = []
    headers = []
    for df in split_df(raw_df, splits):
        stats = df.describe().drop("count", 0).drop("steamid", 1).fillna(0)
        headers.extend(get_headers(stats, fid, i))
        data.extend(get_data(stats))
        i += 1
       
    df = pd.DataFrame(data=[data],columns=headers)
    
    return df


def split_df(df, splits):
    rows = int(math.ceil(len(df.index)/float(splits)))
    dfs = []
    
    while len(df) > rows:
        tmp = df[:rows]
        dfs.append(tmp)
        df = df[rows:]
    else:
        dfs.append(df)

    return dfs    


def get_headers(stats, fid, i):
    return [
        "{}-{}-{}-{}".format(stats.columns[col],stats.index[row], fid, i) 
        for row in range(len(stats.index)) 
        for col in range(len(stats.columns.values))
    ]

def get_data(stats):
    return [
        stats.iloc[row,col]
        for row in range(len(stats.index))
        for col in range(len(stats.columns.values))
    ]

In [4]:
import itertools
import os
import ntpath

def get_pairs(path):
    files = ["{}/{}".format(path, file) for file in os.listdir(path)]
    return list(itertools.permutations(files, 2))

def get_playerid(name):
    return ntpath.basename(name)[:17]

def is_same_player(id1, id2):
    return 1 if id1 == id2 else 0

def get_ys(pairs):
    return [
        is_same_player(get_playerid(file0), get_playerid(file1))
        for file0,file1 in pairs
    ]

def get_pair_dfs(pairs, splits):
    attacks, moves, casts = [],[],[]
    for file0, file1 in pairs:
        dfs = get_df(file0, file1, splits)
        
        attacks.append(dfs[0])
        moves.append(dfs[1])
        casts.append(dfs[2])
        
    return pd.concat(attacks), pd.concat(moves), pd.concat(casts)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def train(df, ys):
    lr = LogisticRegression(class_weight="balanced")
    lr.fit(df, ys)

    rf = RandomForestClassifier(class_weight="balanced")
    rf.fit(df, ys)
    
    nn = MLPClassifier(solver="lbfgs", alpha=0.01)
    nn.fit(df, ys)
    
    return lr, rf, nn

def test(model, df, y):
    predictions = model.predict(df)
    
    accuracy = accuracy_score(y, predictions)
    precision = precision_score(y, predictions)
    recall = recall_score(y, predictions)
    
    print_scores(type(model).__name__, accuracy, precision, recall)
    
    return accuracy, precision, recall

def print_scores(name, accuracy, precision, recall):
    print("{} - Accuracy: {}, Precision: {}, Recall: {}".format(name, accuracy, precision, recall)) 
    


In [6]:
import random

def sample_filter(sample, prob):
    if sample[1] == 0:
        if random.random() < prob:
            return False
        else:
            return True
    else:
        return False

def sample(X, y, prob):
    combine = [(X[i], y[i]) for i in range(len(X))]
    sample = [s for s in combine if not sample_filter(s, prob)]
    
    return map(list, zip(*sample))

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def count_sample(x):
    negatives = x.count(0)
    positives = x.count(1)
    return "{} negative samples and {} positive samples".format(negatives, positives)


def standardise(X_train, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    return scaler.transform(X_train), scaler.transform(X_test)
    

def ml(path, splits, sample_ratio):
    scaler = StandardScaler()
    pairs = get_pairs(path)
    ys = get_ys(pairs)

    X_train, X_test, y_train, y_test = train_test_split(pairs, ys, test_size=0.2, stratify=ys)
    X_train, y_train = sample(X_train, y_train, sample_ratio)
    X_test, y_test = sample(X_test, y_test, sample_ratio)
    
    print("Training - {}".format(count_sample(y_train)))
    print("Testing - {}".format(count_sample(y_test)))

    for train_df, test_df, action in zip(get_pair_dfs(X_train, splits), get_pair_dfs(X_test, splits), ["ATTACK", "CAST", "MOVE"]):
        print(action)
        train_df, test_df = standardise(train_df, test_df)

        lr, rf, nn = train(train_df, y_train)
        test(lr, test_df, y_test)
        test(rf, test_df, y_test)
        test(nn, test_df, y_test)
    print("---")
    

In [61]:
ml("/cs/scratch/sy35/dota-data/tmp", 3, 0.5)

Training - 14 negative samples and 9 positive samples
Testing - 3 negative samples and 3 positive samples
ATTACK
LogisticRegression - Accuracy: 0.833333333333, Precision: 0.75, Recall: 1.0
RandomForestClassifier - Accuracy: 0.5, Precision: 0.0, Recall: 0.0
MLPClassifier - Accuracy: 1.0, Precision: 1.0, Recall: 1.0
CAST
LogisticRegression - Accuracy: 0.833333333333, Precision: 0.75, Recall: 1.0
RandomForestClassifier - Accuracy: 0.666666666667, Precision: 0.666666666667, Recall: 0.666666666667
MLPClassifier - Accuracy: 0.5, Precision: 0.5, Recall: 0.333333333333
MOVE
LogisticRegression - Accuracy: 0.833333333333, Precision: 0.75, Recall: 1.0
RandomForestClassifier - Accuracy: 0.333333333333, Precision: 0.0, Recall: 0.0
MLPClassifier - Accuracy: 0.833333333333, Precision: 0.75, Recall: 1.0
---


In [8]:
df = pd.read_csv("/cs/scratch/sy35/dota-data/14-mouseaction.csv").drop("actionType", 1).dropna()

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

preproc = Pipeline([
    ("feature_selection", PCA()),
    ("standardisation", StandardScaler())
])

clf = Pipeline([
    # 
    ("preprocessing", preproc),
    
    # Learning
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [67]:
preproc.fit_transform(df)

array([[ 1.32279605, -0.10393854,  3.15177178, ...,  1.05176267,
         0.26479506, -0.07871237],
       [ 1.32279605, -0.10393854,  3.15177178, ...,  1.05176267,
         0.26479506, -0.07871237],
       [ 1.32279605,  0.1444993 ,  3.45986189, ...,  0.96094588,
         0.5646299 , -0.18546485],
       ...,
       [-0.82552376, -0.5544113 ,  0.91358539, ...,  0.77335063,
         1.47696555, -0.75466931],
       [-0.82552376, -0.5544113 ,  0.91358539, ...,  0.77335063,
         1.47696555, -0.75466931],
       [-0.82552376, -0.59828206, -0.51220148, ...,  0.78132467,
         0.9869663 , -0.54915021]])

In [74]:
clf.fit(df, y)

Pipeline(memory=None,
     steps=[('preprocessing', Pipeline(memory=None,
     steps=[('feature_selection', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('standardisation', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('classifier', ...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [110]:
#ml("/cs/scratch/sy35/dota-data/15-1/data/mouseaction")

In [None]:
ml("/cs/scratch/sy35/dota-data/tmp", 3, 0.5)

Training - 369 negative samples and 203 positive samples
Testing - 83 negative samples and 51 positive samples


In [10]:
path = "/cs/scratch/sy35/dota-data/15-1/data/mouseaction"
pairs = get_pairs(path)
ys = get_ys(pairs)
splits = 3

In [11]:
X, y = sample(pairs, ys, 0.015)

In [12]:
count_sample(y)

'343 negative samples and 254 positive samples'

In [13]:
clf = Pipeline([
    # 
    ("preprocessing", preproc),
    
    # Learning
    ("classifier", LogisticRegression(class_weight="balanced"))
])

In [14]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(3)

for train_index, test_index in skf.split(X, y):
    X_train = [X[i] for i in train_index]
    X_test = [X[i] for i in test_index]
    y_train = [y[i] for i in train_index]
    y_test = [y[i] for i in test_index]
    
    for train_df, test_df, action in zip(get_pair_dfs(X_train, splits), get_pair_dfs(X_test, splits), ["ATTACK", "CAST", "MOVE"]):
        print(action)
        clf = clf.fit(train_df, y_train)
        test(clf, test_df, y_test)
        
        #lr, rf, nn = train(train_df, y_train)
        #test(lr, test_df, y_test)
        #test(rf, test_df, y_test)
        #test(nn, test_df, y_test)
    print("---")

ATTACK
Pipeline - Accuracy: 0.47, Precision: 0.407079646018, Recall: 0.541176470588
CAST
Pipeline - Accuracy: 0.53, Precision: 0.451612903226, Recall: 0.494117647059
MOVE
Pipeline - Accuracy: 0.51, Precision: 0.448818897638, Recall: 0.670588235294
---
ATTACK
Pipeline - Accuracy: 0.447236180905, Precision: 0.40157480315, Recall: 0.6
CAST
Pipeline - Accuracy: 0.48743718593, Precision: 0.406593406593, Recall: 0.435294117647
MOVE
Pipeline - Accuracy: 0.532663316583, Precision: 0.455555555556, Recall: 0.482352941176
---
ATTACK
Pipeline - Accuracy: 0.515151515152, Precision: 0.441176470588, Recall: 0.535714285714
CAST
Pipeline - Accuracy: 0.454545454545, Precision: 0.392857142857, Recall: 0.52380952381
MOVE
Pipeline - Accuracy: 0.530303030303, Precision: 0.449438202247, Recall: 0.47619047619
---


In [26]:
from sklearn.neural_network import MLPClassifier


def get_pair_dfs(self, pair, splits):
        attacks, moves, casts = [], [], []
        file0, file1 = pair
        
        dfs = get_df(file0, file1, splits)
        
        return dfs[0], dfs[1], dfs[2]

class Pair:
    def __init__(self, pair, splits):
        attack_df, move_df, cast_df = get_pair_dfs(pair, splits)
        
        self.attack_df = attack_df
        self.move_df = move_df
        self.cast_df = cast_df
        

class PairClassifier:
    
    def __init__(self, attack_model, move_model, cast_model, network_size):
        self.attack_model = attack_model
        self.move_model = move_model
        self.cast_model = cast_model
        
        self.network = MLPClassifier(solver="lbfgs", hidden_layer_sizes=network_size, random_state=42)
        
    def get_pairs_dfs(self, pairs, splits):
        attacks, moves, casts = [], [], []
        for pair in pairs:
            attacks.append(pair.attack_df)
            moves.append(pair.move_df)
            casts.append(pair.cast_df)
            
        return pd.concat(attacks), pd.concat(moves). pd.concat(casts)
        
    def train(self, pairs, y, splits):
        for model, train_df in self.get_pairs_dfs(pairs, splits):
            model.fit(train_df, y)
        self.fit_network(pairs, y)
        
    def fit_network(self, pairs, y):
        X = [self.get_all_probas(pair) for pair in pairs]
        self.network.fit(X, y)
        
    def get_proba(self, model, df):
        return model.predict_proba(df)
        
    def get_all_probas(self, pair):
        attack_proba = self.get_proba(self.attack_model, pair.attack_df)
        move_proba = self.get_proba(self.move_model, pair.move_df)
        cast_proba = self.get_proba(self.cast_model, pair.cast_df)
        
        return [attack_proba[1], move_proba[1], cast_proba[1]]
        
    def predict(self, pairs):
        probabilities = [self.get_all_probas(pair) for pair in pairs]
        return self.network.predict(probabilities)

In [27]:
path = "/cs/scratch/sy35/dota-data/tmp"
pairs = get_pairs(path)
ys = get_ys(pairs)
splits = 3

In [28]:
X, y = sample(pairs, ys, 0.5)

In [29]:
pc = PairClassifier(LogisticRegression(), LogisticRegression(), LogisticRegression(), (3,))

In [30]:
pc.train(X, y, 2)

AttributeError: 'tuple' object has no attribute 'attack_df'