In [102]:
import pandas as pd

def get_action_df(raw_df, action):
    """Return the rows of ``raw_df`` for one action type, minus ``actionType``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame that contains an ``actionType`` column.
    action : object
        Value compared against ``actionType`` (e.g. ``"ATTACK"``).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the matching rows, with the ``actionType``
        column removed. ``raw_df`` itself is not modified.
    """
    matching_rows = raw_df.loc[raw_df["actionType"] == action]
    # Name the axis through ``columns=``: the positional ``axis`` argument of
    # ``DataFrame.drop`` was deprecated in pandas 1.3 and removed in pandas 2.0,
    # where ``drop("actionType", 1)`` raises a TypeError.
    return matching_rows.drop(columns="actionType")


def is_filter_player(steam_id, filter_id):
    """Return 1 when ``steam_id`` equals ``filter_id``, otherwise 0."""
    return 1 if steam_id == filter_id else 0
    
def get_dfs(csvpath, dropna=True):
    """Read an action CSV and split it into attack, move and cast frames.

    Parameters
    ----------
    csvpath : str
        Path handed to ``pandas.read_csv``.
    dropna : bool, default True
        When truthy, rows containing any missing value are discarded before
        the split.

    Returns
    -------
    tuple
        ``(attack_df, move_df, cast_df)``, each produced by ``get_action_df``
        for the action types ``"ATTACK"``, ``"MOVE"`` and ``"CAST"``.
    """
    frame = pd.read_csv(csvpath)
    frame = frame.dropna() if dropna else frame

    return tuple(
        get_action_df(frame, action_type)
        for action_type in ("ATTACK", "MOVE", "CAST")
    )

def containsPlayer(game, filter_id):
    """Return 1 if ``filter_id`` occurs in ``game.csv_file``, otherwise 0.

    The check is a plain ``in`` test against the ``csv_file`` attribute of
    ``game`` (a substring test when both are strings).
    """
    return int(filter_id in game.csv_file)


In [None]:
class Game:
    """One game's action data, split into attack, move and cast frames."""

    def __init__(self, csv_file):
        """Load ``csv_file`` through ``get_dfs`` and keep the three frames.

        Parameters
        ----------
        csv_file : str
            Path of the CSV to load; also stored on the instance.
        """
        self.attack_df, self.move_df, self.cast_df = get_dfs(csv_file)

        # Kept so that code such as containsPlayer can inspect the source path.
        self.csv_file = csv_file

In [108]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

class GameClassifier:
    """Predict, per game, whether the player ``filter_id`` is the one acting.

    Three models are trained independently, one per action type (attack,
    move, cast). Training rows are labelled 1 when their ``steamid`` equals
    ``filter_id`` and 0 otherwise. A game is predicted positive when the sum
    of the three per-action scores (see ``get_percents``) exceeds a threshold.
    """

    def __init__(self, filter_id, attack_model, move_model, cast_model):
        """Store the target player id and one estimator per action type.

        Parameters
        ----------
        filter_id : object
            Value compared against the ``steamid`` column to build labels.
        attack_model, move_model, cast_model : estimator
            Objects exposing ``fit(X, y)``, ``predict(X)`` and
            ``predict_proba(X)``.
        """
        self.filter_id = filter_id
        self.attack_model = attack_model
        self.move_model = move_model
        self.cast_model = cast_model


    def cross_validate(self, games, ys, splits=3):
        """Run K-fold cross-validation over ``games`` and print each fold's result.

        Parameters
        ----------
        games : sequence
            Indexable collection of game objects exposing ``attack_df``,
            ``move_df`` and ``cast_df``.
        ys : sequence
            Expected label for each game, aligned with ``games``.
        splits : int, default 3
            Number of folds passed to ``KFold``.

        Returns
        -------
        None
            Predictions, expected labels and the accuracy of every fold are
            printed rather than returned.
        """
        kf = KFold(n_splits=splits)
        for train, test in kf.split(games):
            training_games = [games[i] for i in train]
            testing_games = [games[i] for i in test]
            y_test = [ys[i] for i in test]

            # Fit each of the three models on this fold's training games.
            for model, train_df in self.concat_data(training_games):
                self.fit(model, train_df)

            predictions = self.predict(testing_games)
            print("Predictions: {}".format(predictions))
            print("Actual:      {}".format(y_test))
            print("KFold score: {}\n".format(accuracy_score(y_test, predictions)))


    def concat_data(self, training_games):
        """Pair every model with the concatenated training frame of its action type.

        Returns
        -------
        tuple
            ``((attack_model, attack_df), (move_model, move_df),
            (cast_model, cast_df))``.
        """
        attack_df, move_df, cast_df = self.concat_games(training_games)

        return ((self.attack_model, attack_df),
                (self.move_model, move_df),
                (self.cast_model, cast_df))


    def concat_games(self, games):
        """Concatenate the per-action frames of ``games``.

        Returns
        -------
        tuple
            ``(attack_df, move_df, cast_df)``, each the ``pandas.concat`` of
            the corresponding attribute of every game. ``pandas.concat``
            raises ``ValueError`` when ``games`` is empty.
        """
        attack_df = pd.concat([game.attack_df for game in games])
        move_df = pd.concat([game.move_df for game in games])
        cast_df = pd.concat([game.cast_df for game in games])

        return attack_df, move_df, cast_df


    def get_y(self, df):
        """Return the 0/1 label Series for ``df``: 1 where ``steamid`` equals ``filter_id``."""
        return df["steamid"].map(lambda steamid: is_filter_player(steamid, self.filter_id))


    def fit(self, model, df):
        """Fit ``model`` on ``df``, using every column except ``steamid`` as features."""
        y = self.get_y(df)
        # ``columns=`` instead of a positional axis: ``drop("steamid", 1)``
        # raises a TypeError on pandas >= 2.0.
        X = df.drop(columns="steamid")
        model.fit(X, y)


    def get_percents(self, model, df):
        """Return the mean of ``model.predict`` over the rows of ``df``.

        With the 0/1 labels produced by ``get_y`` this is the fraction of rows
        predicted as the target player. An empty ``df`` yields ``0.0`` (no
        rows, hence no evidence for the player) instead of dividing by zero.
        """
        if len(df.index) == 0:
            return 0.0
        predictions = model.predict(df.drop(columns="steamid"))
        return sum(predictions) / len(predictions)


    def get_probas(self, model, df):
        """Return the row-wise average of ``model.predict_proba`` over ``df``.

        ``df`` must contain at least one row; ``predict`` skips empty frames
        before calling this method.
        """
        probabilities = model.predict_proba(df.drop(columns="steamid"))
        return sum(probabilities) / len(probabilities)


    def predict(self, games, threshold=1.5):
        """Predict a 0/1 label for every game in ``games``.

        For each game the three per-action scores from ``get_percents`` are
        summed and the game is labelled 1 when that sum is strictly greater
        than ``threshold``. Two diagnostic lines are printed per game: the
        row-count-weighted average of the per-action mean probabilities, and
        the three raw scores.

        Parameters
        ----------
        games : iterable
            Game objects exposing ``attack_df``, ``move_df`` and ``cast_df``.
        threshold : float, default 1.5
            Cut-off applied to the summed scores.

        Returns
        -------
        list of int
            One prediction (0 or 1) per game, in input order.
        """
        predictions = []
        for game in games:
            model_frames = (
                (self.attack_model, game.attack_df),
                (self.move_model, game.move_df),
                (self.cast_model, game.cast_df),
            )
            total_len = sum(len(df.index) for _, df in model_frames)

            scores = []
            weighted_proba = 0
            for model, df in model_frames:
                scores.append(self.get_percents(model, df))

                # An empty frame has weight zero; skipping it also avoids
                # dividing by zero when the game has no rows at all.
                n_rows = len(df.index)
                if n_rows:
                    weight = n_rows / total_len
                    weighted_proba = weighted_proba + self.get_probas(model, df) * weight

            print(weighted_proba)
            print("{}, {}, {}".format(*scores))
            predictions.append(1 if sum(scores) > threshold else 0)

        return predictions

In [111]:
import os

# NOTE(review): absolute, machine-specific data directory; adjust when running elsewhere.
path = "/cs/scratch/sy35/dota-data/14/mouseaction"

# os.listdir returns entries in arbitrary order. Sorting makes the order of
# ``games`` -- and therefore the unshuffled KFold folds built from it -- the
# same on every run.
games = [Game(os.path.join(path, filename)) for filename in sorted(os.listdir(path))]

In [110]:
from sklearn.linear_model import LogisticRegression

# Player to detect. Defined once so the numeric id used for the ``steamid``
# labels and the text searched for in the CSV file names cannot drift apart.
FILTER_ID = 76561198119286646

classifier = GameClassifier(
    FILTER_ID,
    LogisticRegression(),  # attack model
    LogisticRegression(),  # move model
    LogisticRegression(),  # cast model
)

# A game counts as positive when the player's id appears in its CSV path.
ys = [containsPlayer(game, str(FILTER_ID)) for game in games]
classifier.cross_validate(games, ys, splits=5)

[0.31241027 0.68758973]
0.6301775147928994, 0.8055424528301887, 0.6935483870967742
[0.48226316 0.51773684]
0.412483039348711, 0.5449821151380172, 0.4566929133858268
[0.69986007 0.30013993]
0.021660649819494584, 0.17643142476697737, 0.031746031746031744
[0.76918362 0.23081638]
0.02099737532808399, 0.11430260047281324, 0.08426966292134831
[0.75152964 0.24847036]
0.02615278733654508, 0.13296930342384888, 0.12544802867383512
Predictions: [1, 0, 0, 0, 0]
Actual:      [1, 1, 0, 0, 0]
KFold score: 0.8

[0.81090975 0.18909025]
0.02631578947368421, 0.0706953642384106, 0.024
[0.59594864 0.40405136]
0.1787709497206704, 0.3961570394970941, 0.18181818181818182
[0.72761876 0.27238124]
0.11226611226611227, 0.20244307400379508, 0.11827956989247312
[0.48079354 0.51920646]
0.43623188405797103, 0.5500560747663551, 0.5918367346938775
Predictions: [0, 0, 0, 1]
Actual:      [0, 1, 0, 1]
KFold score: 0.75

[0.7606331 0.2393669]
0.04827586206896552, 0.11868214152002995, 0.051094890510948905
[0.43075887 0.5692