In [4]:
%run pair_classifier.py

In [5]:
import itertools
import os
import ntpath

def get_pair_names(path):
    """Return every ordered pair of distinct entries found in ``path``.

    Args:
        path: Directory whose entries are paired up.

    Returns:
        A list of ``(entry_a, entry_b)`` tuples of paths joined onto ``path``.
        Both orderings of each pair are present (permutations, not
        combinations), and an entry is never paired with itself.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the pair
    # order (and anything sampled from it) stable across runs and machines.
    files = [os.path.join(path, entry) for entry in sorted(os.listdir(path))]
    return list(itertools.permutations(files, 2))

def get_pairs(pair_names, splits):
    """Wrap each file-name pair in a ``Pair`` constructed with ``splits``.

    ``Pair`` is not defined in the visible cells; presumably it is provided by
    ``%run pair_classifier.py`` -- confirm before running on a fresh kernel.

    Returns:
        A list with one ``Pair`` per element of ``pair_names``, in order.
    """
    built = []
    for names in pair_names:
        built.append(Pair(names, splits))
    return built

def get_playerid(name, id_length=17):
    """Extract the player id prefix from a data file's name.

    Args:
        name: File name or path. ``ntpath.basename`` is used, so both ``/``
            and backslash are treated as directory separators.
        id_length: Number of leading characters of the base name that form
            the id. Defaults to 17, the width that used to be hard-coded here.

    Returns:
        The first ``id_length`` characters of the base name (the whole base
        name when it is shorter than that).
    """
    return ntpath.basename(name)[:id_length]

def is_same_player(id1, id2):
    """Return the label 1 when the two ids compare equal, otherwise 0."""
    if id1 == id2:
        return 1
    return 0

def get_ys(pairs):
    """Label each file pair: 1 if both files yield the same player id, else 0.

    Args:
        pairs: Iterable of ``(file0, file1)`` name pairs.

    Returns:
        A list of 0/1 labels, one per pair, in the same order as ``pairs``.
    """
    labels = []
    for first, second in pairs:
        labels.append(is_same_player(get_playerid(first), get_playerid(second)))
    return labels


In [6]:
import random

def sample_filter(sample, prob):
    """Decide whether an ``(example, label)`` pair should be discarded.

    Args:
        sample: Pair whose second element is the label.
        prob: Threshold compared against a fresh ``random.random()`` draw.

    Returns:
        True (discard) only for a label-0 pair whose draw is not below
        ``prob``. Pairs with any other label are never discarded and do not
        consume a random draw.
    """
    label = sample[1]
    if label != 0:
        return False
    return not (random.random() < prob)

def sample(X, y, seed=None):
    """Undersample label-0 examples so the two classes end up roughly balanced.

    Every example whose label is not 0 is kept. Each label-0 example is kept
    with probability ``y.count(1) / y.count(0)``, so the expected number of
    negatives kept equals the number of positives.

    Args:
        X: Indexable sequence of examples.
        y: List of labels aligned with ``X``.
        seed: Optional seed for a private random generator, which makes the
            draw reproducible. When ``None`` the module-level ``random`` state
            is used, exactly as before.

    Returns:
        ``[X_kept, y_kept]``: two lists with the retained examples and their
        labels in original order. Both are empty lists when nothing is kept,
        so ``a, b = sample(...)`` always unpacks.
    """
    negatives = y.count(0)
    # With no negatives there is nothing to thin out; guarding here also avoids
    # the ZeroDivisionError the ratio would otherwise raise (e.g. empty input).
    keep_prob = float(y.count(1)) / negatives if negatives else 1.0
    rng = random if seed is None else random.Random(seed)

    kept_X, kept_y = [], []
    for i in range(len(X)):
        # Same rule as sample_filter: only negatives consume a random draw,
        # and a negative survives when its draw falls below keep_prob.
        if y[i] != 0 or rng.random() < keep_prob:
            kept_X.append(X[i])
            kept_y.append(y[i])
    return [kept_X, kept_y]

def count_sample(x):
    """Summarise how many 0 and 1 labels ``x`` contains as a sentence."""
    template = "{} negative samples and {} positive samples"
    return template.format(x.count(0), x.count(1))

In [7]:
from sklearn.model_selection import StratifiedKFold

def ml(X, y, pc, cv, split_num):
    """Train and then test ``pc`` on every stratified fold of ``(X, y)``.

    Args:
        X: Indexable sequence of examples.
        y: Indexable sequence of labels aligned with ``X``.
        pc: Object exposing ``train(X, y, split_num)`` and
            ``test(X, y, split_num)``.
        cv: Number of folds handed to ``StratifiedKFold``.
        split_num: Passed through unchanged to ``pc.train`` and ``pc.test``.
    """
    def pick(seq, indices):
        # Materialise the fold as a plain list of the selected items.
        return [seq[i] for i in indices]

    for train_idx, test_idx in StratifiedKFold(cv).split(X, y):
        X_train, X_test = pick(X, train_idx), pick(X, test_idx)
        y_train, y_test = pick(y, train_idx), pick(y, test_idx)

        pc.train(X_train, y_train, split_num)
        pc.test(X_test, y_test, split_num)
        
def ml_split(X, y, model, cv, splits):
    """Cross-validate ``model`` once per split index, then once for "all".

    For each ``i`` in ``range(splits)`` a ``split <i>`` header is printed and
    ``ml`` is run with ``split_num=i``; finally an ``all`` header is printed
    and ``ml`` is run with ``split_num=-1``.
    """
    schedule = [("split {}".format(i), i) for i in range(splits)]
    schedule.append(("all", -1))

    for header, split_num in schedule:
        print(header)
        ml(X, y, model, cv, split_num)

In [8]:
import sys

# Input directory for the pairing step. The absolute scratch path is kept as
# the default, but it can be overridden through the MOUSEACTION_DIR environment
# variable so the notebook also runs on machines without that mount.
# (sys.argv[1] is a leftover from a script version; it is not a data path
# inside a notebook kernel.)
#path = sys.argv[1]
path = os.environ.get("MOUSEACTION_DIR",
                      "/cs/scratch/sy35/dota-data/20/data/mouseaction")
# All ordered pairs of entries in the directory, and their same-player labels.
pairs = get_pair_names(path)
ys = get_ys(pairs)

In [9]:
# Balance the classes by randomly dropping negative pairs. No seed is passed
# here, so the selected pairs (and results computed from them) can differ
# between runs.
pair_names, y = sample(pairs, ys)

In [10]:
# Show how many negative and positive labels survived the sampling step.
count_sample(y)

'311 negative samples and 320 positive samples'

In [None]:
# Build Pair objects for the sampled file pairs with splits=1.
pairs1 = get_pairs(pair_names, 1)

In [8]:
# Build Pair objects for the sampled file pairs with splits=2
# (the only variant the evaluation cells below actually use).
pairs2 = get_pairs(pair_names, 2)

In [None]:
# Build Pair objects for the sampled file pairs with splits=3.
pairs3 = get_pairs(pair_names, 3)

In [None]:
# Build Pair objects for the sampled file pairs with splits=5.
pairs5 = get_pairs(pair_names, 5)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

def _build_pair_classifier(make_estimator):
    """Wrap three fresh, identically configured estimators in a PairClassifier.

    ``make_estimator`` is called three times so each slot gets its own
    estimator instance. The trailing ``(3,)`` argument is forwarded as-is; its
    meaning is defined by PairClassifier, which is not visible here.
    """
    return PairClassifier(make_estimator(),
                          make_estimator(),
                          make_estimator(),
                          (3,))


# Logistic regression, class-weight balanced.
lr = _build_pair_classifier(
    lambda: LogisticRegression(class_weight="balanced"))

# Random forest, class-weight balanced.
rf = _build_pair_classifier(
    lambda: RandomForestClassifier(class_weight="balanced"))

# Multi-layer perceptron with hidden layers of 256 and 64 units.
clf = _build_pair_classifier(
    lambda: MLPClassifier(solver="lbfgs", alpha=0.001,
                          hidden_layer_sizes=(256, 64)))


In [12]:
# Evaluate the logistic-regression PairClassifier on pairs2: 5-fold CV for
# split 0, split 1, and finally "all" (split_num=-1).
print("lr pairs2")
ml_split(pairs2, y, lr, 5, 2)

lr pairs2
split 0
Accuracy: 0.463768115942, Precision: 0.457142857143, Recall: 0.470588235294
Accuracy: 0.449275362319, Precision: 0.423076923077, Recall: 0.323529411765
Accuracy: 0.411764705882, Precision: 0.425, Recall: 0.5
Accuracy: 0.514705882353, Precision: 0.516129032258, Recall: 0.470588235294
Accuracy: 0.514705882353, Precision: 0.513513513514, Recall: 0.558823529412
split 1
Accuracy: 0.507246376812, Precision: 0.5, Recall: 0.5
Accuracy: 0.449275362319, Precision: 0.4375, Recall: 0.411764705882
Accuracy: 0.338235294118, Precision: 0.387755102041, Recall: 0.558823529412
Accuracy: 0.529411764706, Precision: 0.545454545455, Recall: 0.352941176471
Accuracy: 0.411764705882, Precision: 0.4, Recall: 0.352941176471
all
Accuracy: 0.463768115942, Precision: 0.457142857143, Recall: 0.470588235294
Accuracy: 0.449275362319, Precision: 0.423076923077, Recall: 0.323529411765
Accuracy: 0.411764705882, Precision: 0.425, Recall: 0.5
Accuracy: 0.514705882353, Precision: 0.516129032258, Recall: 0.

In [11]:
# Evaluate the random-forest PairClassifier on pairs2: 5-fold CV for
# split 0, split 1, and finally "all" (split_num=-1).
print("rf pairs2")
ml_split(pairs2, y, rf, 5, 2)

rf pairs2
split 0
Accuracy: 0.608695652174, Precision: 0.62962962963, Recall: 0.5
Accuracy: 0.608695652174, Precision: 0.606060606061, Recall: 0.588235294118
Accuracy: 0.647058823529, Precision: 0.631578947368, Recall: 0.705882352941
Accuracy: 0.720588235294, Precision: 0.674418604651, Recall: 0.852941176471
Accuracy: 0.661764705882, Precision: 0.617021276596, Recall: 0.852941176471
split 1
Accuracy: 0.594202898551, Precision: 0.588235294118, Recall: 0.588235294118
Accuracy: 0.623188405797, Precision: 0.586956521739, Recall: 0.794117647059
Accuracy: 0.661764705882, Precision: 0.648648648649, Recall: 0.705882352941
Accuracy: 0.735294117647, Precision: 0.681818181818, Recall: 0.882352941176
Accuracy: 0.632352941176, Precision: 0.595744680851, Recall: 0.823529411765
all
Accuracy: 0.594202898551, Precision: 0.583333333333, Recall: 0.617647058824
Accuracy: 0.579710144928, Precision: 0.560975609756, Recall: 0.676470588235
Accuracy: 0.676470588235, Precision: 0.714285714286, Recall: 0.5882352

In [1]:
# Evaluate the MLP PairClassifier on pairs2 with the same 5-fold / 2-split
# protocol. This cell needs ml_split, pairs2, y and clf from the cells above;
# the recorded run was executed first on a fresh kernel and raised NameError.
print("mlp pairs2")
ml_split(pairs2, y, clf, 5, 2)

mlp pairs2


NameError: name 'ml_split' is not defined

In [None]:
#print("rf pairs5")
#ml_split(pairs5, y, rf, 5, 5)

In [None]:
#print("lr pairs10")
#ml_split(pairs10, y, lr, 5, 10)

In [None]:
#print("rf pairs10")
#ml_split(pairs10, y, rf, 5, 10)