In [314]:
import pandas as pd
from sklearn.ensemble import IsolationForest

In [301]:
def extract_one_type(data, target):
    """Keep only the rows of ``data`` whose "Тип" column equals ``target``.

    Args:
        data: DataFrame that has a "Тип" column.
        target: Value to match in that column (this notebook passes "DU").

    Returns:
        The matching rows of ``data``; their original index labels are kept.
    """
    is_target_type = data["Тип"] == target
    return data.loc[is_target_type]

In [302]:
"""Чтение исходных данных пользователей"""
# Read every user's raw touch log and keep only the rows whose type is "DU".
# All files share the same layout: ';'-separated, cp1251-encoded, no index column.
_CSV_OPTIONS = {"index_col": None, "sep": ";", "encoding": "cp1251"}

data_mine_train = extract_one_type(pd.read_csv('datasets/User_0_Train.csv', **_CSV_OPTIONS), "DU")
# This line used to call `extract_data`, which is not defined in the visible
# notebook cells; `extract_one_type` is the filter applied to every other file.
data_mine_test = extract_one_type(pd.read_csv('datasets/User_0_Test.csv', **_CSV_OPTIONS), "DU")
# Files User_1.csv .. User_20.csv, one filtered frame per file.
data_others_test = [
    extract_one_type(pd.read_csv('datasets/User_' + str(i) + '.csv', **_CSV_OPTIONS), "DU")
    for i in range(1, 21)
]

In [319]:
def calculate_start_coordinates(df_features, data_i):
    """Record the coordinates of the first row of one action.

    Args:
        df_features: Feature table whose row labels are action numbers;
            it is modified in place.
        data_i: Raw rows of a single action. The value in its first row and
            first column is used as the row label in ``df_features``. Must
            contain the "Координата X" and "Координата Y" columns.

    Returns:
        ``df_features`` with "X нач." and "Y нач." set for this action.
    """
    # .iat (positional) and .at (label-based) replace DataFrame.get_value /
    # set_value, which were removed in pandas 1.0.
    action_id = data_i.iat[0, 0]
    df_features.at[action_id, "X нач."] = data_i["Координата X"].iat[0]
    df_features.at[action_id, "Y нач."] = data_i["Координата Y"].iat[0]
    return df_features

In [320]:
def calculate_end_coordinates(df_features, data_i):
    """Record the coordinates of the last row of one action.

    Args:
        df_features: Feature table whose row labels are action numbers;
            it is modified in place.
        data_i: Raw rows of a single action. The value in its first row and
            first column is used as the row label in ``df_features``. Must
            contain the "Координата X" and "Координата Y" columns.

    Returns:
        ``df_features`` with "X кон." and "Y кон." set for this action.
    """
    # .iat (positional) and .at (label-based) replace DataFrame.get_value /
    # set_value, which were removed in pandas 1.0.
    action_id = data_i.iat[0, 0]
    df_features.at[action_id, "X кон."] = data_i["Координата X"].iat[-1]
    df_features.at[action_id, "Y кон."] = data_i["Координата Y"].iat[-1]
    return df_features

In [378]:
def calculate_avrg_size(df_features, data_i):
    """Record the mean of the "Размер касания" column for one action.

    Args:
        df_features: Feature table whose row labels are action numbers;
            it is modified in place.
        data_i: Raw rows of a single action. The value in its first row and
            first column is used as the row label in ``df_features``.

    Returns:
        ``df_features`` with "Средний размер" set for this action.
    """
    # Series.mean() yields the scalar directly; the previous
    # `data_i[[col]].mean()[0]` looked up position 0 on a Series labelled by
    # column name, which newer pandas no longer treats as positional.
    avrg_size = data_i["Размер касания"].mean()
    # .iat/.at replace get_value/set_value (removed in pandas 1.0).
    action_id = data_i.iat[0, 0]
    df_features.at[action_id, "Средний размер"] = avrg_size
    return df_features

In [379]:
def calculate_vec_features(df_features, data_i):
    """Derive the start-to-end displacement vector of one action.

    Reads "X нач.", "Y нач.", "X кон." and "Y кон." from the action's row in
    ``df_features``, so those four values must already be filled in.

    Args:
        df_features: Feature table whose row labels are action numbers;
            it is modified in place.
        data_i: Raw rows of a single action; only the value in its first row
            and first column is used, as the row label in ``df_features``.

    Returns:
        ``df_features`` with "X вектора", "Y вектора" and "Модуль вектора"
        set for this action.
    """
    # .iat/.at replace get_value/set_value (removed in pandas 1.0).
    action_id = data_i.iat[0, 0]
    x_vec = df_features.at[action_id, "X кон."] - df_features.at[action_id, "X нач."]
    y_vec = df_features.at[action_id, "Y кон."] - df_features.at[action_id, "Y нач."]
    # Euclidean length of the displacement.
    vec_mod = (x_vec ** 2 + y_vec ** 2) ** 0.5
    df_features.at[action_id, "X вектора"] = x_vec
    df_features.at[action_id, "Y вектора"] = y_vec
    df_features.at[action_id, "Модуль вектора"] = vec_mod
    return df_features

In [380]:
def calculate_distance(df_features, data_i):
    """Record the path length of one action.

    The length is the sum of straight-line distances between consecutive rows
    of ``data_i``, taken in row order.

    Args:
        df_features: Feature table whose row labels are action numbers;
            it is modified in place.
        data_i: Raw rows of a single action with "Координата X" and
            "Координата Y" columns. The value in its first row and first
            column is used as the row label in ``df_features``.

    Returns:
        ``df_features`` with "Длина" set for this action.
    """
    xs = data_i["Координата X"]
    ys = data_i["Координата Y"]
    # .iat replaces get_value (removed in pandas 1.0).
    prev_x, prev_y = xs.iat[0], ys.iat[0]
    distance = 0
    # The first iteration compares the first point with itself and adds 0.
    for x, y in zip(xs, ys):
        distance += ((x - prev_x) ** 2 + (y - prev_y) ** 2) ** 0.5
        prev_x, prev_y = x, y
    # .at replaces set_value (removed in pandas 1.0).
    df_features.at[data_i.iat[0, 0], "Длина"] = distance
    return df_features

In [381]:
def calculate_duration(df_features, data_i):
    """Record the elapsed time of one action.

    The duration is the "Метка времени" of the last row of ``data_i`` minus
    that of its first row.

    Args:
        df_features: Feature table whose row labels are action numbers;
            it is modified in place.
        data_i: Raw rows of a single action. The value in its first row and
            first column is used as the row label in ``df_features``.

    Returns:
        ``df_features`` with "Время" set for this action.
    """
    timestamps = data_i["Метка времени"]
    # .iat/.at replace get_value/set_value (removed in pandas 1.0).
    duration = timestamps.iat[-1] - timestamps.iat[0]
    df_features.at[data_i.iat[0, 0], "Время"] = duration
    return df_features

In [382]:
def calculate_avrg_speed(df_features, data_i):
    """Record the average speed of one action as "Длина" / "Время".

    Both inputs are read from the action's row in ``df_features``, so they
    must already be filled in.

    Args:
        df_features: Feature table whose row labels are action numbers;
            it is modified in place.
        data_i: Raw rows of a single action; only the value in its first row
            and first column is used, as the row label in ``df_features``.

    Returns:
        ``df_features`` with "Средняя скорость" set for this action.
    """
    # .iat/.at replace get_value/set_value (removed in pandas 1.0).
    action_id = data_i.iat[0, 0]
    # NOTE(review): a zero "Время" is not special-cased; what the division
    # yields then depends on the scalar types stored in the table.
    avrg_speed = df_features.at[action_id, "Длина"] / df_features.at[action_id, "Время"]
    df_features.at[action_id, "Средняя скорость"] = avrg_speed
    return df_features

In [400]:
def calculate_features(data):
    """Build a table with one feature row per action from raw touch rows.

    The number of actions is taken from the first column of the last row of
    ``data`` (plus one), and every action number from 0 up to that value is
    processed in turn.

    Args:
        data: Raw touch rows. Must contain "Номер действия", the coordinate,
            size and timestamp columns read by the ``calculate_*`` helpers,
            and at least seven columns (the seventh is copied into "Тип").

    Returns:
        A new DataFrame indexed 0..N with the thirteen feature columns, with
        missing values forward-filled from the previous row.

    Raises:
        IndexError: If an action number in 0..N has no rows in ``data``.
    """
    feature_columns = ['Номер действия', 'Тип', 'X вектора', 'Y вектора',
                       'Модуль вектора', 'Время', 'Длина', 'Средняя скорость',
                       'Средний размер', 'X нач.', 'Y нач.', 'X кон.',
                       'Y кон.']
    # .iat replaces `data.iloc[last][0]`, which relied on positional lookup
    # in a row Series labelled by column names.
    n_actions = int(data.iat[-1, 0]) + 1
    df = pd.DataFrame(index=range(n_actions), columns=feature_columns)
    for i in range(n_actions):
        data_i = data[data["Номер действия"] == i]
        # .at/.iat replace set_value/get_value (removed in pandas 1.0).
        # Column 6 of the raw data is presumably the action type. TODO confirm.
        df.at[i, 'Тип'] = data_i.iat[0, 6]
        df.at[i, 'Номер действия'] = i
        # Order matters: the vector needs start/end, the speed needs
        # distance and duration.
        calculate_start_coordinates(df, data_i)
        calculate_end_coordinates(df, data_i)
        calculate_avrg_size(df, data_i)
        calculate_vec_features(df, data_i)
        calculate_distance(df, data_i)
        calculate_duration(df, data_i)
        calculate_avrg_speed(df, data_i)
    # The frame starts with object-dtype columns; infer_objects() gives them
    # numeric dtypes where possible, and ffill() is the supported spelling of
    # the deprecated fillna(method="pad").
    return df.infer_objects().ffill()

In [392]:
def combine_data(df_list):
    """Stack a list of DataFrames into one, in list order.

    Args:
        df_list: Non-empty sequence of DataFrames.

    Returns:
        A new DataFrame. With two or more inputs the result has a fresh
        0..n-1 index; with a single input it is a copy of that frame with its
        index unchanged (the behaviour of the previous append loop).

    Raises:
        IndexError: If ``df_list`` is empty.
    """
    first = df_list[0]
    if len(df_list) == 1:
        return first.copy()
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    return pd.concat(list(df_list), ignore_index=True)

In [393]:
def far(des_mod, test_illeg):
    """Percentage of ``test_illeg`` samples the model does not flag.

    A sample counts as flagged when ``des_mod.predict`` returns -1 for it;
    the result is 100 minus the flagged percentage.

    Args:
        des_mod: Fitted model whose ``predict`` accepts a 2-D array and
            returns one label per row.
        test_illeg: DataFrame of samples, one per row.

    Returns:
        The rate as a float in the range 0..100.

    Raises:
        ZeroDivisionError: If ``test_illeg`` has no rows (the same exception
            type the bare division raised before).
    """
    total = len(test_illeg)
    if total == 0:
        raise ZeroDivisionError("far() needs at least one sample")
    # .values replaces DataFrame.as_matrix() (removed in pandas 1.0). One
    # batched predict() call replaces a separate call per row.
    predictions = des_mod.predict(test_illeg.values)
    cnt = sum(1 for label in predictions if label == -1)
    return 100 - (cnt * 100) / total

In [394]:
def frr(des_mod, test_leg):
    """Percentage of ``test_leg`` samples the model flags.

    A sample counts as flagged when ``des_mod.predict`` returns -1 for it.

    Args:
        des_mod: Fitted model whose ``predict`` accepts a 2-D array and
            returns one label per row.
        test_leg: DataFrame of samples, one per row.

    Returns:
        The rate as a float in the range 0..100.

    Raises:
        ZeroDivisionError: If ``test_leg`` has no rows (the same exception
            type the bare division raised before).
    """
    total = len(test_leg)
    if total == 0:
        raise ZeroDivisionError("frr() needs at least one sample")
    # .values replaces DataFrame.as_matrix() (removed in pandas 1.0). One
    # batched predict() call replaces a separate call per row.
    predictions = des_mod.predict(test_leg.values)
    cnt = sum(1 for label in predictions if label == -1)
    return (cnt * 100) / total

In [397]:
def main(contamination=0.057):
    """Fit an IsolationForest on the training features and print FAR and FRR.

    Reads the module-level frames ``data_mine_train``, ``data_mine_test`` and
    ``data_others_test``, converts each to per-action features, fits the
    forest on the training features and reports both error rates with
    ``print``.

    Args:
        contamination: ``contamination`` setting passed to IsolationForest.
            Defaults to 0.057, the value that used to be hard-coded here.
    """
    X_train = calculate_features(data_mine_train)
    X_leg = calculate_features(data_mine_test)

    X_others = [calculate_features(data) for data in data_others_test]
    X_illeg = combine_data(X_others)

    # Columns 0 and 1 hold the action number and type; 2..12 are the numeric
    # features the model is trained and evaluated on.
    feature_slice = slice(2, 13)

    isolation_forest = IsolationForest(random_state=3634, n_estimators=150, n_jobs=-1,
                                       contamination=contamination, bootstrap=True)
    X_leg_train = X_train.iloc[:, feature_slice]
    # .values replaces DataFrame.as_matrix() (removed in pandas 1.0).
    isolation_forest.fit(X_leg_train.values)

    X_illeg_test = X_illeg.iloc[:, feature_slice]
    print("FAR =", far(isolation_forest, X_illeg_test), "%")

    X_leg_test = X_leg.iloc[:, feature_slice]
    print("FRR =", frr(isolation_forest, X_leg_test), "%")

In [401]:
# Per-action feature tables for the User_0 train and test files.
X_train = calculate_features(data_mine_train)
X_leg = calculate_features(data_mine_test)

# One feature table per other-user file, then all of them stacked into one.
X_others = list(map(calculate_features, data_others_test))
X_illeg = combine_data(X_others)

In [403]:
# Fit on the training features and report both error rates, here with
# contamination=0.066.
isolation_forest = IsolationForest(random_state=3634, n_estimators=150, n_jobs=-1, contamination=0.066, bootstrap=True)
# Columns 0 and 1 hold the action number and type; 2..12 are the numeric features.
X_leg_train = X_train.iloc[:,range(2,13)]
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
isolation_forest.fit(X_leg_train.values)

X_illeg_test = X_illeg.iloc[:,range(2,13)]
print("FAR =", far(isolation_forest, X_illeg_test), "%")

X_leg_test = X_leg.iloc[:,range(2,13)]
print("FRR =", frr(isolation_forest, X_leg_test), "%")

FAR = 7.5 %
FRR = 6.35 %


In [389]:
# Show the training feature table (X_train is assigned from
# calculate_features(data_mine_train) in an earlier cell).
X_train

Unnamed: 0,Номер действия,Тип,X нач.,Y нач.,X кон.,Y кон.,Средний размер,X вектора,Y вектора,Модуль вектора,Длина,Время,Средняя скорость
0,0,DU,444.382812,936.231079,510.291260,455.606567,0.155594,65.908447,-480.624512,485.122505,492.688391,188,2.620683
1,1,DU,454.368927,947.222473,534.257996,432.624512,0.151923,79.889069,-514.597961,520.762254,537.847841,136,3.954764
2,2,DU,451.373108,958.213867,548.238586,429.626862,0.151923,96.865479,-528.587006,537.389192,553.897878,136,4.072779
3,3,DU,451.072327,948.167480,562.219177,401.648712,0.145105,111.146851,-546.518768,557.706362,582.607824,159,3.664200
4,4,DU,453.370331,948.221680,558.426086,412.700684,0.151923,105.055756,-535.520996,545.728366,564.737895,150,3.764919
5,5,DU,457.364777,955.216248,541.700745,433.286133,0.146154,84.335968,-521.930115,528.699915,540.530699,160,3.378317
6,6,DU,463.356445,949.220947,547.239929,461.601868,0.148077,83.883484,-487.619080,494.781574,509.322988,137,3.717686
7,7,DU,445.381439,907.253723,583.190002,398.651062,0.147436,137.808563,-508.602661,526.941996,541.117518,146,3.706284
8,8,DU,450.374481,920.243530,541.248291,459.603424,0.143162,90.873810,-460.640106,469.518218,476.600953,124,3.843556
9,9,DU,462.357849,969.205322,509.712494,501.525269,0.125000,47.354645,-467.680054,470.071372,477.689478,117,4.082816


In [396]:
# Run the experiment defined in main(); it prints the FAR and FRR percentages.
main()

FAR = 8.0 %
FRR = 10.95 %
