In [1]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import re
import geopandas as gpd

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'geopandas'

# 1. Препроцессинг паркетов

In [2]:
list_of_files = list()
for (dirpath, dirnames, filenames) in os.walk("../../data/01_raw/third_departure_parquet/"):
    list_of_files += [os.path.join(dirpath, file) for file in filenames if ".parquet" in file]

In [5]:
def preprocess_accelerometer_data(one_row_acc_data):
    possible_list = eval(one_row_acc_data) if one_row_acc_data is not None else [np.nan, np.nan, np.nan]
    if len(possible_list) == 0:
        possible_list = [np.nan, np.nan, np.nan]
    return possible_list

In [6]:
for filepath in tqdm(list_of_files):
    data = pd.read_parquet(filepath)
    data["timestamp_rownum"] = data.groupby(["Время отправки гео"]).cumcount() + 1
    data['Акселерометр'] = data['Акселерометр'].str.replace("\n", "").apply(lambda x: preprocess_accelerometer_data(x))
    data[["acc_x", "acc_y", "acc_z"]] = data['Акселерометр'].to_list()
    data.to_parquet(filepath.replace('third_departure_parquet', 'third_departure_parquet_preprocessed').replace(" ", "_"))

  9%|███████▌                                                                           | 1/11 [00:05<00:55,  5.54s/it]


KeyboardInterrupt: 

In [7]:
list_of_files_with_anomalies = list()
for (dirpath, dirnames, filenames) in os.walk("../../data/01_raw/third_departure_parquet_preprocessed/"):
    list_of_files_with_anomalies += [os.path.join(dirpath, file) for file in filenames if ".parquet" in file]

In [9]:
def get_date_from_filename(filename):
    filename_list = filename.replace('.parquet', '').split("_")
    for each in filename_list:
        found_date = re.search("[0-9][0-9](.)[0-9][0-9](.)20[0-9][0-9]$", each)
        if found_date is not None:
            return found_date.group()

In [10]:
dict_dates = dict()
for filename in tqdm(list_of_files_with_anomalies):
    date = get_date_from_filename(filename)
    if date not in dict_dates:
        dict_dates[date] = []
    df = pd.read_parquet(filename)
    dict_dates[date].append(df)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 12.06it/s]


In [11]:
def make_subsample_for_df(df):
    subsample = df.sample(n=50)
    dev_id = subsample["Наименование"].values[0]
    gdf_subsample = gpd.GeoDataFrame(subsample, 
                                     geometry=gpd.points_from_xy(subsample['Долгота'], subsample['Широта']), 
                                     crs=4326).to_crs(3857)
    data_for_gdf = d = {"dev_id": [dev_id], "geometry":[gdf_subsample.buffer(1).unary_union]}
    output_gdf = gpd.GeoDataFrame(data_for_gdf, crs=3857)
    return output_gdf

In [12]:
def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    dt_column = "Время отправки гео"
    device_id_column = "Наименование"
    if not df[dt_column].is_monotonic_increasing:
        df = df.sort_values(by=[dt_column], ignore_index=True)
    df = df.groupby([device_id_column, dt_column]).head(1)
    df = df.sort_values(by=[dt_column], ignore_index=True)
    return df

In [13]:
def convert_dataframe_to_geodataframe(df: pd.DataFrame) -> gpd.GeoDataFrame:
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Широта'], df['Долгота']), 
                                     crs=4326).to_crs(32633)
    gdf["x_metres"] = gdf["geometry"].x
    gdf["y_metres"] = gdf["geometry"].y
    return gdf

In [14]:
def make_subsample_for_df(df):
    subsample = df.sample(n=50)
    dev_id = subsample["Наименование"].values[0]
    gdf_subsample = gpd.GeoDataFrame(subsample, 
                                     geometry=gpd.points_from_xy(subsample['Долгота'], subsample['Широта']), 
                                     crs=4326).to_crs(3857)
    data_for_gdf = d = {"dev_id": [dev_id], "geometry":[gdf_subsample.buffer(1).unary_union]}
    output_gdf = gpd.GeoDataFrame(data_for_gdf, crs=32633)
    return output_gdf

In [15]:
def find_possible_intersections(df: gpd.GeoDataFrame, buffer=50):
    df_copy = df.copy(deep=True)
    df_copy["geometry"] = df_copy.buffer(buffer)
    data_for_gdf = {"dev_id": [], 
                    "geometry": []}
    for unique_id in df_copy["Наименование"].unique():
        data_for_gdf["dev_id"].append(unique_id)
        data_for_gdf["geometry"].append(df_copy[df_copy["Наименование"] == unique_id].unary_union)
    batch_polygons = gpd.GeoDataFrame(data_for_gdf, crs=32633)
    possible_intersects = batch_polygons.sjoin(batch_polygons, how='inner', predicate="intersects")
    possible_intersects = possible_intersects[possible_intersects["dev_id_left"] !=
                                              possible_intersects["dev_id_right"]]
    return possible_intersects

In [16]:
def drop_permutations_copys(df: pd.DataFrame) -> pd.DataFrame:
    cols = df.columns.values
    df.loc[:, cols] = np.sort(df.loc[:, cols].values, axis=1)
    df = df.drop_duplicates()
    return df

In [17]:
def count_speed(row):
    x_diff_1 = row["x_metres_x_left"] - row["x_metres_x_right"]
    y_diff_1 = row["y_metres_x_left"] - row["y_metres_x_right"]
    x_diff_2 = row["x_metres_y_left"] - row["x_metres_y_right"]
    y_diff_2 = row["y_metres_y_left"] - row["y_metres_y_right"]
    time_diff = (row["Время отправки гео_left"] - row["Время отправки гео_right"]).total_seconds()
    
    distance_1 = np.sqrt(x_diff_1 ** 2 + y_diff_1 ** 2)
    distance_2 = np.sqrt(x_diff_2 ** 2 + y_diff_2 ** 2)
    row["speed_left"] = distance_1 / time_diff
    row["speed_right"] = distance_2 / time_diff
    return row


def count_correlation_features(merged_df: gpd.GeoDataFrame, 
                               data_df: pd.DataFrame):
    accelerometer_axis = ["x", "y", "z"]
    for axis in accelerometer_axis:
        data_df[f"accelerometer_{axis}_corr"] = abs(merged_df[[f"acc_{axis}_x", f"acc_{axis}_y"]].corr().values[1, 0])
        data_df[f"accelerometer_{axis}_diff_corr"] = abs(merged_df[[f"acc_{axis}_x", f"acc_{axis}_y"]].diff(1).corr().values[1, 0])
        
    metres_columns_correlation = ["x", "y"]
    for meter_column in metres_columns_correlation:
        data_df[f"{meter_column}_metres_corr"] = abs(merged_df[[f"{meter_column}_metres_x", f"{meter_column}_metres_y"]].corr().values[1, 0])
        data_df[f"{meter_column}_diff_metres_corr"] = abs(merged_df[[f"{meter_column}_metres_x", f"{meter_column}_metres_y"]].diff(1).corr().values[1, 0])
        
    data_df["height_corr"] = abs(merged_df[["Высота_x", "Высота_y"]].corr().values[1, 0])
    data_df["bearing_corr"] = abs(merged_df[["bearing_x", "bearing_y"]].corr().values[1, 0])
    data_df["speed_corr"] = abs(merged_df[["Скорость_x", "Скорость_y"]].corr().values[1, 0])
    
    data_df["height_diff_corr"] = abs(merged_df[["Высота_x", "Высота_y"]].diff(1).corr().values[1, 0])
    data_df["bearing_diff_corr"] = abs(merged_df[["bearing_x", "bearing_y"]].diff(1).corr().values[1, 0])
    data_df["speed_diff_corr"] = abs(merged_df[["Скорость_x", "Скорость_y"]].diff(1).corr().values[1, 0])
    
    merged_itself = merged_df.join(merged_df.shift(), lsuffix="_left", rsuffix="_right")
    merged_itself.reset_index(drop=True, inplace=True)
    merged_itself = merged_itself.loc[1:].apply(lambda x: count_speed(x), axis=1)
    
    data_df["speed_corr"] = abs(merged_itself[["speed_left", "speed_right"]].corr().values[1, 0])
    data_df["speed_diff_corr"] = abs(merged_itself[["speed_left", "speed_right"]].diff(1).corr().values[1, 0])
    
    data_df["speed_mean_abs_diff"] = abs(merged_itself["speed_left"] - merged_itself["speed_right"]).mean()
    data_df["speed_mean_diff"] = (merged_itself["speed_left"] - merged_itself["speed_right"]).mean()
    data_df["speed_std_diff"] = (merged_itself["speed_left"] - merged_itself["speed_right"]).std()
    return data_df


def get_features_from_dataframes(gdf1: gpd.GeoDataFrame, 
                                 gdf2: gpd.GeoDataFrame):
    data = pd.DataFrame()
    merged_df = gdf1.merge(gdf2, on=["Время отправки гео"], how="inner")
    data["date"] = [gdf1["Время отправки гео"].dt.to_period('D').values[0].to_timestamp()]
    data["datetime"] = gdf1.reset_index()["time_category"][0].left
    data["dev_id_1"] = gdf1["Наименование"].values[0]
    data["id_1_last_geometry"] = gdf1.tail(1).reset_index()["geometry"][0]
    data["id_1_last_lon"] = gdf1.tail(1).reset_index()["Долгота"][0]
    data["id_1_last_lat"] = gdf1.tail(1).reset_index()["Широта"][0]
    data["dev_id_2"] = gdf2["Наименование"].values[0]
    data["id_2_last_geometry"] = gdf2.tail(1).reset_index()["geometry"][0]
    data["id_2_last_lon"] = gdf2.tail(1).reset_index()["Долгота"][0]
    data["id_2_last_lat"] = gdf2.tail(1).reset_index()["Широта"][0]
    if merged_df.shape[0] < 5:
        return data
    
    merged_df['distance'] = merged_df.apply(lambda x: x["geometry_x"].distance(x["geometry_y"]), axis=1)
    
    data = count_correlation_features(merged_df, data)
    data["distance_mean"] = merged_df['distance'].mean()
    data["distance_std"] = merged_df['distance'].std()
    return data

# final features collection

In [15]:
# Добавить фичи по корреляции координат, высоты, наклона, скорости 
# фичи по приращению координат, акселерометра

In [18]:
dates = dict_dates.keys()
date_to_generate_new_data = list(dates)[2]
dataset_to_fake = dict_dates[date_to_generate_new_data][0].copy(deep=True)

In [19]:
from random import normalvariate

In [20]:
for col in ["Долгота", "Широта"]:
    dataset_to_fake[col] = dataset_to_fake[col].apply(lambda x: x + normalvariate(0, 0.00001))

In [21]:
dataset_to_fake["Наименование"] = dataset_to_fake["Наименование"].str.replace("D", "A").str.replace("4", "1").str.replace("2", "5").str.replace("9", "3") 

In [22]:
dict_dates[date_to_generate_new_data].append(dataset_to_fake)

In [23]:
len(dict_dates[date_to_generate_new_data])

3

In [24]:
dates = dict_dates.keys()
feats = []
for date in tqdm(dates):
    data_list = dict_dates[date]
    all_data = pd.concat(data_list, ignore_index=True)
    all_data_preprocessd = preprocess_dataframe(all_data)
    all_data_gdf = convert_dataframe_to_geodataframe(all_data_preprocessd)
    cuts = pd.date_range(all_data_gdf["Время отправки гео"].min() - pd.Timedelta("5min"),
                     all_data_gdf["Время отправки гео"].max() + pd.Timedelta("5min"),
                     freq="5min")
    all_data_gdf["time_category"] = pd.cut(all_data_gdf["Время отправки гео"], cuts)
    useful_samples = []
    for each in all_data_gdf["time_category"].unique():
        tmp_df = all_data_gdf[all_data_gdf["time_category"] == each]
        if tmp_df["Наименование"].nunique() > 1:
            useful_samples.append(tmp_df)
    for sample in useful_samples:
        possible_intersecions = find_possible_intersections(sample, 10000)
        relevant_pairs = possible_intersecions[["dev_id_left", "dev_id_right"]]
        relevant_pairs = drop_permutations_copys(relevant_pairs)
        for i, (first_id, second_id) in relevant_pairs.iterrows():
            df1 = sample[sample["Наименование"] == first_id]
            df2 = sample[sample["Наименование"] == second_id]
            feats.append(get_features_from_dataframes(df1, df2))

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [05:16<00:00, 52.78s/it]


In [143]:
60 / (157 + 129 + 159 + 127 + 125 + 228)

0.06486486486486487

In [25]:
concat_gdf = gpd.GeoDataFrame(pd.concat(feats, ignore_index=True))

In [26]:
concat_gdf_copy = concat_gdf.drop(["id_1_last_geometry", "id_2_last_geometry"], axis=1)

In [32]:
concat_gdf_copy.to_parquet("../../data/02_interim/sample_11_02_all_features_high_radius.parquet")

In [147]:
pd.concat(feats)

Unnamed: 0,date,datetime,dev_id_1,id_1_last_geometry,id_1_last_lon,id_1_last_lat,dev_id_2,id_2_last_geometry,id_2_last_lon,id_2_last_lat,...,bearing_corr,speed_corr,height_diff_corr,bearing_diff_corr,speed_diff_corr,speed_mean_abs_diff,speed_mean_diff,speed_std_diff,distance_mean,distance_std
0,2022-12-01,2022-12-01 13:23:18,D4909E26-0469-4045-B426-211006C38915,POINT (4167729.7370922077 5046305.233131125),37.602747,55.892787,E9B9C64C-B66D-416D-BACD-3F0C9C2E1767,POINT (4167731.35551136 5046305.619663482),37.602743,55.892803,...,0.338381,0.042635,0.004418513,0.025518,0.159448,2.333663,0.715513,9.392581,167.094257,137.438628
0,2022-12-01,2022-12-01 13:28:18,D4909E26-0469-4045-B426-211006C38915,POINT (4167710.144697991 5046306.129507374),37.602823,55.892623,E9B9C64C-B66D-416D-BACD-3F0C9C2E1767,POINT (4167694.825697468 5046320.523731084),37.602976,55.892558,...,0.057829,0.037434,3.9331709999999997e-19,0.06795,0.010796,1.64502,-0.2658,5.692483,24.32799,26.000711
0,2022-12-01,2022-12-01 13:33:18,D4909E26-0469-4045-B426-211006C38915,POINT (4167649.1527787885 5046745.919781288),37.606037,55.894093,E9B9C64C-B66D-416D-BACD-3F0C9C2E1767,POINT (4167516.697585057 5046150.599008019),37.602461,55.890264,...,0.293307,0.008766,0.1259161,0.028467,0.013877,2.285475,-0.193516,7.742086,255.952047,169.591255
0,2022-11-23,2022-11-23 08:32:55,A1303E56-0163-1015-B156-511006C38315,POINT (4167166.2155623296 5047647.915995742),37.613919,55.894047,D4909E26-0469-4045-B426-211006C38915,POINT (4167166.077660636 5047648.173057517),37.613921,55.894047,...,0.992794,0.340381,0.08496672,0.921901,0.298254,11.471759,-0.1886,15.512566,15.983252,13.413887
0,2022-11-23,2022-11-23 08:37:55,A1303E56-0163-1015-B156-511006C38315,POINT (4166801.792311876 5047722.784957221),37.615742,55.891267,D4909E26-0469-4045-B426-211006C38915,POINT (4166783.3084728173 5047708.395730081),37.615711,55.891043,...,0.991177,0.113257,0.8138939,0.817616,0.037867,10.910695,0.59749,14.334209,15.591108,13.084448
0,2022-11-23,2022-11-23 08:42:55,A1303E56-0163-1015-B156-511006C38315,POINT (4166507.9267304596 5048110.261520888),37.619439,55.890507,D4909E26-0469-4045-B426-211006C38915,POINT (4166507.8159378185 5048110.335273138),37.61944,55.890507,...,0.99119,0.272667,0.9688304,0.574795,0.306233,10.492915,-0.262314,13.811728,15.240522,12.67658
0,2022-11-23,2022-11-23 08:47:55,A1303E56-0163-1015-B156-511006C38315,POINT (4166510.333802866 5048159.616967436),37.619767,55.890752,D4909E26-0469-4045-B426-211006C38915,POINT (4166515.767004424 5048121.424633239),37.619487,55.890625,...,0.983878,0.125557,0.8007275,0.712638,0.025302,11.839408,0.073684,16.012501,15.767768,14.044652
0,2022-11-23,2022-11-23 08:52:55,A1303E56-0163-1015-B156-511006C38315,POINT (4166423.9482391244 5047978.217199802),37.618843,55.88919,D4909E26-0469-4045-B426-211006C38915,POINT (4166445.1125429687 5047973.88180606),37.618737,55.889352,...,0.962112,0.178826,0.9533443,0.581856,0.208384,11.538364,-0.246751,14.869585,15.855886,13.374617
0,2022-11-23,2022-11-23 08:57:55,A1303E56-0163-1015-B156-511006C38315,POINT (4166599.809806052 5047846.857978165),37.617315,55.8901,D4909E26-0469-4045-B426-211006C38915,POINT (4166599.7798591885 5047847.016700024),37.617316,55.890101,...,0.985075,0.333248,0.7604701,0.876458,0.197352,11.671487,-0.758598,16.317176,15.249034,14.388407
0,2022-11-23,2022-11-23 09:02:55,A1303E56-0163-1015-B156-511006C38315,POINT (4166567.587614095 5047877.695669343),37.617641,55.889964,D4909E26-0469-4045-B426-211006C38915,POINT (4166569.523856252 5047867.637606687),37.617565,55.889935,...,0.961768,0.185184,0.1675153,0.736675,0.143248,10.839715,0.3352,14.854976,14.992387,12.946071
