In [1]:
from typing import Optional
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
''' utility '''


class Data:
    train_data: list[pd.DataFrame]
    train_labels: list[int]
    test_data: list[pd.DataFrame]
    test_nums: list[str]
        
    def __init__(
        self,
        train_data_path: Path,
        test_data_path: Path
    ) -> None:
        self.train_data, self.train_labels = self.collect_train(train_data_path)
        self.test_data, self.test_nums = self.collect_test(test_data_path)
    
    def collect_train(self, path: Path) -> tuple[list[pd.DataFrame], list[int]]:
        frames: list[pd.DataFrame] = []
        labels: list[int] = []
        
        type_patterns = {
            ('cтояние', 'тояни', 'месте'): {'name': 'стояние', 'id': 0},
            ('ходьба', 'одьб', 'аг'): {'name': 'ходьба', 'id': 1},
            ('бег'): {'name': 'бег', 'id': 2},
            ('велосипед', 'елоси'): {'name': 'велосипед', 'id': 3},
            ('лестница', 'естн', 'одъ'): {'name': 'лестница', 'id': 4},
            ('автомобиль', 'втомо', 'ашин'): {'name': 'автомобиль', 'id': 5},
            ('метро'): {'name': 'метро', 'id': 6},
            ('автобус', 'автоб'): {'name': 'автобус', 'id': 7},
            ('самокат', 'амок'): {'name': 'самокат', 'id': 8}
        }
        
        for sub_path in path.iterdir():
            if sub_path.is_file():
                try:
                    type_str = str(sub_path.name).split('_')[-2]
                except IndexError:
                    continue
                else:
                    label = int()
                    
                    for key, value in type_patterns.items():
                        if type_str.lower() in key:
                            label = value['id']
                            
                    if label not in range(9):
                        raise ValueError(f'Ошибка при определении типа движения. {type_str = }')
                        
                for sep in (',', ';'):
                    try:
                        df: pd.DataFrame = pd.read_csv(sub_path, sep=sep, dtype=str)
                    except Exception as ex:
                        continue
                    else:
                        if df.shape[1] == 5:
                            try:
                                df = df.applymap(lambda x: str(x).replace(',', '.'))
                                df['gFx'] = df['gFx'].astype(float)
                                df['gFy'] = df['gFy'].astype(float)
                                df['gFz'] = df['gFz'].astype(float)
                            except Exception as ex:
                                 continue
                            else:
                                frames.append(df)
                                labels.append(label)
                            
        return frames, labels
    
    def collect_test(self, path: Path) -> list[pd.DataFrame]:
        frames: list[pd.DataFrame] = []
        nums: list[str] = []
            
        for sub_path in path.iterdir():
            if sub_path.is_file():
                track_num = str(sub_path.name).split('_')[-1].split('.')[-2]
                
                for sep in (',', ';'):
                    try:
                        df: pd.DataFrame = pd.read_csv(sub_path, sep=sep, dtype=str)
                    except Exception as ex:
                        print(f'{sub_path = }, {ex = }')
                        continue
                    else:
                        if df.shape[1] == 5:
                            try:
                                df = df.applymap(lambda x: str(x).replace(',', '.'))
                                df['gFx'] = df['gFx'].astype(float)
                                df['gFy'] = df['gFy'].astype(float)
                                df['gFz'] = df['gFz'].astype(float)
                            except Exception as ex:
                                continue
                            else:
                                frames.append(df)
                                nums.append(track_num)
                                
                            
        return frames, nums
    

def process_frame(df: pd.DataFrame, label: int) -> pd.Series:
    start = int(0.2 * len(df))
    end = -int(0.2 * len(df))

    accel = np.log(np.std((df.gFx[start:end] ** 2 + df.gFy[start:end] ** 2 + df.gFz[start:end] ** 2) ** 0.5))
    
    return pd.Series([accel, label])

In [3]:
train_data_path = Path('./public_data')
test_data_path = Path('./kaggle_data_open')


data = Data(train_data_path, test_data_path)
print(len(data.train_data))
print(len(data.train_labels))
print(len(data.test_data))

736
736
183


In [4]:
train_simple = pd.DataFrame()
train_advanced = pd.DataFrame()


for frame, label in zip(data.train_data, data.train_labels):
    
    if label in (0, 1, 2, 3, 4):
        train_simple = train_simple.append(process_frame(frame, label), ignore_index=True)
    elif label in (5, 6, 7, 8):
        train_advanced = train_advanced.append(process_frame(frame, label), ignore_index=True)
        

train_simple.columns = ['accel', 'label']
train_advanced.columns = ['accel', 'label']


X_simple = np.array(train_simple.accel).reshape(-1, 1)
y_simple = train_simple.label


X_advanced = np.array(train_advanced.accel).reshape(-1, 1)
y_advanced = train_advanced.label


model_simple = LogisticRegression().fit(X_simple, y_simple)
model_advanced = LogisticRegression().fit(X_advanced, y_advanced)


print(model_simple.score(X_simple, y_simple))
print(model_advanced.score(X_advanced, y_advanced))

0.616519174041298
0.5


In [5]:
test_categories: pd.DataFrame = pd.read_csv('tracks_levels_open.csv', sep=',', dtype=str)
test_simple = pd.DataFrame()
test_advanced = pd.DataFrame()


for track, num in zip(data.test_data, data.test_nums):
    category: str = test_categories[test_categories.track_num == num]['level'].values[0]
    
    if category == 'base':
        test_simple = test_simple.append(process_frame(track, num), ignore_index=True)
    elif category == 'advanced':
        test_advanced = test_advanced.append(process_frame(track, num), ignore_index=True)
        

test_simple.columns = ['accel', 'track_num']
test_advanced.columns = ['accel', 'track_num']


test_simple = pd.concat(
    [
        test_simple,
        pd.Series(model_simple.predict(np.array(test_simple.accel).reshape(-1, 1)), name='action')
    ],
    axis=1
)    
test_advanced = pd.concat(
    [
        test_advanced,
        pd.Series(model_advanced.predict(np.array(test_advanced.accel).reshape(-1, 1)), name='action')
    ],
    axis=1
) 

In [6]:
out: pd.DataFrame = pd.concat([test_simple[['track_num', 'action']], test_advanced[['track_num', 'action']]])
out = out.astype({'track_num': int, 'action': int})
out.sort_values(by=['track_num']).to_csv('output.csv', index=False)