<a href="https://colab.research.google.com/github/Danil-Kazakov/HW5/blob/main/Data_HW5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score

In [5]:
from pathlib import Path

DATASET_KEYS = ("f0s0", "f0s1", "f1s0", "f1s1")
CSV_DATA_PATH  = Path("/content/drive/MyDrive/content/data")
SAVED_DATAFRAME_BASE = Path("/content/")

In [6]:
if CSV_DATA_PATH.is_dir():
  class_path = list(CSV_DATA_PATH.iterdir())
  class_list = list(d.name for d in class_path)
  print(class_list)

['stairs', 'running', 'idle', 'walking']


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np
from pathlib import Path

# Assuming class_path and DATASET_KEYS are defined appropriately

def gen_saved_name(key):
    return SAVED_DATAFRAME_BASE.joinpath(f"data-{key}.father")

df_set = {}
X_set = {}
y_set = {}
reports = {}
skip_models = {
    "SVC_Linear": ["f0s0"]
}
SEED = 42

def save_dataset(key, df):
    filename = gen_saved_name(key)
    if not df.empty and not filename.is_file():
        df.to_feather(filename)

def add_stat_feature_frame(frame):
    features = []
    rows = frame.shape[0]
    for col_id in range(frame.shape[1]):
        col = frame.iloc[:, col_id]
        features.append(pd.DataFrame([col.mean()] * rows, columns=[f'{col.name}_mean']))
        features.append(pd.DataFrame([col.max()] * rows, columns=[f'{col.name}_max']))
        features.append(pd.DataFrame([col.min()] * rows, columns=[f'{col.name}_min']))
        features.append(pd.DataFrame([col.quantile(0.75) - col.quantile(0.25)] * rows, columns=[f'{col.name}_interquartile_range']))
        features.append(pd.DataFrame([col.idxmin()] * rows, columns=[f'{col.name}_index_of_minimum_value']))
        features.append(pd.DataFrame([np.mean(np.abs(col - col.mean()))] * rows, columns=[f'{col.name}_mean_of_absolute_deviation']))
        features.append(pd.DataFrame([col.median()] * rows, columns=[f'{col.name}_median']))
        features.append(pd.DataFrame([col.std()] * rows, columns=[f'{col.name}_standard_deviation']))
        features.append(pd.DataFrame([np.sqrt(np.mean(col) ** 2)] * rows, columns=[f'{col.name}_root_mean_square_error']))
    return pd.concat(features, axis=1)

# def prepare_dataset(class_path: list[Path], flatten=True, stat_feature=True, limit_frames=None):
#     dfs = []
#     for class_id, work_class_path in enumerate(class_path):
#         list_files = sorted(work_class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1]))
#         print(f"Importing class '{work_class_path.name}' : {class_id}. Frames: {len(list_files)}")
#         for i, filename in enumerate(list_files):
#             df = pd.read_csv(filename)
#             if flatten:
#                 df = flatten_frame(df)
#             if stat_feature:
#                 df = add_stat_feature_frame(df)
#             df['class'] = class_id
#             dfs.append(df)
#             if limit_frames and i >= limit_frames:
#                 break
#     df = pd.concat(dfs, ignore_index=True)
#     print(f"Розмір датасету: {df.shape}")
#     return df


def prepare_dataset(class_path: list[Path], flatten=True, stat_feature=True, limit_frames=None):
    dfs = []
    for class_id, work_class_path in enumerate(class_path):
        list_files = sorted(work_class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1]))
        print(f"Importing class '{work_class_path.name}' : {class_id}. Frames: {len(list_files)}")
        for i, filename in enumerate(list_files):
            df = pd.read_csv(filename)
            if flatten:
                df = flatten_frame(df)
            if stat_feature:
                df_stat = add_stat_feature_frame(df)
                new_df = pd.concat([df.copy(), df_stat], axis=1)  # Concatenate original and stat features
            else:
                new_df = df.copy()
            new_df['class'] = class_id
            dfs.append(new_df)
            if limit_frames and i >= limit_frames:
                break
    df = pd.concat(dfs, ignore_index=True)
    print(f"Розмір датасету: {df.shape}")
    return df

# def flatten_frame(frame):
#     columns = [f"{col}_{i}" for i in range(frame.shape[1]) for col in frame.columns]
#     return pd.DataFrame(frame.values.reshape(1, -1), columns=columns)

def flatten_frame(frame):
    columns = [f"{col}_{i}" for col in frame.columns for i in range(frame.shape[0])]
    return pd.DataFrame(frame.values.reshape(1, -1), columns=columns)


limit_frames = None
for f in [False, True]:
    for s in [False, True]:
        key_set = f"f{int(f)}s{int(s)}"
        print(f"\nDATASET {key_set}. flatten={f}, stat_feature={s}")
        if df_set.get(key_set) is not None and not df_set[key_set].empty:
            print("Already loaded, skipping")
            continue
        df_set[key_set] = prepare_dataset(class_path, flatten=f, stat_feature=s, limit_frames=limit_frames)
        save_dataset(key_set, df_set[key_set])
        X_set[key_set] = df_set[key_set].iloc[:, :-1]
        y_set[key_set] = df_set[key_set].iloc[:, -1]

# Define models and skip conditions as before
models = {
    "SVC": lambda: SVC(),
    "SVC_Linear": lambda: SVC(kernel="linear"),
    "RandomForestClassifier": lambda: RandomForestClassifier()
}

print("Models fit and prepare report")
for key in X_set.keys():
    print("-" * 80)
    print(f"DATASET {key}. shape: {df_set[key].shape}")
    X = X_set[key]
    y = y_set[key]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
    if reports.get(key) is None:
        reports[key] = {}
    for model, classification in models.items():
        print(f"\n- classification: {model}")
        if reports[key].get(model):
            print("   already fit, skipped")
            continue
        if key in skip_models.get(model, []):
            print("   skip this model")
            continue
        clf = classification()
        %time clf.fit(X_train, y_train)
        %time y_test_predict = clf.predict(X_test)
        reports[key][model] = classification_report(y_test, y_test_predict, digits=4, target_names=class_list)



DATASET f0s0. flatten=False, stat_feature=False
Importing class 'stairs' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
Розмір датасету: (193860, 4)

DATASET f0s1. flatten=False, stat_feature=True
Importing class 'stairs' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
Розмір датасету: (193860, 31)

DATASET f1s0. flatten=True, stat_feature=False
Importing class 'stairs' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
Розмір датасету: (6462, 91)

DATASET f1s1. flatten=True, stat_feature=True
Importing class 'stairs' : 0. Frames: 165


  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['clas

Importing class 'running' : 1. Frames: 3408


  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['class'] = class_id
  new_df['clas

KeyboardInterrupt: 

Підготовка датасетів з часовими ознаками

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

def gen_saved_name(key):
    return SAVED_DATAFRAME_BASE.joinpath(f"data-{key}.father")

df_set = {}
X_set = {}
y_set = {}
reports = {}
skip_load = False

for key in DATASET_KEYS:
    filename = gen_saved_name(key)
    if not skip_load and filename.is_file():
        df_set[key] = pd.read_feather(filename)
        X_set[key] = df_set[key].iloc[:, :-1]
        y_set[key] = df_set[key].iloc[:, -1]
        print(f"DATASET {key}. Завантажено. Форма: {df_set[key].shape}")
    else:
        df_set[key] = pd.DataFrame()

def save_dataset(key, df):
    filename = gen_saved_name(key)
    if not df.empty and not filename.is_file():
        df.to_feather(filename)

def add_time_features(frame):
    time_features = []
    for col in frame.columns:
        if np.issubdtype(frame[col].dtype, np.number):
            time_features.append(pd.DataFrame({
                f'{col}_mean': [frame[col].mean()],
                f'{col}_max': [frame[col].max()],
                f'{col}_min': [frame[col].min()],
                f'{col}_std': [frame[col].std()]
            }))
    if time_features:
        return pd.concat(time_features, axis=1)
    else:
        return pd.DataFrame()

def prepare_dataset(class_path: list[Path], limit_frames=None):
    dfs = []
    for class_id, work_class_path in enumerate(class_path):
        list_files = sorted(work_class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1]))
        print(f"Importing class '{work_class_path.name}' : {class_id}. Frames: {len(list_files)}")
        for i, filename in enumerate(list_files):
            df = pd.read_csv(filename)
            time_features = add_time_features(df)
            if time_features.empty:
                print(f"Warning: No numeric columns found in frame {filename}")
                continue
            df['class'] = class_id
            dfs.append(pd.concat([df, time_features], axis=1))
            if limit_frames and i >= limit_frames:
                break
    df = pd.concat(dfs, ignore_index=True)
    print(f"Розмір датасету: {df.shape}")
    return df

limit_frames = None
for flatten in [False, True]:
    for stat_feature in [False, True]:
        key_set = f"f{int(flatten)}s{int(stat_feature)}"
        print(f"\nDATASET {key_set}. flatten={flatten}, stat_feature={stat_feature}")
        if df_set.get(key_set) is not None and not df_set[key_set].empty:
            print("Вже завантажено, пропускаємо")
            continue
        df_set[key_set] = prepare_dataset(class_path, limit_frames)
        save_dataset(key_set, df_set[key_set])
        X_set[key_set] = df_set[key_set].iloc[:, :-1]
        y_set[key_set] = df_set[key_set].iloc[:, -1]



DATASET f0s0. flatten=False, stat_feature=False
Importing class 'stairs' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
Розмір датасету: (193860, 16)

DATASET f0s1. flatten=False, stat_feature=True
Importing class 'stairs' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
Розмір датасету: (193860, 16)

DATASET f1s0. flatten=True, stat_feature=False
Importing class 'stairs' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
Розмір датасету: (193860, 16)

DATASET f1s1. flatten=True, stat_feature=True
Importing class 'stairs' : 0. Frames: 165
Importing class 'running' : 1. Frames: 3408
Importing class 'idle' : 2. Frames: 1039
Importing class 'walking' : 3. Frames: 1850
Розмір датасету: (193860, 16)


Порівняння моделей SVM та RandomForestClassifier

In [20]:

models = {"SVC": lambda: SVC(),
          "SVC_Linear": lambda: SVC(kernel="linear"),
          "RandomForestClassifier": lambda: RandomForestClassifier()}
reports = {}
skip_models = {
    "SVC_Linear": ["f0s0"]
}

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Define models
models = {
    "SVC": lambda: SVC(),
    "SVC_Linear": lambda: SVC(kernel="linear"),
    "RandomForestClassifier": lambda: RandomForestClassifier()
}

# Define models to skip
skip_models = {
    "SVC_Linear": ["f0s0"]
}

# Random seed
SEED = 42

# Dictionary to store classification reports
reports = {}

# Function to fit models and prepare report
print("Models fit and prepare report")
for key in X_set.keys():
    print("-" * 80)
    print(f"DATASET {key}. shape: {df_set[key].shape}")
    X = X_set[key]
    y = y_set[key]

    # Check the number of samples in each class
    class_counts = y.value_counts()
    if class_counts.min() < 2:
        print(f"Недостатньо зразків у деяких класах для датасету {key}. Пропускаємо.")
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)

    if reports.get(key) is None:
        reports[key] = {}

    for model_name, classification in models.items():
        print(f"\n- classification: {model_name}")
        if reports[key].get(model_name):
            print("   already fit, skipped")
            continue
        if key in skip_models.get(model_name, []):
            print("   skip this model")
            continue

        clf = classification()
        print("   fitting model...")
        %time clf.fit(X_train, y_train)
        print("   predicting...")
        %time y_test_predict = clf.predict(X_test)
        reports[key][model_name] = classification_report(y_test, y_test_predict, digits=4, target_names=class_list)

# Output classification reports
for key, report in reports.items():
    print("-" * 80)
    print(f"DATASET: {key}, shape: {df_set[key].shape}")
    for model_name, metrics in report.items():
        print(f"\nМодель: {model_name}")
        print(metrics)


Models fit and prepare report
--------------------------------------------------------------------------------
DATASET f0s0. shape: (193860, 16)
Недостатньо зразків у деяких класах для датасету f0s0. Пропускаємо.
--------------------------------------------------------------------------------
DATASET f0s1. shape: (193860, 16)
Недостатньо зразків у деяких класах для датасету f0s1. Пропускаємо.
--------------------------------------------------------------------------------
DATASET f1s0. shape: (193860, 16)
Недостатньо зразків у деяких класах для датасету f1s0. Пропускаємо.
--------------------------------------------------------------------------------
DATASET f1s1. shape: (193860, 16)
Недостатньо зразків у деяких класах для датасету f1s1. Пропускаємо.
