In [1]:
import os
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Get current working directory
cwd = os.getcwd()

def load_file(filepath: str) -> pd.DataFrame:
    # Check if file exists
    if not os.path.exists(filepath):
        raise FileNotFoundError("File not found")
    
    # Read file
    try:
        df = pd.read_csv(filepath, skiprows=2, header=0, sep=",", decimal=".")
    except:
        return None
    

    # Make new column with entire filepath, and another with just the filename
    df["filepath"] = '/'.join(filepath.split(os.sep)[-2:])
    df["filename"] = os.path.basename(filepath)
    df["label"] = filepath.split(os.sep)[-2].split("-")[-1]

    return df

def get_all_data(data_folder) -> pd.DataFrame:
    # Check if folder exists
    if not os.path.exists(data_folder):
        raise FileNotFoundError("Folder not found")

    # Check if cwd is part of data_folder
    if not data_folder.startswith(cwd):
        # If not, add cwd to data_folder
        data_folder = os.path.join(cwd, data_folder)

    # Get all csv files from subfolders of data_folder
    dfs = []
    for root, dirs, filenames in os.walk(data_folder):
        for f in filenames:
            if f.endswith(".CSV"):
                df = load_file(os.path.join(root, f))
                if df is not None:
                    dfs.append(df)
    
    # Concatenate all dataframes
    df = pd.concat(dfs, ignore_index=True)
    return df
    

## Load data

In [3]:
# Load data
df = get_all_data("../../data/raw")

### Simple stats

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4189920 entries, 0 to 4189919
Data columns (total 7 columns):
 #   Column    Dtype  
---  ------    -----  
 0   millis    int64  
 1   x         float64
 2   y         float64
 3   z         float64
 4   filepath  object 
 5   filename  object 
 6   label     object 
dtypes: float64(3), int64(1), object(3)
memory usage: 223.8+ MB


In [5]:
df.head()

Unnamed: 0,millis,x,y,z,filepath,filename,label
0,7518,0.37,1.08,-0.02,Test 10-Broken_Cutting/Test 10 (1).CSV,Test 10 (1).CSV,Broken_Cutting
1,7518,0.03,1.86,-0.02,Test 10-Broken_Cutting/Test 10 (1).CSV,Test 10 (1).CSV,Broken_Cutting
2,7518,-0.36,1.72,0.47,Test 10-Broken_Cutting/Test 10 (1).CSV,Test 10 (1).CSV,Broken_Cutting
3,7518,-0.9,2.16,0.03,Test 10-Broken_Cutting/Test 10 (1).CSV,Test 10 (1).CSV,Broken_Cutting
4,7518,-0.46,0.83,0.67,Test 10-Broken_Cutting/Test 10 (1).CSV,Test 10 (1).CSV,Broken_Cutting


In [6]:
df.label.value_counts()

Broken_Cutting     1419600
Missing_Idle        898800
Healthy_Cutting     663600
Healthy_Idle        633360
Broken_Idle         299040
Missing_Cutting     275520
Name: label, dtype: int64

In [7]:
df.groupby(["label", "filename"]).size().reset_index(name="count")

Unnamed: 0,label,filename,count
0,Broken_Cutting,Test 10 (1).CSV,13440
1,Broken_Cutting,Test 10 (2).CSV,26040
2,Broken_Cutting,Test 10 (3).CSV,670320
3,Broken_Cutting,Test 10 - Merged.CSV,709800
4,Broken_Idle,Test 09 (1).CSV,2520
5,Broken_Idle,Test 09 (14).CSV,5040
6,Broken_Idle,Test 09 (15).CSV,85680
7,Broken_Idle,Test 09 (2).CSV,10080
8,Broken_Idle,Test 09 (27).CSV,840
9,Broken_Idle,Test 09 (59).CSV,1680


In [8]:
# Save data
df.to_parquet("../../data/processed/df.parquet")

: 

: 

## Split data in train and test

In [8]:
def split_grouped_data(df: pd.DataFrame, n_splits: int = 2) -> list:
    tss = TimeSeriesSplit(n_splits=n_splits)
    train_splits, test_splits = tss.split(df)
    return train_splits, test_splits

def split_data(df: pd.DataFrame, n_splits: int = 2, groupby_cols=["label", "filename"]) -> list:
    # Split into train and test for each group
    train_splits, test_splits = [], []
    for name, group in tqdm(df.groupby(groupby_cols)):
        print(name)
        train_splits, test_splits = split_data(df=group, n_splits=n_splits)
        train_data = group.iloc[train_splits[0], :]
        test_data = group.iloc[test_splits[0], :]
        train_splits.append(train_data)
        test_splits.append(test_data)
    return train_splits, test_splits
    

In [None]:
train_splits, test_splits = split_data(df, n_splits=2, groupby_cols=["label", "filename"])