In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler

def clean_drone_data(file_path: Path):
    # Load dataset
    df = pd.read_csv(file_path)

    # Replace junk values with NaN
    df = df.replace([' ', '', 'NaN', 'null', '----'], np.nan)

    # Convert all to numeric if possible
    df = df.apply(pd.to_numeric, errors="coerce")

    # Drop rows with missing values
    df = df.dropna()

    # Remove duplicates
    df = df.drop_duplicates()

    # If "time" column exists, sort + interpolate gaps
    if "time" in df.columns:
        df = df.sort_values("time")
        df = df.set_index("time").interpolate(method="linear").reset_index()

    # Filter unrealistic sensor ranges
    conditions = (
        df["accel_x"].between(-20, 20) &
        df["accel_y"].between(-20, 20) &
        df["accel_z"].between(-20, 20) &
        df["gyro_x"].between(-2000, 2000) &
        df["gyro_y"].between(-2000, 2000) &
        df["gyro_z"].between(-2000, 2000) &
        df["mag_x"].between(-100, 100) &
        df["mag_y"].between(-100, 100) &
        df["mag_z"].between(-100, 100)
    )
    df = df[conditions]

    return df

def get_cleaned_data():
    """
    Finds the first CSV inside /data, cleans it, and returns the DataFrame.
    """
    repo_root = Path().resolve().parent
    data_dir = repo_root / "data"

    csv_files = list(data_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {data_dir}")

    print(f"📂 Using dataset: {csv_files[0].name}")
    return clean_drone_data(csv_files[0])


In [15]:
df = get_cleaned_data()
df.head()
df.tail()
df.info()
df.describe()

📂 Using dataset: imu_data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 440850 entries, 0 to 544762
Data columns (total 16 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   time     440850 non-null  float64
 1   accel_x  440850 non-null  float64
 2   accel_y  440850 non-null  float64
 3   accel_z  440850 non-null  float64
 4   gyro_x   440850 non-null  float64
 5   gyro_y   440850 non-null  float64
 6   gyro_z   440850 non-null  float64
 7   mag_x    440850 non-null  float64
 8   mag_y    440850 non-null  float64
 9   mag_z    440850 non-null  float64
 10  pos_x    440850 non-null  float64
 11  pos_y    440850 non-null  float64
 12  pos_z    440850 non-null  float64
 13  roll     440850 non-null  float64
 14  pitch    440850 non-null  float64
 15  yaw      440850 non-null  float64
dtypes: float64(16)
memory usage: 57.2 MB


Unnamed: 0,time,accel_x,accel_y,accel_z,gyro_x,gyro_y,gyro_z,mag_x,mag_y,mag_z,pos_x,pos_y,pos_z,roll,pitch,yaw
count,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0,440850.0
mean,1134.18493,-0.109436,-0.043655,0.470377,0.241656,2.952476,1.143162,0.023707,-0.021935,-0.172883,1.348655,-0.023586,19.836665,0.029152,0.026798,0.006278
std,655.017058,8.886288,9.032118,3.929611,20.085323,21.8261,23.590596,0.53762,0.540648,0.628624,25.042991,24.447751,5.826608,1.633526,0.634087,1.595231
min,0.0,-19.999911,-19.999832,-19.996503,-98.483083,-100.016831,-100.020102,-1.171928,-1.193104,-1.21238,-55.18791,-54.696409,4.125631,-3.141567,-1.570796,-3.141592
25%,569.346875,-5.951704,-5.977057,-2.306548,-9.774008,-7.983377,-9.879336,-0.376429,-0.460243,-0.767784,-18.54263,-19.10389,15.446866,-1.240301,-0.385084,-1.137931
50%,1133.802083,-0.090822,0.026607,0.459349,0.003099,0.173902,0.011973,0.016036,-0.012257,-0.20357,0.848127,-0.46219,19.522095,0.0,0.012687,0.0
75%,1700.823958,5.751883,5.999937,3.205261,10.334747,12.966995,12.615225,0.451144,0.381213,0.358319,21.223284,19.421032,24.387018,1.260695,0.469021,1.168135
max,2269.841667,19.999866,19.999984,19.99362,94.494714,100.018078,100.031915,1.161891,1.158463,1.16327,55.239371,54.968636,34.943372,3.141588,1.570796,3.651898


In [20]:
repo_root = Path().resolve().parent
data_dir = repo_root / "data"

csv_files = list(data_dir.glob("*.csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {data_dir}")

print(f"📂 Using dataset: {csv_files[0].name}")
daf = pd.read_csv(csv_files[0])
daf.head()
daf.tail()
daf.info()
daf.describe()
print(daf[["accel_x","accel_y","accel_z","gyro_x","gyro_y","gyro_z","mag_x","mag_y","mag_z"]].describe())


📂 Using dataset: imu_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544763 entries, 0 to 544762
Data columns (total 16 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   time     544763 non-null  float64
 1   accel_x  544763 non-null  float64
 2   accel_y  544763 non-null  float64
 3   accel_z  544763 non-null  float64
 4   gyro_x   544763 non-null  float64
 5   gyro_y   544763 non-null  float64
 6   gyro_z   544763 non-null  float64
 7   mag_x    544763 non-null  float64
 8   mag_y    544763 non-null  float64
 9   mag_z    544763 non-null  float64
 10  pos_x    544763 non-null  float64
 11  pos_y    544763 non-null  float64
 12  pos_z    544763 non-null  float64
 13  roll     544763 non-null  float64
 14  pitch    544763 non-null  float64
 15  yaw      544763 non-null  float64
dtypes: float64(16)
memory usage: 66.5 MB
             accel_x        accel_y        accel_z         gyro_x  \
count  544763.000000  544763.000000  544763.00

In [22]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler

def clean_drone_data(file_path: Path):
    # Load dataset
    df = pd.read_csv(file_path)

    # Replace junk values with NaN
    df = df.replace([' ', '', 'NaN', 'null', '----'], np.nan)

    # Convert all to numeric if possible
    df = df.apply(pd.to_numeric, errors="coerce")

    # Drop rows with missing values
    df = df.dropna()

    # Remove duplicates
    df = df.drop_duplicates()

    # If "time" column exists, sort + interpolate gaps
    if "time" in df.columns:
        df = df.sort_values("time")
        df = df.set_index("time").interpolate(method="linear").reset_index()

    # Filter unrealistic sensor ranges
    for col in ["accel_x","accel_y","accel_z",
            "gyro_x","gyro_y","gyro_z",
            "mag_x","mag_y","mag_z"]:
        mean, std = df[col].mean(), df[col].std()
        df = df[df[col].between(mean - 3*std, mean + 3*std)]


    return df

def get_cleaned_data():
    """
    Finds the first CSV inside /data, cleans it, and returns the DataFrame.
    """
    repo_root = Path().resolve().parent
    data_dir = repo_root / "data"

    csv_files = list(data_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {data_dir}")

    print(f"📂 Using dataset: {csv_files[0].name}")
    return clean_drone_data(csv_files[0])

df = get_cleaned_data()
df.head()
df.tail()
df.info()
df.describe()
print(df[["accel_x","accel_y","accel_z","gyro_x","gyro_y","gyro_z","mag_x","mag_y","mag_z"]].describe())

📂 Using dataset: imu_data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 502135 entries, 0 to 544762
Data columns (total 16 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   time     502135 non-null  float64
 1   accel_x  502135 non-null  float64
 2   accel_y  502135 non-null  float64
 3   accel_z  502135 non-null  float64
 4   gyro_x   502135 non-null  float64
 5   gyro_y   502135 non-null  float64
 6   gyro_z   502135 non-null  float64
 7   mag_x    502135 non-null  float64
 8   mag_y    502135 non-null  float64
 9   mag_z    502135 non-null  float64
 10  pos_x    502135 non-null  float64
 11  pos_y    502135 non-null  float64
 12  pos_z    502135 non-null  float64
 13  roll     502135 non-null  float64
 14  pitch    502135 non-null  float64
 15  yaw      502135 non-null  float64
dtypes: float64(16)
memory usage: 65.1 MB
             accel_x        accel_y        accel_z         gyro_x  \
count  502135.000000  502135.000000  502135.000000 

In [13]:
import pandas as pd
import numpy as np
from pathlib import Path

def clean_drone_data(file_path: Path) -> pd.DataFrame:
    """
    Clean drone dataset:
    1. Replace junk values
    2. Drop NaN + duplicates
    3. Sort by time & interpolate gaps
    4. Remove outliers (>3σ) for sensor columns
    """
    # Load dataset
    df = pd.read_csv(file_path)

    # Replace junk placeholders with NaN
    df = df.replace([' ', '', 'NaN', 'null', '----'], np.nan)

    # Convert all to numeric where possible
    df = df.apply(pd.to_numeric, errors="coerce")

    # Drop rows with missing values
    df = df.dropna()

    # Remove duplicates
    df = df.drop_duplicates()

    # Sort + interpolate if time column exists
    if "time" in df.columns:
        df = df.sort_values("time")
        df = df.set_index("time").interpolate(method="linear").reset_index()

    # Define sensor columns to filter
    sensor_cols = [
        "accel_x", "accel_y", "accel_z",
        "gyro_x", "gyro_y", "gyro_z",
        "mag_x", "mag_y", "mag_z"
    ]
    # Apply 3σ rule for each sensor column
    for col in sensor_cols:
        if col in df.columns:  # avoid KeyError
            mean, std = df[col].mean(), df[col].std()
            df = df[df[col].between(mean - 3*std, mean + 3*std)]

    return df


def get_cleaned_data():
    """
    Finds the first CSV inside /data, cleans it, and returns the DataFrame.
    """
    repo_root = Path().resolve().parent  # adjust if needed
    data_dir = repo_root / "data"

    csv_files = list(data_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {data_dir}")

    print(f"📂 Using dataset: {csv_files[0].name}")
    return clean_drone_data(csv_files[0])



df = get_cleaned_data()
print("✅ Cleaned dataset shape:", df.shape)
print(df.head())


📂 Using dataset: imu_data.csv
✅ Cleaned dataset shape: (502135, 16)
         time    accel_x   accel_y    accel_z    gyro_x    gyro_y    gyro_z  \
0    0.000000   0.121071  0.055025   0.140186 -0.011067 -0.010805 -0.008495   
98   0.408333  26.175778  2.483711  13.233593  0.002653  0.820761 -0.003468   
99   0.412500  25.853871  2.395954  13.169779 -0.007524  0.657785  0.007546   
100  0.416667  25.590717  2.447633  12.967597  0.001140  0.494727  0.003560   
101  0.420833  25.224523  2.553393  12.966746 -0.012782  0.337902 -0.002019   

        mag_x     mag_y     mag_z     pos_x     pos_y      pos_z  roll  \
0    0.035449 -0.019849 -1.021133  0.000000  0.000000  15.000000   0.0   
98   0.906541  0.087718 -0.041971  3.086126  0.198377  16.560868  -0.0   
99   0.904822  0.068235 -0.032869  3.145891  0.203011  16.591171  -0.0   
100  0.998752  0.043929 -0.109071  3.206101  0.207688  16.621701  -0.0   
101  1.010489 -0.005313 -0.004478  3.266750  0.212407  16.652455  -0.0   

        pitc

In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import logging

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

def clean_drone_data(file_path: Path) -> pd.DataFrame:
    """
    Cleans drone sensor data from a CSV file.

    Args:
        file_path (Path): Path to the CSV file.

    Returns:
        pd.DataFrame: Cleaned DataFrame.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If required columns are missing.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    df = pd.read_csv(file_path)

    # Replace junk values with NaN
    df = df.replace([' ', '', 'NaN', 'null', '----'], np.nan)

    # Convert all to numeric if possible
    df = df.apply(pd.to_numeric, errors="coerce")

    # Drop rows with missing values
    df = df.dropna()

    # Remove duplicates
    df = df.drop_duplicates()

    # If "time" column exists, sort + interpolate gaps
    if "time" in df.columns:
        df = df.sort_values("time")
        df = df.set_index("time").interpolate(method="linear").reset_index()

    # Required sensor columns
    required_cols = [
        "accel_x", "accel_y", "accel_z",
        "gyro_x", "gyro_y", "gyro_z",
        "mag_x", "mag_y", "mag_z"
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Filter unrealistic sensor ranges
    sensor_ranges = {
        "accel_x": (-20, 20), "accel_y": (-20, 20), "accel_z": (-20, 20),
        "gyro_x": (-2000, 2000), "gyro_y": (-2000, 2000), "gyro_z": (-2000, 2000),
        "mag_x": (-100, 100), "mag_y": (-100, 100), "mag_z": (-100, 100)
    }
    for col, (low, high) in sensor_ranges.items():
        df = df[df[col].between(low, high)]

    df = df.reset_index(drop=True)
    return df

def get_cleaned_data():
    """
    Finds the first CSV inside /data, cleans it, and returns the DataFrame.
    """
    repo_root = Path().resolve().parent  # adjust if needed
    data_dir = repo_root / "data"

    csv_files = list(data_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {data_dir}")

    print(f"📂 Using dataset: {csv_files[0].name}")
    return clean_drone_data(csv_files[0])



df = get_cleaned_data()
print("✅ Cleaned dataset shape:", df.shape)
print(df.head())


📂 Using dataset: imu_data.csv
✅ Cleaned dataset shape: (440850, 16)
       time    accel_x   accel_y    accel_z    gyro_x    gyro_y    gyro_z  \
0  0.000000   0.121071  0.055025   0.140186 -0.011067 -0.010805 -0.008495   
1  0.504167  19.957816  1.920644  10.143239 -0.000567 -2.835298 -0.001505   
2  0.508333  19.790528  1.958678  10.134391  0.010026 -2.971429  0.006720   
3  0.512500  19.461474  1.863660   9.811761 -0.005655 -3.139505  0.013458   
4  0.516667  19.186982  2.083963   9.814028  0.029304 -3.242148 -0.008315   

      mag_x     mag_y     mag_z     pos_x     pos_y      pos_z  roll  \
0  0.035449 -0.019849 -1.021133  0.000000  0.000000  15.000000   0.0   
1  0.970741  0.099023 -0.122807  4.564932  0.315078  17.311109  -0.0   
2  1.035171  0.022468 -0.162167  4.633747  0.320597  17.346041  -0.0   
3  0.997608  0.057119 -0.181006  4.702900  0.326149  17.381145  -0.0   
4  0.916779 -0.029111 -0.141337  4.772387  0.331735  17.416420  -0.0   

      pitch  yaw  
0 -0.000000  0.0 

In [15]:
import pandas as pd
import numpy as np
from pathlib import Path

def clean_drone_data(file_path: Path, drop_constant_cols: bool = True) -> pd.DataFrame:
    """
    Clean drone dataset:
    1. Replace junk values
    2. Drop NaN + duplicates
    3. Remove columns with >50% NaN
    4. Strip whitespace & lowercase columns
    5. Sort by time & interpolate gaps
    6. Remove outliers (>3σ) for sensor columns
    7. Optionally drop constant columns
    """
    # Load dataset
    df = pd.read_csv(file_path)

    # Clean column names
    df.columns = df.columns.str.strip().str.lower()

    # Strip whitespace from string/object columns
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.strip()

    # Replace junk placeholders with NaN
    df = df.replace([' ', '', 'NaN', 'nan', 'null', '----'], np.nan)

    # Remove columns with >50% missing values
    thresh = len(df) * 0.5
    df = df.dropna(axis=1, thresh=thresh)

    # Convert all to numeric where possible
    df = df.apply(pd.to_numeric, errors="coerce")

    # Drop rows with missing values
    df = df.dropna()

    # Remove duplicates
    df = df.drop_duplicates()

    # Sort + interpolate if time column exists
    if "time" in df.columns:
        df = df.sort_values("time")
        df = df.set_index("time").interpolate(method="linear").reset_index()

    # Define sensor columns to filter
    sensor_cols = [
        "accel_x", "accel_y", "accel_z",
        "gyro_x", "gyro_y", "gyro_z",
        "mag_x", "mag_y", "mag_z"
    ]
    # Apply 3σ rule for each sensor column
    for col in sensor_cols:
        if col in df.columns:  # avoid KeyError
            mean, std = df[col].mean(), df[col].std()
            df = df[df[col].between(mean - 3*std, mean + 3*std)]

    # Optionally drop constant columns
    if drop_constant_cols:
        nunique = df.nunique()
        const_cols = nunique[nunique == 1].index
        df = df.drop(columns=const_cols)

    df = df.reset_index(drop=True)
    return df


def get_cleaned_data():
    """
    Finds the first CSV inside /data, cleans it, and returns the DataFrame.
    """
    repo_root = Path().resolve().parent  # adjust if needed
    data_dir = repo_root / "data"

    csv_files = list(data_dir.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {data_dir}")

    print(f"📂 Using dataset: {csv_files[0].name}")
    return clean_drone_data(csv_files[0])



df = get_cleaned_data()
print("✅ Cleaned dataset shape:", df.shape)
print(df.head())


📂 Using dataset: imu_data.csv
✅ Cleaned dataset shape: (502135, 16)
       time    accel_x   accel_y    accel_z    gyro_x    gyro_y    gyro_z  \
0  0.000000   0.121071  0.055025   0.140186 -0.011067 -0.010805 -0.008495   
1  0.408333  26.175778  2.483711  13.233593  0.002653  0.820761 -0.003468   
2  0.412500  25.853871  2.395954  13.169779 -0.007524  0.657785  0.007546   
3  0.416667  25.590717  2.447633  12.967597  0.001140  0.494727  0.003560   
4  0.420833  25.224523  2.553393  12.966746 -0.012782  0.337902 -0.002019   

      mag_x     mag_y     mag_z     pos_x     pos_y      pos_z  roll  \
0  0.035449 -0.019849 -1.021133  0.000000  0.000000  15.000000   0.0   
1  0.906541  0.087718 -0.041971  3.086126  0.198377  16.560868  -0.0   
2  0.904822  0.068235 -0.032869  3.145891  0.203011  16.591171  -0.0   
3  0.998752  0.043929 -0.109071  3.206101  0.207688  16.621701  -0.0   
4  1.010489 -0.005313 -0.004478  3.266750  0.212407  16.652455  -0.0   

      pitch  yaw  
0 -0.000000  0.0 