Import Required Libraries

In [None]:
import pandas as pd # for data manipulation and analysis
import numpy as np # for data manipulation and analysis
import sklearn as sk # for Machine learning metrics
import matplotlib.pyplot as plt # for Plotting

Load Your Dataset

In [None]:
# Read the sensor log CSV from the Google Colab files into a DataFrame
csv_path = '/content/sensor_log.csv'
df = pd.read_csv(csv_path)

Viewing dataset

In [None]:
# Preview the first 5 rows of the dataset
df.head(n=5)

Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
2,2025-10-01 08:00:20,24.6,55.1,
3,2025-10-01 08:00:30,,54.9,3.68
4,2025-10-01 08:01:00,24.9,54.8,3.68


Basic information about the dataset

In [None]:
# Summarise the dataset: shape, column names, dtype names, per-column
# missing-value counts and the overall number of missing cells.
missing_per_column = df.isna().sum()  # computed once, reused for both counts

info = {
    "shape": df.shape,
    "columns": list(df.columns),
    "dtypes": {column: dtype.name for column, dtype in df.dtypes.items()},
    "missing_counts": missing_per_column.to_dict(),
    "total_missing": int(missing_per_column.sum()),
}
print("Basic info:", info)

Basic info: {'shape': (10, 4), 'columns': ['timestamp', 'temperature_c', 'humidity_pct', 'voltage_v'], 'dtypes': {'timestamp': 'object', 'temperature_c': 'float64', 'humidity_pct': 'float64', 'voltage_v': 'float64'}, 'missing_counts': {'timestamp': 0, 'temperature_c': 2, 'humidity_pct': 1, 'voltage_v': 1}, 'total_missing': 4}


Find duplicates (all columns identical)

In [None]:
# keep=False flags every member of a group of identical rows, not only the
# second and later occurrences.
duplicates_mask = df.duplicated(keep=False)
duplicates = df.loc[duplicates_mask].copy()
duplicates_count = len(duplicates)
print(f"Found {duplicates_count} duplicate rows (keep=False).")

Found 0 duplicate rows (keep=False).


Numeric columns for imputation evaluation

In [None]:
# Names of all columns with a numeric dtype
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
print("Numeric columns:", numeric_cols)

Numeric columns: ['temperature_c', 'humidity_pct', 'voltage_v']


Handle Missing Values - Interpolation

In [None]:
# Fill missing values by linear interpolation, restricted to the numeric
# columns. Calling DataFrame.interpolate() on a frame that still contains the
# object-dtype `timestamp` column is deprecated in pandas and emits a
# FutureWarning; the non-numeric column is copied through unchanged instead.
# NOTE(review): the default linear method treats rows as equally spaced, while
# the timestamps in this log are irregular -- time-weighted interpolation may
# be more appropriate; confirm before relying on these values.
df_interpolated = df.copy()
df_interpolated[numeric_cols] = df[numeric_cols].interpolate()
df_interpolated

  df_interpolated = df.interpolate()


Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
2,2025-10-01 08:00:20,24.6,55.1,3.685
3,2025-10-01 08:00:30,24.75,54.9,3.68
4,2025-10-01 08:01:00,24.9,54.8,3.68
5,2025-10-01 08:02:15,25.1,54.75,3.67
6,2025-10-01 08:03:00,25.3,54.7,3.67
7,2025-10-01 08:05:30,25.5,54.9,3.65
8,2025-10-01 08:08:00,25.75,55.0,3.64
9,2025-10-01 08:10:00,26.0,55.1,3.63


Rechecking for missing values

In [None]:
# Per-column count of values still missing after interpolation
df_interpolated.isna().sum(axis=0)

Unnamed: 0,0
timestamp,0
temperature_c,0
humidity_pct,0
voltage_v,0


Handle Missing Values - Forward Fill

In [None]:
# Forward fill: propagate the last valid observation into each gap.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in pandas and triggers a FutureWarning.
df_ffill = df.ffill()
df_ffill

  df_ffill = df.fillna(method='ffill')


Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
2,2025-10-01 08:00:20,24.6,55.1,3.69
3,2025-10-01 08:00:30,24.6,54.9,3.68
4,2025-10-01 08:01:00,24.9,54.8,3.68
5,2025-10-01 08:02:15,25.1,54.8,3.67
6,2025-10-01 08:03:00,25.3,54.7,3.67
7,2025-10-01 08:05:30,25.5,54.9,3.65
8,2025-10-01 08:08:00,25.5,55.0,3.64
9,2025-10-01 08:10:00,26.0,55.1,3.63


Rechecking for missing values

In [None]:
# Per-column count of values still missing after forward fill
df_ffill.isna().sum(axis=0)

Unnamed: 0,0
timestamp,0
temperature_c,0
humidity_pct,0
voltage_v,0


Handle Missing Values - Backward Fill

In [None]:
# Backward fill: pull the next valid observation back into each gap.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in pandas and triggers a FutureWarning.
df_bfill = df.bfill()
df_bfill

  df_bfill = df.fillna(method='bfill')


Unnamed: 0,timestamp,temperature_c,humidity_pct,voltage_v
0,2025-10-01 08:00:00,24.5,55.2,3.7
1,2025-10-01 08:00:10,24.7,55.0,3.69
2,2025-10-01 08:00:20,24.6,55.1,3.68
3,2025-10-01 08:00:30,24.9,54.9,3.68
4,2025-10-01 08:01:00,24.9,54.8,3.68
5,2025-10-01 08:02:15,25.1,54.7,3.67
6,2025-10-01 08:03:00,25.3,54.7,3.67
7,2025-10-01 08:05:30,25.5,54.9,3.65
8,2025-10-01 08:08:00,26.0,55.0,3.64
9,2025-10-01 08:10:00,26.0,55.1,3.63


Rechecking for missing values

In [None]:
# Per-column count of values still missing after backward fill
df_bfill.isna().sum(axis=0)

Unnamed: 0,0
timestamp,0
temperature_c,0
humidity_pct,0
voltage_v,0


Finding rows that originally had missing values

In [None]:
# Locate every missing cell in the original frame as a
# (row position, column position) pair.
missing_positions = [
    (row_pos, col_pos)
    for row_pos, col_pos in zip(*np.nonzero(df.isna().to_numpy()))
]
print("\nRows that had missing values:", missing_positions)


Rows that had missing values: [(np.int64(2), np.int64(3)), (np.int64(3), np.int64(1)), (np.int64(5), np.int64(2)), (np.int64(8), np.int64(1))]


Comparing the 3 methods

In [None]:
# Comparing the results of different missing value handling methods
results = []
for row, col in missing_positions:
    colname = df.columns[col]  # get column name

    true_val = df_interpolated.loc[row, colname]   # reference value from interpolation
    ffill_val = df_ffill.loc[row, colname]        # forward fill value
    bfill_val = df_bfill.loc[row, colname]        # backward fill value

    results.append({
        'row': row,
        'column': colname,
        'interpolated_value': true_val,
        'ffill_value': ffill_val,
        'bfill_value': bfill_val,
        'ffill_error': abs(ffill_val - true_val),
        'bfill_error': abs(bfill_val - true_val)
    })

comp_df = pd.DataFrame(results)
print("\nComparison Table:")
comp_df


Comparison Table:


Unnamed: 0,row,column,interpolated_value,ffill_value,bfill_value,ffill_error,bfill_error
0,2,voltage_v,3.685,3.69,3.68,0.005,0.005
1,3,temperature_c,24.75,24.6,24.9,0.15,0.15
2,5,humidity_pct,54.75,54.8,54.7,0.05,0.05
3,8,temperature_c,25.75,25.5,26.0,0.25,0.25


Showing total and average errors

In [None]:
# Summarise how far forward fill and backward fill land from the interpolated
# reference values collected in comp_df.
print("\n==================== ERROR SUMMARY ====================")
print("Total Forward Fill Error:", comp_df['ffill_error'].sum())
print("Total Backward Fill Error:", comp_df['bfill_error'].sum())
print("Average Forward Fill Error:", comp_df['ffill_error'].mean())
print("Average Backward Fill Error:", comp_df['bfill_error'].mean())

# The errors above are deviations from the interpolated values, so
# interpolation scores 0 by definition; that cannot show it is the best method.
# For a one-row gap the linear interpolant is the midpoint of its neighbours,
# which is why the forward and backward errors come out equal.
# NOTE(review): a real comparison needs known values that are masked, imputed
# and then compared against the truth.
print("\nREFERENCE METHOD = Interpolation (its 0 error is by construction; "
      "errors above are deviations from it, not from ground truth)")


Total Forward Fill Error: 0.45499999999999563
Total Backward Fill Error: 0.45499999999999563
Average Forward Fill Error: 0.11374999999999891
Average Backward Fill Error: 0.11374999999999891

BEST METHOD = Interpolation (0 error, smooth & realistic)


Saving Clean version

In [None]:
# Saving cleaned dataframes to CSV files
df_ffill.to_csv("sensor_log_ffill.csv", index=False)
df_bfill.to_csv("sensor_log_bfill.csv", index=False)
df_interpolated.to_csv("sensor_log_interpolated.csv", index=False)

print("\nCleaned files saved:")
print(" - sensor_log_ffill.csv")
print(" - sensor_log_bfill.csv")
print(" - sensor_log_interpolated.csv")


Cleaned files saved:
 - sensor_log_ffill.csv
 - sensor_log_bfill.csv
 - sensor_log_interpolated.csv
