### 🧰 Data Inconsistency

This notebook loads `maintenance_data.csv` and demonstrates how to identify and fix:
- ⏱️ **Time/Date issues** → Standardize formats
- 📏 **Inconsistent units** → Convert to a single unit
- 🏷️ **Inconsistent categories** → Normalize labels/casing
- 🔠 **Incorrect data types** → Enforce correct types (dates, numbers)

In [11]:
import pandas as pd, os

# Load CSV (auto-create from example if not exists)
csv_path = 'raw_data/maintenance_data.csv'

df = pd.read_csv(csv_path)
df.head(15)

Unnamed: 0,maintenance_id,equipment_name,equipment_type,last_maintenance,maintenance_interval,status,temperature,cost
0,1,Equip-1,pump,2023-13-45,199 days,active,53.9,6766.31
1,2,Equip-2,Pump,2025-02-20,845 HRS,maint,83.7,$8886
2,3,Equip-3,motor,2025-04-25,573 hours,Maintenance,78.9,1069.3
3,4,Equip-4,Motor,09/07/2025,195 hours,ACTIVE,58.9,$6670
4,5,Equip-5,Motor,2025-09-13,499 HRS,Active,181.1,$9205
5,6,Equip-6,PUMP,2025-06-09,403 hours,ACTIVE,151.4,665.94
6,7,Equip-7,pump,12/17/2024,261 hrs,active,79.5,$9849
7,8,Equip-8,VALVE,2023-13-45,982 hours,active,95.3,493.44
8,9,Equip-9,pump,2023-13-45,773 min,Maint,45.3,$5247
9,10,Equip-10,Valve,2023-13-45,505 hours,ACTIVE,111.2,$1288


### Load data from a url

In [1]:
# import pandas as pd

# url = "https://raw.githubusercontent.com/Dr-AlaaKhamis/ISE518/main/6_Data_imperfection/raw_data/maintenance_data.csv"

# df = pd.read_csv(url, encoding="latin1")
# df.head()

#### ⏱️ Fix Time/Date Issues

In [12]:
raw_dates = df['last_maintenance'].copy()
df['last_maintenance'] = pd.to_datetime(df['last_maintenance'], errors='coerce')

pd.DataFrame({'raw': raw_dates, 'parsed': df['last_maintenance']})

  df['last_maintenance'] = pd.to_datetime(df['last_maintenance'], errors='coerce')


Unnamed: 0,raw,parsed
0,2023-13-45,NaT
1,2025-02-20,2025-02-20
2,2025-04-25,2025-04-25
3,09/07/2025,2025-09-07
4,2025-09-13,2025-09-13
...,...,...
95,04/21/2025,2025-04-21
96,07/06/2025,2025-07-06
97,2023-13-45,NaT
98,2023-13-45,NaT


#### 📏 Standardize Units (maintenance_interval → hours)

In [3]:
import re
def interval_to_hours(s):
    if pd.isna(s): return pd.NA
    s=str(s).lower().strip()
    m=re.search(r'(\d+(?:\.\d+)?)\s*(days|day|d|hours|hrs|hr|h)',s)
    if not m: return pd.NA
    val=float(m.group(1)); unit=m.group(2)
    return val*24 if unit in ['days','day','d'] else val

df['maintenance_interval_hours']=df['maintenance_interval'].apply(interval_to_hours)
df[['maintenance_interval','maintenance_interval_hours']]

Unnamed: 0,maintenance_interval,maintenance_interval_hours
0,199 days,4776.0
1,845 HRS,845.0
2,573 hours,573.0
3,195 hours,195.0
4,499 HRS,499.0
...,...,...
95,894 hours,894.0
96,743 hrs,743.0
97,473 days,11352.0
98,235 min,


#### 🏷️ Normalize Categories

In [4]:
df['equipment_type_clean']=df['equipment_type'].str.strip().str.lower()
df['status_clean']=df['status'].str.strip().str.lower()
df[['equipment_type','equipment_type_clean','status','status_clean']]

Unnamed: 0,equipment_type,equipment_type_clean,status,status_clean
0,pump,pump,active,active
1,Pump,pump,maint,maint
2,motor,motor,Maintenance,maintenance
3,Motor,motor,ACTIVE,active
4,Motor,motor,Active,active
...,...,...,...,...
95,motor,motor,active,active
96,Valve,valve,Maintenance,maintenance
97,Valve,valve,Down,down
98,MTR,mtr,Maintenance,maintenance


#### 🔠 Enforce Numeric Types

In [5]:
def clean_currency(x):
    s=str(x).replace('$','').replace(',','')
    try: return float(s)
    except: return pd.NA

df['cost_clean']=df['cost'].apply(clean_currency)
df['temperature_clean']=pd.to_numeric(df['temperature'],errors='coerce')
df[['cost','cost_clean','temperature','temperature_clean']]

Unnamed: 0,cost,cost_clean,temperature,temperature_clean
0,6766.31,6766.31,53.9,53.9
1,$8886,8886.00,83.7,83.7
2,1069.3,1069.30,78.9,78.9
3,$6670,6670.00,58.9,58.9
4,$9205,9205.00,181.1,181.1
...,...,...,...,...
95,6809.87,6809.87,118.7,118.7
96,$7322,7322.00,161.1,161.1
97,$6907,6907.00,92.8,92.8
98,1728.99,1728.99,120.7,120.7


#### ✅ Save Cleaned Data

In [6]:
df.to_csv('preprocessed_data/maintenance_data_clean.csv',index=False)
df.head()

Unnamed: 0,maintenance_id,equipment_name,equipment_type,last_maintenance,maintenance_interval,status,temperature,cost,maintenance_interval_hours,equipment_type_clean,status_clean,cost_clean,temperature_clean
0,1,Equip-1,pump,NaT,199 days,active,53.9,6766.31,4776.0,pump,active,6766.31,53.9
1,2,Equip-2,Pump,2025-02-20,845 HRS,maint,83.7,$8886,845.0,pump,maint,8886.0,83.7
2,3,Equip-3,motor,2025-04-25,573 hours,Maintenance,78.9,1069.3,573.0,motor,maintenance,1069.3,78.9
3,4,Equip-4,Motor,NaT,195 hours,ACTIVE,58.9,$6670,195.0,motor,active,6670.0,58.9
4,5,Equip-5,Motor,2025-09-13,499 HRS,Active,181.1,$9205,499.0,motor,active,9205.0,181.1
