In [25]:
import pandas as pd
import numpy as np

# Load the data
print("=== ORIGINAL DATA ===\n")
mine_df = pd.read_csv('Mine.csv')
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(mine_df)
print("\nMissing values count before cleaning:")
print(mine_df.isna().sum())

=== ORIGINAL DATA ===

    Duration         Date  Pulse  Maxpulse  Calories
0         60  2023/10/01'  110.0     130.0     409.1
1         60  2023/10/02'  117.0     145.0     479.0
2         60  2023/10/03'  103.0     135.0     340.3
3         45  2023/10/04'  109.0     175.0     282.4
4         45  2023/10/05'  117.0     150.0     405.1
5         60  2023/10/06'  103.0     125.0     300.0
6         60  2023/10/07'  110.0     135.0     374.0
7        400  2023/10/08'  114.0     133.0       NaN
8         60  2023/10/09'  112.0     126.0     193.8
9         30  2023/10/10'  102.0     147.0     234.8
10        60  2023/10/11'  100.0     129.0     375.3
11        60  2023/10/12'  109.0     131.0     345.6
12        60  2023/10/13'  103.0     136.0     239.2
13        60  2023/10/15'  120.0       NaN     240.8
14        60  2023/10/15'  120.0     100.0     240.8
15        60  2023/10/16'    NaN     101.0     243.8
16        60  2023/10/17'  127.0     102.0     380.2
17        45     202310

In [26]:

# 1. Handle missing values
print("\n=== STEP 1: HANDLE MISSING VALUES ===")

print("\nFilling missing Pulse with median...")
pulse_before = mine_df['Pulse'].isna().sum()
mine_df['Pulse'] = mine_df['Pulse'].fillna(mine_df['Pulse'].median())
pulse_after = mine_df['Pulse'].isna().sum()
print(f"Fixed {pulse_before - pulse_after} missing Pulse values")

print("\nFilling missing Maxpulse with mean...")
maxpulse_before = mine_df['Maxpulse'].isna().sum()
mine_df['Maxpulse'] = mine_df['Maxpulse'].fillna(mine_df['Maxpulse'].mean())
maxpulse_after = mine_df['Maxpulse'].isna().sum()
print(f"Fixed {maxpulse_before - maxpulse_after} missing Maxpulse values")

print("\nFilling missing Calories with 0...")
calories_before = mine_df['Calories'].isna().sum()
mine_df['Calories'] = mine_df['Calories'].fillna(0)
calories_after = mine_df['Calories'].isna().sum()
print(f"Fixed {calories_before - calories_after} missing Calories values")

print("\nData after missing value treatment:")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(mine_df)

# [Rest of your cleaning steps with similar full-data display...]


=== STEP 1: HANDLE MISSING VALUES ===

Filling missing Pulse with median...
Fixed 1 missing Pulse values

Filling missing Maxpulse with mean...
Fixed 2 missing Maxpulse values

Filling missing Calories with 0...
Fixed 4 missing Calories values

Data after missing value treatment:
    Duration         Date  Pulse    Maxpulse  Calories
0         60  2023/10/01'  110.0  130.000000     409.1
1         60  2023/10/02'  117.0  145.000000     479.0
2         60  2023/10/03'  103.0  135.000000     340.3
3         45  2023/10/04'  109.0  175.000000     282.4
4         45  2023/10/05'  117.0  150.000000     405.1
5         60  2023/10/06'  103.0  125.000000     300.0
6         60  2023/10/07'  110.0  135.000000     374.0
7        400  2023/10/08'  114.0  133.000000       0.0
8         60  2023/10/09'  112.0  126.000000     193.8
9         30  2023/10/10'  102.0  147.000000     234.8
10        60  2023/10/11'  100.0  129.000000     375.3
11        60  2023/10/12'  109.0  131.000000     345.6
12 

In [28]:
# 2. Fix inconsistent date formats and handle missing dates
print("\n=== STEP 2: FIX DATES ===")
print("\nRemoving apostrophes from dates...")
mine_df['Date'] = mine_df['Date'].str.replace("'", "")
print("Dates after apostrophe removal:")
print(mine_df['Date'].head())

print("\nConverting to datetime...")
mine_df['Date'] = pd.to_datetime(mine_df['Date'], errors='coerce')
print("Dates after conversion (NaN means invalid format):")
print(mine_df['Date'].head())

print("\nForward-filling missing dates...")
mine_df['Date'] = mine_df['Date'].ffill()
print("Dates after forward fill:")
print(mine_df['Date'].head())


=== STEP 2: FIX DATES ===

Removing apostrophes from dates...
Dates after apostrophe removal:
0    2023/10/01
1    2023/10/02
2    2023/10/03
3    2023/10/04
4    2023/10/05
Name: Date, dtype: object

Converting to datetime...
Dates after conversion (NaN means invalid format):
0   2023-10-01
1   2023-10-02
2   2023-10-03
3   2023-10-04
4   2023-10-05
Name: Date, dtype: datetime64[ns]

Forward-filling missing dates...
Dates after forward fill:
0   2023-10-01
1   2023-10-02
2   2023-10-03
3   2023-10-04
4   2023-10-05
Name: Date, dtype: datetime64[ns]


In [29]:
# 3. Remove duplicate rows
print("\n=== STEP 3: REMOVE DUPLICATES ===")
print(f"Shape before removing duplicates: {mine_df.shape}")
mine_df = mine_df.drop_duplicates()
print(f"Shape after removing duplicates: {mine_df.shape}")
print("Sample data after duplicate removal:")
print(mine_df.head())


=== STEP 3: REMOVE DUPLICATES ===
Shape before removing duplicates: (31, 5)
Shape after removing duplicates: (31, 5)
Sample data after duplicate removal:
   Duration       Date  Pulse  Maxpulse  Calories
0        60 2023-10-01  110.0     130.0     409.1
1        60 2023-10-02  117.0     145.0     479.0
2        60 2023-10-03  103.0     135.0     340.3
3        45 2023-10-04  109.0     175.0     282.4
4        45 2023-10-05  117.0     150.0     405.1


In [30]:
# 4. Fix wrong data
print("\n=== STEP 4: FIX WRONG DATA ===")
print("\nChecking for cases where Maxpulse < Pulse...")
mask = mine_df['Maxpulse'] < mine_df['Pulse']
print(f"Found {mask.sum()} problematic rows")
if mask.sum() > 0:
    print("Problematic rows before fix:")
    print(mine_df[mask])
    mine_df.loc[mask, 'Maxpulse'] = mine_df.loc[mask, 'Pulse'] + 10
    print("\nAfter fixing Maxpulse < Pulse cases:")
    print(mine_df[mask])


=== STEP 4: FIX WRONG DATA ===

Checking for cases where Maxpulse < Pulse...
Found 13 problematic rows
Problematic rows before fix:
    Duration       Date  Pulse    Maxpulse  Calories
14        60 2023-10-15  120.0  100.000000     240.8
15        60 2023-10-16  115.5  101.000000     243.8
16        60 2023-10-17  127.0  102.000000     380.2
17        45 2023-10-17  142.0  103.000000     241.4
18        60 2023-10-19  151.0  104.000000       0.0
19        60 2023-10-20  162.0  105.000000     300.9
22        60 2023-10-22  130.0  108.000000     230.8
23        60 2023-10-24  134.0  123.241379     239.7
24        60 2023-10-25  132.0  110.000000     236.9
25        60 2023-10-26  135.0  118.000000     278.8
26        60 2023-10-27  137.0  119.000000     212.9
27        60 2023-10-28  138.0  121.000000     345.9
28        60 2023-10-29  139.0  122.000000     345.2

After fixing Maxpulse < Pulse cases:
    Duration       Date  Pulse  Maxpulse  Calories
14        60 2023-10-15  120.0     1

In [32]:

# Final cleaned dataset
print("\n=== FINAL CLEANED DATA ===")
print("Cleaned Mine.csv:")
print(mine_df)
print("\nMissing values count after cleaning:")
print(mine_df.isna().sum())

mine_df.to_csv('Mine_cleaned.csv', index=False)
print("\nSaved cleaned data to 'Mine_cleaned.csv'")


=== FINAL CLEANED DATA ===
Cleaned Mine.csv:
    Duration       Date  Pulse    Maxpulse  Calories
0         60 2023-10-01  110.0  130.000000     409.1
1         60 2023-10-02  117.0  145.000000     479.0
2         60 2023-10-03  103.0  135.000000     340.3
3         45 2023-10-04  109.0  175.000000     282.4
4         45 2023-10-05  117.0  150.000000     405.1
5         60 2023-10-06  103.0  125.000000     300.0
6         60 2023-10-07  110.0  135.000000     374.0
7        400 2023-10-08  114.0  133.000000       0.0
8         60 2023-10-09  112.0  126.000000     193.8
9         30 2023-10-10  102.0  147.000000     234.8
10        60 2023-10-11  100.0  129.000000     375.3
11        60 2023-10-12  109.0  131.000000     345.6
12        60 2023-10-13  103.0  136.000000     239.2
13        60 2023-10-15  120.0  123.241379     240.8
14        60 2023-10-15  120.0  130.000000     240.8
15        60 2023-10-16  115.5  125.500000     243.8
16        60 2023-10-17  127.0  137.000000     380.2
