In [13]:
import pandas as pd
import numpy as np

# Sample DataFrame for demonstration
data = {
    'Name': ['Alice', 'Bob', 'Charlie', np.nan, 'Alice'],
    'Age': [25, 30, 'Unknown', 22, 25],
    'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'bob@example.com', 'alice@example.com'],
    'DateOfBirth': ['1999-01-01', 'not available', '2000-02-29', '1998-12-31', np.nan]
}

# Create DataFrame
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# 1. Cleaning Data (General cleaning process)
print("\nStep 1: General Data Cleaning")

# Drop the 'Email' column
print("\nStep 2: Dropping 'Email' Column")
df.drop(columns=['Email'], inplace=True)

# 2. Cleaning Empty Cells
print("\nStep 3: Cleaning Empty Cells")
# Fill empty cells with placeholders
df['Name'].fillna('Unknown', inplace=True)
df['DateOfBirth'].fillna('1900-01-01', inplace=True)  # Default date

# 3. Cleaning Wrong Format
print("\nStep 4: Cleaning Wrong Format")
# Convert 'Age' column to numeric, coercing errors to NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Convert 'DateOfBirth' column to datetime, invalid parsing will be set as NaT
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce')

# 4. Cleaning Wrong Data
print("\nStep 5: Cleaning Wrong Data")
# Drop rows where 'Age' or 'DateOfBirth' are NaN after conversion
df.dropna(subset=['Age', 'DateOfBirth'], inplace=True)

# 5. Removing Duplicates
print("\nStep 6: Removing Duplicates")
df.drop_duplicates(inplace=True)

print("\nCleaned DataFrame:")
print(df)
 


Original DataFrame:
      Name      Age                Email    DateOfBirth
0    Alice       25    alice@example.com     1999-01-01
1      Bob       30      bob@example.com  not available
2  Charlie  Unknown  charlie@example.com     2000-02-29
3      NaN       22      bob@example.com     1998-12-31
4    Alice       25    alice@example.com            NaN

Step 1: General Data Cleaning

Step 2: Dropping 'Email' Column

Step 3: Cleaning Empty Cells

Step 4: Cleaning Wrong Format

Step 5: Cleaning Wrong Data

Step 6: Removing Duplicates

Cleaned DataFrame:
      Name   Age DateOfBirth
0    Alice  25.0  1999-01-01
3  Unknown  22.0  1998-12-31
4    Alice  25.0  1900-01-01


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Name'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DateOfBirth'].fillna('1900-01-01', inplace=True)  # Default date


In [3]:
#find corelation


import pandas as pd

df = pd.read_csv('data.csv')

print(df.corr())


          Duration     Pulse  Maxpulse  Calories
Duration  1.000000 -0.155408  0.009403  0.922717
Pulse    -0.155408  1.000000  0.786535  0.025121
Maxpulse  0.009403  0.786535  1.000000  0.203813
Calories  0.922717  0.025121  0.203813  1.000000


In [4]:


#remove duplicateimport pandas as pd

df = pd.read_csv('data.csv')

print(df.duplicated())


0      False
1      False
2      False
3      False
4      False
       ...  
164    False
165    False
166    False
167    False
168    False
Length: 169, dtype: bool


In [5]:
#wrong data
import pandas as pd

df = pd.read_csv('data.csv')

df.loc[7,'Duration'] = 45

print(df.to_string())


     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

In [9]:



#replace on wrong
import pandas as pd

df = pd.read_csv('data.csv')

for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.loc[x, "Duration"] = 120

print(df.to_string())


     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   