## 6. Data Cleaning

- **Handling Missing Data**: `isnull()`, `dropna()`, `fillna()`
- **Replacing Values**: `replace()`
- **Duplicated Data**: `duplicated()`, `drop_duplicates()`

In [1]:
import pandas as pd

# Load the data dictionary from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='data_dictionary.csv')

# Bare last expression so the notebook renders the summary table.
# In the recorded output, `Field` has a lower count (60) than the other
# columns (65), i.e. some of its values are missing.
df.describe()

Unnamed: 0,Table,Field,Description
count,65,60,65
unique,5,40,60
top,patients,Id,Foreign key to the Patient.
freq,23,4,2


In [4]:
import pandas as pd

# Small demo frame: one missing value in 'Age' (row 1) and one in 'City' (row 2).
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, None, 35, 40],
    'City': ['New York', 'Los Angeles', None, 'Houston'],
}
df = pd.DataFrame(data)

# isnull() returns a same-shaped boolean frame: True wherever a value is missing.
missing_values = df.isnull()
print(missing_values)

# Row-wise drop: any row holding at least one missing value is removed,
# so only the fully populated rows (index 0 and 3) survive.
cleaned_df = df.dropna(axis='index')
print(cleaned_df)

# Column-wise drop: any column holding at least one missing value is removed,
# which leaves only 'Name' here. Note that `cleaned_df` is rebound.
cleaned_df = df.dropna(axis='columns')
print(cleaned_df)

    Name    Age   City
0  False  False  False
1  False   True  False
2  False  False   True
3  False  False  False
    Name   Age      City
0  Alice  25.0  New York
3  David  40.0   Houston
      Name
0    Alice
1      Bob
2  Charlie
3    David


In [9]:
# Constant fill: every missing cell becomes 0, in every column -- including
# the text column 'City', as the first printed table shows.
filled_df = df.fillna(value=0)
print(filled_df)

# Forward fill: each gap takes the value from the row above it.
filled_df = df.ffill()
print(filled_df)

# Backward fill: each gap takes the value from the row below it.
filled_df = df.bfill()
print(filled_df)

      Name   Age         City
0    Alice  25.0     New York
1      Bob   0.0  Los Angeles
2  Charlie  35.0            0
3    David  40.0      Houston
      Name   Age         City
0    Alice  25.0     New York
1      Bob  25.0  Los Angeles
2  Charlie  35.0  Los Angeles
3    David  40.0      Houston
      Name   Age         City
0    Alice  25.0     New York
1      Bob  35.0  Los Angeles
2  Charlie  35.0      Houston
3    David  40.0      Houston


In [10]:
# Swap full city names for abbreviations. The mapping is {old value: new value};
# cells that match no key (including missing ones) are left untouched.
replaced_df = df.replace(
    to_replace={
        'New York': 'NYC',
        'Los Angeles': 'LA',
    }
)
print(replaced_df)

      Name   Age     City
0    Alice  25.0      NYC
1      Bob   NaN       LA
2  Charlie  35.0     None
3    David  40.0  Houston


In [17]:
# Demo frame whose last row (index 4) repeats the first row (index 0) exactly.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice'],
    'Age': [25, 30, 35, 40, 25],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'New York'],
}
df = pd.DataFrame(data)

# Boolean Series: True marks a row that is identical to an earlier row.
# keep='first' (the default) leaves the first occurrence flagged False.
duplicates = df.duplicated(keep='first')
print(duplicates)

# Remove the repeated rows, retaining the first occurrence of each.
deduplicated_df = df.drop_duplicates(keep='first')
print(deduplicated_df)


0    False
1    False
2    False
3    False
4     True
dtype: bool
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston


In [16]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the California Housing dataset. as_frame=True makes scikit-learn
# return the data as pandas objects.
housing = fetch_california_housing(as_frame=True)

# No conversion is needed: `housing.frame` is already a DataFrame. In the
# recorded output it holds eight feature columns plus `MedHouseVal`.
df = housing.frame

# Plain-text preview of the first five rows.
print(df.head(n=5))

# Bare last expression so the notebook renders the summary statistics.
df.describe()

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001
