# Import libraries

In [2]:
import pandas as pd
import numpy as np

# --- 1. Load the data ---

In [5]:
# Read the raw CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="./Data/housePrice.csv")

# Banner first, then pandas' own summary (dtype and non-null count per column).
print("\n--- Initial Data Info ---")
df.info()


--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3479 non-null   object 
 1   Room        3479 non-null   int64  
 2   Parking     3479 non-null   bool   
 3   Warehouse   3479 non-null   bool   
 4   Elevator    3479 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3479 non-null   float64
 7   Price(USD)  3479 non-null   float64
dtypes: bool(3), float64(2), int64(1), object(2)
memory usage: 146.2+ KB


In [6]:
# Display the column labels of the loaded frame.
df.columns

Index(['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address', 'Price',
       'Price(USD)'],
      dtype='object')

# --- 2. Standardize and Clean Columns ---

In [7]:
# 'Area', 'Price' and 'Price(USD)' may arrive as text containing formatting
# characters such as thousands separators. The helper below strips everything
# except digits and the decimal point and converts the result to a numeric dtype.

def clean_and_convert(column, frame=None):
    """Convert one column to a numeric dtype, stripping non-numeric characters.

    Columns that are already numeric are left untouched: round-tripping them
    through ``str`` would corrupt any value whose text form contains an
    exponent or a sign (``1e+16`` would become ``116``, ``-3.0`` would
    become ``3.0``).

    Parameters
    ----------
    column : str
        Name of the column to clean.
    frame : pandas.DataFrame, optional
        Frame to clean in place. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        The frame that was cleaned (modified in place). If ``column`` is not
        present, a warning is printed and the frame is returned unchanged.

    Notes
    -----
    For non-numeric columns every character other than a digit or ``.`` is
    removed, so a leading minus sign is dropped. Entries that still cannot be
    parsed afterwards (including missing values) become NaN.
    """
    target = df if frame is None else frame

    # Guard clause: report a missing column instead of raising KeyError.
    if column not in target.columns:
        print(f"Warning: Column '{column}' not found in the dataset.")
        return target

    # Already numeric: nothing to strip, and a str round-trip could corrupt it.
    if pd.api.types.is_numeric_dtype(target[column]):
        return target

    # Keep only digits and dots, then let pandas parse; failures become NaN.
    stripped = target[column].astype(str).str.replace(r'[^\d.]', '', regex=True)
    target[column] = pd.to_numeric(stripped, errors='coerce')
    return target

# Run the numeric clean-up over each column expected to hold numbers,
# rebinding `df` to the frame the helper hands back each time.
for _numeric_column in ('Area', 'Price', 'Price(USD)'):
    df = clean_and_convert(_numeric_column)

# --- 3. Handle Missing Values ---

In [9]:
# Missing values are handled in two sequential steps (both run, in this order):
#   1. Numeric columns: fill gaps with the column mean so those rows are kept.
#   2. Remaining columns (e.g. 'Address'): drop any row that still has a gap.
# To use a different strategy, replace a step rather than appending another one:
# a fillna(0) placed after dropna() can never fill anything, because no missing
# values are left by then.

# Step 1: mean imputation for the numeric columns.
for _column in ('Area', 'Price', 'Price(USD)'):
    df[_column] = df[_column].fillna(df[_column].mean())

# Step 2: drop rows that still contain a missing value in any column.
df.dropna(inplace=True)

# --- 4. Remove Duplicate Rows ---

In [10]:
# Remove exact duplicate rows in place and report how many were discarded.
initial_rows = df.shape[0]
df.drop_duplicates(inplace=True)
dropped_rows = initial_rows - df.shape[0]
print(
    "\n--- Duplicates Dropped ---",
    f"Number of duplicate rows found and removed: {dropped_rows}",
    sep="\n",
)


--- Duplicates Dropped ---
Number of duplicate rows found and removed: 208


# --- 5. Clean and Standardize Categorical Data (e.g., 'Address') ---

In [11]:
# Optional text standardization: trim surrounding whitespace and convert the
# address to Title Case.
if 'Address' in df.columns:
    # Only rewrite rows that actually have an address. Casting a missing value
    # with astype(str) would turn it into the literal text 'nan' ('Nan' after
    # title()), which would no longer be recognised as missing.
    has_address = df['Address'].notna()
    df.loc[has_address, 'Address'] = (
        df.loc[has_address, 'Address'].astype(str).str.strip().str.title()
    )
else:
    print("Warning: 'Address' column not found. Skipping text standardization.")

# --- 6. Final check and Save the cleaned data ---

In [12]:
# Print a banner, then the per-column dtype / non-null summary of the cleaned frame.
print("\n--- Cleaned Data Info ---")
df.info()


--- Cleaned Data Info ---
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3248 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3248 non-null   int64  
 1   Room        3248 non-null   int64  
 2   Parking     3248 non-null   bool   
 3   Warehouse   3248 non-null   bool   
 4   Elevator    3248 non-null   bool   
 5   Address     3248 non-null   object 
 6   Price       3248 non-null   float64
 7   Price(USD)  3248 non-null   float64
dtypes: bool(3), float64(2), int64(2), object(1)
memory usage: 161.8+ KB


# Save the cleaned data to a new CSV file

In [13]:
# Write the cleaned frame to CSV without the index column; edit the path to
# save it somewhere else.
df.to_csv(path_or_buf='./Data/cleaned_housePrice.csv', index=False)
print("\nData cleaning complete! The cleaned data has been saved to 'cleaned_housePrice.csv'.")


Data cleaning complete! The cleaned data has been saved to 'cleaned_housePrice.csv'.
