In [3]:
import pandas as pd
import numpy as np

# Task 1: Import the dataset and clean column names
# Load dataset
df = pd.read_csv('RealEstate_Prices (1).csv')

# Clean column names: remove spaces, special characters, and rename columns
df.columns = df.columns.str.strip().str.replace(' ', '_').str.replace(r'[^\w]', '')

# Display the first few rows to verify
print(df.head())

# Task 2: Handle missing values
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

# Handling missing values (example: filling with mean for numeric columns)
df.fillna(df.mean(), inplace=True)

# Alternatively, you can drop rows with missing values
# df.dropna(inplace=True)

# Task 3: Data merging (if additional datasets are available)
# For demonstration purposes, assuming another dataset "Neighborhood_Demographics.csv" exists
# neighborhood_df = pd.read_csv('Neighborhood_Demographics.csv')

# Merging the datasets based on a common column (example: 'Neighborhood_ID')
# df = pd.merge(df, neighborhood_df, on='Neighborhood_ID', how='left')

# Task 4: Filter and subset the data
# Example: filter for properties sold after 2010 in a specific city (e.g., 'CityName')
df_filtered = df[(df['SaleYear'] > 2010) & (df['City'] == 'CityName')]

# Task 5: Handle categorical variables
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Apply one-hot encoding (if necessary)
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Task 6: Aggregate data
# Example: Calculate average sale price by neighborhood
avg_price_by_neighborhood = df.groupby('Neighborhood')['SalePrice'].mean()
print("Average Sale Price by Neighborhood:\n", avg_price_by_neighborhood)

# Task 7: Identify and handle outliers
# Example: Use z-score to detect outliers in the 'SalePrice' column
from scipy import stats

z_scores = np.abs(stats.zscore(df['SalePrice']))
outliers = df[z_scores > 3]

# Optionally remove outliers
df_no_outliers = df[z_scores <= 3]

# Save the cleaned and wrangled dataset
df_no_outliers.to_csv('Cleaned_RealEstate_Prices.csv', index=False)

   Property_ID Property_Type  Location  Bedrooms  Bathrooms  Square_Feet  \
0            1         House  Downtown         2          3         1148   
1            2     Apartment    Suburb         5          1         2663   
2            3         House  Downtown         4          3         2957   
3            4         House     Rural         4          3         2721   
4            5     Apartment  Downtown         2          1         2780   

   Year_Built  Sale_Price    Neighborhood  Amenities_Distance_Miles  
0      2009.0    807870.0  Neighborhood A                  4.745561  
1      2019.0    776389.0  Neighborhood C                  1.453618  
2         NaN         NaN  Neighborhood A                  2.480775  
3      2001.0    186148.0  Neighborhood C                  0.141468  
4      2009.0    405378.0  Neighborhood B                  0.305964  
Missing values per column:
 Property_ID                  0
Property_Type                0
Location                     0
Be

TypeError: Could not convert ['HouseApartmentHouseHouseApartmentCondoCondoHouseCondoHouseHouseApartmentHouseApartmentHouseApartmentHouseCondoHouseHouseCondoCondoApartmentHouseCondoHouseApartmentHouseCondoHouseHouseApartmentHouseCondoCondoHouseCondoHouseHouseApartmentHouseHouseCondoApartmentApartmentHouseHouseHouseApartmentApartmentApartmentHouseApartmentHouseCondoCondoHouseHouseHouseCondoApartmentCondoHouseCondoApartmentApartmentApartmentCondoHouseHouseApartmentApartmentCondoCondoCondoHouseHouseApartmentApartmentApartmentApartmentCondoHouseCondoCondoApartmentHouseCondoCondoHouseHouseApartmentApartmentCondoApartmentCondoApartmentCondoCondoCondo'
 'DowntownSuburbDowntownRuralDowntownSuburbSuburbRuralSuburbDowntownSuburbDowntownRuralRuralRuralDowntownRuralRuralRuralRuralSuburbRuralDowntownDowntownRuralRuralRuralRuralRuralSuburbSuburbRuralDowntownRuralRuralSuburbRuralSuburbDowntownSuburbSuburbRuralSuburbRuralRuralSuburbRuralRuralDowntownSuburbRuralRuralRuralSuburbDowntownDowntownRuralSuburbSuburbRuralRuralRuralDowntownSuburbRuralSuburbRuralSuburbSuburbSuburbSuburbSuburbRuralDowntownDowntownRuralDowntownDowntownSuburbSuburbDowntownDowntownSuburbRuralSuburbRuralSuburbSuburbRuralRuralRuralRuralDowntownSuburbDowntownSuburbRuralRuralDowntownDowntown'
 'Neighborhood ANeighborhood CNeighborhood ANeighborhood CNeighborhood BNeighborhood ANeighborhood CNeighborhood ANeighborhood ANeighborhood ANeighborhood ANeighborhood CNeighborhood BNeighborhood CNeighborhood ANeighborhood CNeighborhood BNeighborhood ANeighborhood ANeighborhood CNeighborhood BNeighborhood CNeighborhood CNeighborhood ANeighborhood ANeighborhood ANeighborhood ANeighborhood BNeighborhood BNeighborhood CNeighborhood CNeighborhood BNeighborhood BNeighborhood CNeighborhood BNeighborhood CNeighborhood ANeighborhood ANeighborhood CNeighborhood CNeighborhood CNeighborhood CNeighborhood CNeighborhood CNeighborhood BNeighborhood BNeighborhood ANeighborhood ANeighborhood BNeighborhood BNeighborhood ANeighborhood CNeighborhood BNeighborhood ANeighborhood CNeighborhood ANeighborhood BNeighborhood CNeighborhood ANeighborhood ANeighborhood CNeighborhood BNeighborhood ANeighborhood ANeighborhood ANeighborhood BNeighborhood ANeighborhood ANeighborhood BNeighborhood BNeighborhood ANeighborhood BNeighborhood ANeighborhood BNeighborhood BNeighborhood BNeighborhood BNeighborhood BNeighborhood CNeighborhood CNeighborhood BNeighborhood ANeighborhood BNeighborhood CNeighborhood CNeighborhood CNeighborhood BNeighborhood BNeighborhood ANeighborhood ANeighborhood BNeighborhood ANeighborhood CNeighborhood ANeighborhood BNeighborhood ANeighborhood BNeighborhood ANeighborhood BNeighborhood B'] to numeric