# Linear Regression with Bedrooms as the dependant Variables

These are the dependant variables which i think would be interesting to look into:

**LIVING_AREA**: The size of the living area may be predictive of the number of bedrooms.

**LAND_SF**: The size of the land could also influence the number of bedrooms a property can support.

**TOTAL_VALUE**: The total value of the property might be related to the number of bedrooms, as larger homes tend to be more valuable.

**GROSS_TAX**: Maybe the amount a property is taxed affects the number of bedrooms

**GROSS_AREA**: Similar to living area, but this includes all space within the building(s), which could correlate with bedrooms.

**HEAT_TYPE and AC_TYPE**: These might be proxies for the overall quality or luxury level of a property, which could in turn be related to the number of bedrooms.

**EXT_FIN**: The exterior finish of the building after a renovation may correlate to bedrooms 

**KITCHEN_TYP**: The type of kitchen may also influence bedrooms

**FPLACE**: Similarly, the number of fireplaces may indicate the luxury of certain properties.

**OVERALL_COND**: The overall condition of the property could influence bedroom count if properties in better condition are more likely to have been expanded or improved.

**ROOF_STRUCTURE and ROOF_COVER**: These might not directly influence the number of bedrooms, but they could be related to other property improvements that coincide with bedroom changes.

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [18]:
# pa_2014 = pd.read_csv('../data/property-assessment-fy2014.csv')[['Parcel_ID','LU', 'AV_TOTAL', 'GROSS_AREA', 'LIVING_AREA', 'R_BDRMS', 'YR_REMOD', 'LAND_SF', 'R_ROOF_TYP', 'R_EXT_FIN', 'R_KITCH', 'R_HEAT_TYP', 'R_AC', 'R_FPLACE', 'R_OVRALL_CND']] 


# pa_2015 = pd.read_csv('../data/property_assessment_2015.csv')[['PID','LU', 'AV_TOTAL', 'GROSS_AREA', 'LIVING_AREA', 'R_BDRMS', 'YR_REMOD',  'LAND_SF', 'R_ROOF_TYP', 'R_EXT_FIN', 'R_KITCH', 'R_HEAT_TYP', 'R_AC', 'R_FPLACE']] 
# pa_2016 = pd.read_csv('../data/property_assessment_2016.csv')[['PID','LU', 'AV_TOTAL', 'GROSS_AREA', 'LIVING_AREA', 'R_BDRMS', 'YR_REMOD',  'LAND_SF', 'R_ROOF_TYP', 'R_EXT_FIN', 'R_KITCH', 'R_HEAT_TYP', 'R_AC', 'R_FPLACE']] 
# pa_2017 = pd.read_csv('../data/property_assessment_2017.csv')[['PID','LU', 'AV_TOTAL', 'GROSS_AREA', 'LIVING_AREA', 'R_BDRMS', 'YR_REMOD', 'LAND_SF', 'R_ROOF_TYP', 'R_EXT_FIN', 'R_KITCH', 'R_HEAT_TYP', 'R_AC', 'R_FPLACE']] 
# pa_2018 = pd.read_csv('../data/property_assessment_2018.csv')[['PID','LU', 'AV_TOTAL', 'GROSS_AREA', 'LIVING_AREA', 'R_BDRMS', 'YR_REMOD', 'LAND_SF', 'R_ROOF_TYP', 'R_EXT_FIN', 'R_KITCH', 'R_HEAT_TYP', 'R_AC', 'R_FPLACE']] 
# pa_2019 = pd.read_csv('../data/property_assessment_2019.csv')[['PID','LU', 'AV_TOTAL', 'GROSS_AREA', 'LIVING_AREA', 'R_BDRMS', 'YR_REMOD', 'LAND_SF', 'R_ROOF_TYP', 'R_EXT_FIN', 'R_KITCH', 'R_HEAT_TYP', 'R_AC', 'R_FPLACE']] 
pa_2020 = pd.read_csv('../data/property_assessment_2020.csv')[['PID','LU', 'AV_TOTAL', 'GROSS_AREA', 'LIVING_AREA', 'R_BDRMS', 'YR_REMOD', 'LAND_SF', 'R_ROOF_TYP', 'R_EXT_FIN', 'R_KITCH', 'R_HEAT_TYP', 'R_AC', 'R_FPLACE']] 

pa_2021 = pd.read_csv('../data/property_assessment_2021.csv')[['PID','LU', 'TOTAL_VALUE', 'GROSS_AREA', 'LIVING_AREA', 'BED_RMS', 'YR_REMODEL', 'LAND_SF', 'ROOF_STRUCTURE', 'ROOF_COVER', 'INT_WALL', 'EXT_FINISHED', 'OVERALL_COND', 'FULL_BTH', 'HLF_BTH', 'KITCHEN_TYPE', 'HEAT_TYPE', 'AC_TYPE', 'FIRE_PLACE', 'GROSS_TAX']]  
# AV_Total is now TOTAL_VALUE, R_BDRMS is now BED_RMS

pa_2022 = pd.read_csv('../data/property_assessment_2022.csv')[['PID','LU', 'TOTAL_VALUE', 'GROSS_AREA', 'LIVING_AREA', 'BED_RMS', 'YR_REMODEL', 'LAND_SF', 'ROOF_STRUCTURE', 'ROOF_COVER', 'INT_WALL', 'EXT_FINISHED', 'OVERALL_COND', 'FULL_BTH', 'HLF_BTH', 'KITCHEN_TYPE', 'HEAT_TYPE', 'AC_TYPE', 'FIRE_PLACE', 'GROSS_TAX']]   
pa_2023 = pd.read_csv('../data/property_assessment_2023.csv')[['PID','LU', 'TOTAL_VALUE', 'GROSS_AREA', 'LIVING_AREA', 'BED_RMS', 'YR_REMODEL', 'LAND_SF', 'ROOF_STRUCTURE', 'ROOF_COVER', 'INT_WALL', 'EXT_FNISHED', 'OVERALL_COND', 'FULL_BTH', 'HLF_BTH', 'KITCHEN_TYPE', 'HEAT_TYPE', 'AC_TYPE', 'FIREPLACES', 'GROSS_TAX']]  
pa_2024 = pd.read_csv('../data/property_assessment_2024.csv')[['PID','LU', 'TOTAL_VALUE', 'GROSS_AREA', 'LIVING_AREA', 'BED_RMS', 'YR_REMODEL', 'LAND_SF', 'ROOF_STRUCTURE', 'ROOF_COVER', 'INT_WALL', 'EXT_FNISHED', 'OVERALL_COND', 'FULL_BTH', 'HLF_BTH', 'KITCHEN_TYPE', 'HEAT_TYPE', 'AC_TYPE', 'FIREPLACES', 'GROSS_TAX']]   

  pa_2024 = pd.read_csv('../data/property_assessment_2024.csv')[['PID','LU', 'TOTAL_VALUE', 'GROSS_AREA', 'LIVING_AREA', 'BED_RMS', 'YR_REMODEL', 'LAND_SF', 'ROOF_STRUCTURE', 'ROOF_COVER', 'INT_WALL', 'EXT_FNISHED', 'OVERALL_COND', 'FULL_BTH', 'HLF_BTH', 'KITCHEN_TYPE', 'HEAT_TYPE', 'AC_TYPE', 'FIREPLACES', 'GROSS_TAX']]


In [19]:
# Store the DataFrames in a dictionary for easier access
pa_data = {
    2021: pa_2021,
    2022: pa_2022,
    2023: pa_2023,
    2024: pa_2024,
}

LU_filtered_pa_data = {}

residential_codes = ['R1', 'R2', 'R3', 'RC', 'R4', 'A', 'CD']

# Loop through each year's DataFrame and filter it
for year, df in pa_data.items():
    LU_filtered_pa_data[year] = df[df['LU'].isin(residential_codes)]

In [20]:

average_bedrooms_per_year = {}
filtered_bedroom_dfs = {}
years_where_bedrooms_name_changed = [2021, 2022, 2023, 2024]

for year, df in pa_data.items():
    if year in years_where_bedrooms_name_changed:

        # Drop rows where 'R_BDRMS' is NaN
        df_filtered = df.dropna(subset=['BED_RMS'])
        filtered_bedroom_dfs[year] = df_filtered
        # Calculate the mean of 'R_BDRMS'
        average_bedrooms = df_filtered['BED_RMS'].mean()
        # Store the average in the dictionary
        average_bedrooms_per_year[year] = average_bedrooms
    else:
        # Drop rows where 'R_BDRMS' is NaN
        df_filtered = df.dropna(subset=['R_BDRMS'])
        filtered_bedroom_dfs[year] = df_filtered
        # Calculate the mean of 'R_BDRMS'
        average_bedrooms = df_filtered['R_BDRMS'].mean()
        # Store the average in the dictionary
        average_bedrooms_per_year[year] = average_bedrooms

In [21]:
data_2022 = pa_data[2022]

# Drop rows where 'R_BDRMS' is NaN
data_2022_filtered = data_2022.dropna(subset=['BED_RMS'])

# Drop rows where 'R_BDRMS' is 0
data_2022_filtered_no_zeroes = data_2022_filtered[data_2022_filtered['BED_RMS'] != 0]

# Store DF in dictionary
filtered_bedroom_dfs[2022] = data_2022_filtered_no_zeroes

In [22]:
# Create a list to store the modified DataFrames for each year
bedroom_data_with_years = []

# Iterate over each year and add a 'DATA_YEAR' column to the corresponding DataFrame
for year in range(2021, 2025):
    df = filtered_bedroom_dfs[year].copy()
    df['DATA_YEAR'] = year
    bedroom_data_with_years.append(df)

# Concatenate the modified DataFrames
all_years_filtered_bedroom_data = pd.concat(bedroom_data_with_years, ignore_index=True)

In [23]:
# Group the data by PID
sorted_by_pid = all_years_filtered_bedroom_data.sort_values(by=['PID', 'DATA_YEAR'])
grouped_bedroom_data = sorted_by_pid.groupby('PID')

In [25]:
# Define a new DataFrame for the before and after renovation data
renovation_comparison = pd.DataFrame()

In [26]:
rows_to_append = []  # Initialize an empty list to collect DataFrames

# Loop over each PID and process the groups
for pid, group in grouped_bedroom_data.groupby('PID'):
    # Ensure the group is sorted by year
    group = group.sort_values(by='DATA_YEAR')
    
    # Get all the remodel years for the PID that fall between 2008 and 2023
    remodel_years = group.loc[group['YR_REMODEL'].between(2021, 2023), 'YR_REMODEL'].unique()

    for remodel_year in remodel_years:

        remodel_year_int = int(remodel_year)  # Convert to integer

        # Find the data entry before the renovation
        pre_remodel_data = group[group['DATA_YEAR'] < remodel_year].iloc[-1:]

        # Find the data entry after the renovation
        # We have to loop because the year after the remodelling might not reflect the remodelling so we have to loop forward
        # Initialize 'after_remodel_data' as empty
        after_remodel_data = pd.DataFrame()
        
        # Loop through the years after the remodel year to find when it was first reported
        for year in range(remodel_year_int, 2024):
            after_data = group[group['DATA_YEAR'] == year]
            # Check if any entry in 'after_data' has the 'YR_REMODELLED' we're looking for
            if not after_data.empty and (after_data['YR_REMODEL'] == remodel_year).any():
                after_remodel_data = after_data[after_data['YR_REMODEL'] == remodel_year].copy()  # Make a copy
                after_remodel_data.loc[:, 'REMODEL_YEAR'] = remodel_year  # Safely assign using .loc
                break

        # Ensure there is both a 'before' and an 'after' entry
        if not pre_remodel_data.empty and not after_remodel_data.empty:

            pre_remodel_data = pre_remodel_data.copy()  # Make a copy
            pre_remodel_data.loc[:, 'REMODEL_YEAR'] = remodel_year  # Safely assign using .loc

            # Add the remodel year for clarity
            after_remodel_data['REMODEL_YEAR'] = remodel_year

            rows_to_append.append(pre_remodel_data)
            rows_to_append.append(after_remodel_data)

# After the loop, concatenate all the rows at once
renovation_comparison = pd.concat(rows_to_append, ignore_index=True)


AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'