In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# File to Load
data_to_load = Path("Resources/Real_Estate_data.csv")

# Read Real Estate Data File and store into Pandas DataFrames
real_estate_data_df = pd.read_csv(data_to_load, low_memory=False)

# view the data
real_estate_data_df

Unnamed: 0,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Location,Index,Longitude,Latitude
0,2001,3/14/2002,East Haven,111 PROSPECT RD,84630,10000,8.463000,,,POINT (-72.87317 41.27318),130157,-72.8731,41.2731
1,2001,10/3/2001,Clinton,1 W WOODS DR,137100,260000,0.527308,,,POINT (-72.50679 41.28246),130158,-72.5067,41.2824
2,2001,10/31/2001,Middletown,148-150 WASHINGTON ST,131110,230000,0.570043,,,POINT (-72.65351 41.56143),130159,-72.6535,41.5614
3,2001,9/27/2002,Danbury,2 QUAIL RUN DR,200100,435000,0.460000,,,POINT (-73.50208 41.3951),130160,-73.5020,41.3951
4,2001,7/18/2002,Hartford,104-106 EDWARDS ST,61810,19500,3.169744,,,POINT (-72.68094 41.7755),130161,-72.6809,41.7755
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290539,2022,10/11/2022,Stamford,193 SADDLE HILL ROAD,483380,865000,0.558800,Residential,Single Family,POINT (-73.577612999 41.148977983),130152,-73.5776,41.1489
290540,2022,9/29/2023,Wethersfield,37 LUCA LN,20650,760857,0.027100,Residential,Single Family,POINT (-72.663607 41.712487),130153,-72.6636,41.7124
290541,2022,1/9/2023,Stamford,1096 EAST MAIN STREET #16-D-1,132900,220000,0.604000,Residential,Condo,POINT (-73.515726977 41.057837988),130154,-73.5157,41.0578
290542,2022,9/26/2023,Old Saybrook,115 SHEFFIELD ST,1099400,1575000,0.698000,Residential,Single Family,POINT (-72.368005967 41.289124997),130155,-72.3680,41.2891


In [2]:
real_estate_data_df['List Year']

0         2001
1         2001
2         2001
3         2001
4         2001
          ... 
290539    2022
290540    2022
290541    2022
290542    2022
290543    2022
Name: List Year, Length: 290544, dtype: int64

In [3]:
# Filter out rows where the Year is between 2001 and 2005 (inclusive)
df_filtered = real_estate_data_df[~real_estate_data_df['List Year'].between(2001, 2005)]

In [4]:
df_filtered.head()

Unnamed: 0,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Location,Index,Longitude,Latitude
63457,2006,1/2/2007,New Britain,205 BUELL ST,66360,190000,0.349263,Single Family,Single Family,POINT (-72.783 41.65351),190584,-72.783,41.6535
63458,2006,7/27/2007,New Britain,30 SEFTON DR,62580,155000,0.403742,Single Family,Single Family,POINT (-72.76799 41.68673),190587,-72.7679,41.6867
63459,2006,10/5/2006,Fairfield,255 OLD DAM RD,412090,410000,1.005098,Single Family,Single Family,POINT (-73.26004 41.12223),190589,-73.26,41.1222
63460,2006,1/17/2007,East Windsor,241 S WATER ST UT 20,76480,211500,0.361608,Condo,Condo,POINT (-72.62132 41.93316),190598,-72.6213,41.9331
63461,2006,7/6/2007,Bristol,42 MATILDA DR,69220,165000,0.419515,Single Family,Single Family,POINT (-72.91069 41.69177),190601,-72.9106,41.6917


In [57]:
df_filtered["List Year"]

63457     2006
63458     2006
63459     2006
63460     2006
63461     2006
          ... 
290539    2022
290540    2022
290541    2022
290542    2022
290543    2022
Name: List Year, Length: 227087, dtype: int64

In [58]:
df_filtered["Property Type"].unique()

array(['Single Family', 'Condo', nan, 'Two Family', 'Three Family',
       'Four Family', 'Residential', 'Vacant Land', 'Commercial',
       'Apartments', 'Industrial', 'Public Utility'], dtype=object)

In [59]:
df_filtered_1 = df_filtered.dropna(subset=['Property Type'])
df_filtered_1["Property Type"].unique()

array(['Single Family', 'Condo', 'Two Family', 'Three Family',
       'Four Family', 'Residential', 'Vacant Land', 'Commercial',
       'Apartments', 'Industrial', 'Public Utility'], dtype=object)

In [60]:
df_filtered_1["Residential Type"].unique()

array(['Single Family', 'Condo', 'Two Family', 'Three Family',
       'Four Family', nan], dtype=object)

In [61]:
df_filtered_2 = df_filtered_1.dropna(subset=['Residential Type'])
df_filtered_2["Residential Type"].unique()

array(['Single Family', 'Condo', 'Two Family', 'Three Family',
       'Four Family'], dtype=object)

In [62]:
df_filtered_2["Longitude"].unique()

array(['-72.783', '-72.7679', '-73.26', ..., '-72.1598', '-71.9097',
       '-71.8258'], dtype=object)

In [63]:
property_counts = df_filtered_2["Residential Type"].value_counts
property_counts

<bound method IndexOpsMixin.value_counts of 63457     Single Family
63458     Single Family
63459     Single Family
63460             Condo
63461     Single Family
              ...      
290539    Single Family
290540    Single Family
290541            Condo
290542    Single Family
290543    Single Family
Name: Residential Type, Length: 207045, dtype: object>

In [71]:
clean_df = pd.DataFrame(df_filtered_2)
clean_df

Unnamed: 0,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Location,Index,Longitude,Latitude
63457,2006,1/2/2007,New Britain,205 BUELL ST,66360,190000,0.349263,Single Family,Single Family,POINT (-72.783 41.65351),190584,-72.783,41.6535
63458,2006,7/27/2007,New Britain,30 SEFTON DR,62580,155000,0.403742,Single Family,Single Family,POINT (-72.76799 41.68673),190587,-72.7679,41.6867
63459,2006,10/5/2006,Fairfield,255 OLD DAM RD,412090,410000,1.005098,Single Family,Single Family,POINT (-73.26004 41.12223),190589,-73.26,41.1222
63460,2006,1/17/2007,East Windsor,241 S WATER ST UT 20,76480,211500,0.361608,Condo,Condo,POINT (-72.62132 41.93316),190598,-72.6213,41.9331
63461,2006,7/6/2007,Bristol,42 MATILDA DR,69220,165000,0.419515,Single Family,Single Family,POINT (-72.91069 41.69177),190601,-72.9106,41.6917
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290539,2022,10/11/2022,Stamford,193 SADDLE HILL ROAD,483380,865000,0.558800,Residential,Single Family,POINT (-73.577612999 41.148977983),130152,-73.5776,41.1489
290540,2022,9/29/2023,Wethersfield,37 LUCA LN,20650,760857,0.027100,Residential,Single Family,POINT (-72.663607 41.712487),130153,-72.6636,41.7124
290541,2022,1/9/2023,Stamford,1096 EAST MAIN STREET #16-D-1,132900,220000,0.604000,Residential,Condo,POINT (-73.515726977 41.057837988),130154,-73.5157,41.0578
290542,2022,9/26/2023,Old Saybrook,115 SHEFFIELD ST,1099400,1575000,0.698000,Residential,Single Family,POINT (-72.368005967 41.289124997),130155,-72.368,41.2891


In [72]:
rs_data_df = pd.DataFrame(clean_df)
rs_data_df

Unnamed: 0,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Location,Index,Longitude,Latitude
63457,2006,1/2/2007,New Britain,205 BUELL ST,66360,190000,0.349263,Single Family,Single Family,POINT (-72.783 41.65351),190584,-72.783,41.6535
63458,2006,7/27/2007,New Britain,30 SEFTON DR,62580,155000,0.403742,Single Family,Single Family,POINT (-72.76799 41.68673),190587,-72.7679,41.6867
63459,2006,10/5/2006,Fairfield,255 OLD DAM RD,412090,410000,1.005098,Single Family,Single Family,POINT (-73.26004 41.12223),190589,-73.26,41.1222
63460,2006,1/17/2007,East Windsor,241 S WATER ST UT 20,76480,211500,0.361608,Condo,Condo,POINT (-72.62132 41.93316),190598,-72.6213,41.9331
63461,2006,7/6/2007,Bristol,42 MATILDA DR,69220,165000,0.419515,Single Family,Single Family,POINT (-72.91069 41.69177),190601,-72.9106,41.6917
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290539,2022,10/11/2022,Stamford,193 SADDLE HILL ROAD,483380,865000,0.558800,Residential,Single Family,POINT (-73.577612999 41.148977983),130152,-73.5776,41.1489
290540,2022,9/29/2023,Wethersfield,37 LUCA LN,20650,760857,0.027100,Residential,Single Family,POINT (-72.663607 41.712487),130153,-72.6636,41.7124
290541,2022,1/9/2023,Stamford,1096 EAST MAIN STREET #16-D-1,132900,220000,0.604000,Residential,Condo,POINT (-73.515726977 41.057837988),130154,-73.5157,41.0578
290542,2022,9/26/2023,Old Saybrook,115 SHEFFIELD ST,1099400,1575000,0.698000,Residential,Single Family,POINT (-72.368005967 41.289124997),130155,-72.368,41.2891


In [73]:
first_col = rs_data_df.pop('Index')
rs_data_df.insert(0, 'Index', first_col)
rs_data_df

Unnamed: 0,Index,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Location,Longitude,Latitude
63457,190584,2006,1/2/2007,New Britain,205 BUELL ST,66360,190000,0.349263,Single Family,Single Family,POINT (-72.783 41.65351),-72.783,41.6535
63458,190587,2006,7/27/2007,New Britain,30 SEFTON DR,62580,155000,0.403742,Single Family,Single Family,POINT (-72.76799 41.68673),-72.7679,41.6867
63459,190589,2006,10/5/2006,Fairfield,255 OLD DAM RD,412090,410000,1.005098,Single Family,Single Family,POINT (-73.26004 41.12223),-73.26,41.1222
63460,190598,2006,1/17/2007,East Windsor,241 S WATER ST UT 20,76480,211500,0.361608,Condo,Condo,POINT (-72.62132 41.93316),-72.6213,41.9331
63461,190601,2006,7/6/2007,Bristol,42 MATILDA DR,69220,165000,0.419515,Single Family,Single Family,POINT (-72.91069 41.69177),-72.9106,41.6917
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290539,130152,2022,10/11/2022,Stamford,193 SADDLE HILL ROAD,483380,865000,0.558800,Residential,Single Family,POINT (-73.577612999 41.148977983),-73.5776,41.1489
290540,130153,2022,9/29/2023,Wethersfield,37 LUCA LN,20650,760857,0.027100,Residential,Single Family,POINT (-72.663607 41.712487),-72.6636,41.7124
290541,130154,2022,1/9/2023,Stamford,1096 EAST MAIN STREET #16-D-1,132900,220000,0.604000,Residential,Condo,POINT (-73.515726977 41.057837988),-73.5157,41.0578
290542,130155,2022,9/26/2023,Old Saybrook,115 SHEFFIELD ST,1099400,1575000,0.698000,Residential,Single Family,POINT (-72.368005967 41.289124997),-72.368,41.2891


In [None]:
#Analysis on property type

In [None]:
#Analysis on residential type

In [None]:
#Analysis by state

In [None]:
#Analysis by year

In [None]:
#Analysis by price

In [None]:
#Future prediction (mean, median, mode)

In [None]:
#Visualize data/trend for house prices

In [None]:
#Analysis and comments/insights