In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [41]:
# read in the weekly housing-market data (tab-separated file)
# the file is looked up on the current user's Desktop rather than under one
# hard-coded home directory, so the cell also runs on other machines/accounts
DATA_PATH = Path.home() / 'Desktop' / 'weekly_housing_market_data_most_recent.tsv000'
df = pd.read_csv(DATA_PATH, sep='\t')

In [42]:
# keep only the rows whose duration is neither '4 weeks' nor '12 weeks'
# (a single membership mask replaces two successive filters)
excluded_durations = ['4 weeks', '12 weeks']
df = df[~df['duration'].isin(excluded_durations)]
df.head()

Unnamed: 0,region_id,region_type_id,region_name,region_type,period_begin,period_end,duration,total_homes_sold,total_homes_sold_yoy,average_homes_sold,...,average_adjustment_average_homes_sold,adjusted_average_homes_sold,average_adjustment_average_new_listings,adjusted_average_new_listings,average_adjustment_pending_sales,adjusted_pending_sales,adjusted_average_homes_delisted_yoy,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy
2,2266,5,"Creek County, OK",county,2017-05-29,2017-06-04,1 weeks,17.0,-0.15,17.0,...,,17.0,,17.0,,7.0,-0.5,-0.15,-0.15,-0.125
5,2380,5,"Crawford County, PA",county,2017-05-29,2017-06-04,1 weeks,4.0,-0.2,4.0,...,,4.0,,6.0,,2.0,,-0.2,-0.142857,1.0
8,2715,5,"Fisher County, TX",county,2017-05-29,2017-06-04,1 weeks,1.0,,1.0,...,,1.0,,,,,,,,
11,2766,5,"Jones County, TX",county,2017-05-29,2017-06-04,1 weeks,3.0,2.0,3.0,...,,3.0,,2.0,,,,2.0,-0.714286,
14,3037,5,"Chesapeake, VA",county,2017-05-29,2017-06-04,1 weeks,125.0,0.302083,125.0,...,,125.0,,113.0,,82.0,0.0,0.302083,-0.017391,0.782609


In [43]:
# number of distinct region names (a missing name, if any, counts as one value)
df['region_name'].nunique(dropna=False)

3859

In [44]:
# the ten region names that appear in the most rows
region_row_counts = df['region_name'].value_counts()
region_row_counts.iloc[:10]

Creek County, OK            328
El Dorado County, CA        328
Calhoun County, AL          328
Surry County, VA            328
Petersburg, VA              328
Columbia County, WI         328
Bemidji, MN metro area      328
Athens, OH metro area       328
Corsicana, TX metro area    328
Kerrville, TX metro area    328
Name: region_name, dtype: int64

In [45]:
# the ten region names that appear in the fewest rows
# (value_counts sorts descending, so they are the last ten entries)
df['region_name'].value_counts().iloc[-10:]

Dundy County, NE        20
Cheyenne County, NE     20
Jackson County, SD      16
Thomas County, NE       12
Wichita County, KS      12
Dawes County, NE        11
Deuel County, NE        10
McPherson County, NE    10
Kenedy County, TX        8
Sheridan County, NE      2
Name: region_name, dtype: int64

In [46]:
# select the rows whose region_name contains the literal text 'Nashville'
# na=False: a missing region_name counts as "no match" instead of producing a
#           NaN in the mask, which boolean indexing would reject
# regex=False: 'Nashville' is plain text, so no regex interpretation is needed
# NOTE(review): every region whose name contains 'Nashville' is selected —
# confirm that only the intended region(s) match
df_nash = df[df['region_name'].str.contains('Nashville', na=False, regex=False)]

In [47]:
# preview the first five rows of the Nashville subset
df_nash.iloc[:5]

Unnamed: 0,region_id,region_type_id,region_name,region_type,period_begin,period_end,duration,total_homes_sold,total_homes_sold_yoy,average_homes_sold,...,average_adjustment_average_homes_sold,adjusted_average_homes_sold,average_adjustment_average_new_listings,adjusted_average_new_listings,average_adjustment_pending_sales,adjusted_pending_sales,adjusted_average_homes_delisted_yoy,adjusted_average_homes_sold_yoy,adjusted_average_new_listings_yoy,adjusted_pending_sales_yoy
18068,34980,-2,"Nashville, TN metro area",metro,2017-01-02,2017-01-08,1 weeks,425.0,-0.05765,425.0,...,,425.0,,694.0,,335.0,-0.18254,-0.05765,-0.02391,0.024465
26364,34980,-2,"Nashville, TN metro area",metro,2017-02-13,2017-02-19,1 weeks,541.0,0.113169,541.0,...,,541.0,,770.0,,392.0,-0.213592,0.113169,-0.027778,0.088889
49919,34980,-2,"Nashville, TN metro area",metro,2017-01-09,2017-01-15,1 weeks,499.0,-0.005976,499.0,...,,499.0,,726.0,,395.0,-0.071429,-0.005976,-0.017591,0.109551
59555,34980,-2,"Nashville, TN metro area",metro,2017-04-10,2017-04-16,1 weeks,617.0,-0.146611,617.0,...,,617.0,,836.0,,543.0,-0.21,-0.146611,-0.212064,-0.109836
71393,34980,-2,"Nashville, TN metro area",metro,2017-04-17,2017-04-23,1 weeks,775.0,0.108727,775.0,...,,775.0,,950.0,,596.0,-0.022727,0.108727,-0.102079,-0.088685


In [49]:
# report the earliest period_begin and the latest period_end in the Nashville subset
# (the columns are read without date parsing, so min/max compare the raw values)
first_period_begin = df_nash['period_begin'].min()
last_period_end = df_nash['period_end'].max()
print(first_period_begin, last_period_end, sep='\n')

2017-01-02
2023-04-16


In [50]:
# select every region whose name contains the literal text 'TN'
# note: this matches metro areas as well as counties (e.g. 'Martin, TN metro area'),
# not counties only
# na=False: a missing region_name counts as "no match" instead of producing a
#           NaN in the mask, which boolean indexing would reject
# regex=False: 'TN' is plain text, so no regex interpretation is needed
df_tn = df[df['region_name'].str.contains('TN', na=False, regex=False)]

In [51]:
# list the distinct region names present in the Tennessee subset
df_tn['region_name'].unique()

array(['Martin, TN metro area', 'Lincoln County, TN',
       'Sullivan County, TN', 'Cookeville, TN metro area',
       'Henderson County, TN', 'Lawrence County, TN', 'Tipton County, TN',
       'Greeneville, TN metro area', 'Cheatham County, TN',
       'Pickett County, TN', 'Union County, TN', 'Newport, TN metro area',
       'Washington County, TN', 'Dyer County, TN', 'Grainger County, TN',
       'Johnson City, TN metro area', 'Morgan County, TN',
       'Hardin County, TN', 'Giles County, TN', 'Moore County, TN',
       'Meigs County, TN', 'Blount County, TN', 'Wilson County, TN',
       'Kingsport, TN metro area', 'Bradley County, TN',
       'Carter County, TN', 'Cumberland County, TN',
       'Paris, TN metro area', 'Madison County, TN',
       'Williamson County, TN', 'Rhea County, TN',
       'Dyersburg, TN metro area', 'Marion County, TN',
       'McMinnville, TN metro area', 'Clay County, TN',
       'White County, TN', 'Gibson County, TN', 'Montgomery County, TN',
       '

In [52]:
# narrow the Tennessee subset to the rows whose region name contains one of the
# eight county names below; the names are OR-ed into one regex alternation
county_names = [
    'Cheatham County',
    'Davidson County',
    'Dickson County',
    'Maury County',
    'Montgomery County',
    'Robertson County',
    'Rutherford County',
    'Sumner County',
]
county_pattern = '|'.join(county_names)
is_selected_county = df_tn['region_name'].str.contains(county_pattern)
df_tn = df_tn[is_selected_county]

In [54]:
# confirm which region names remain after the county filter
df_tn['region_name'].unique()

array(['Cheatham County, TN', 'Montgomery County, TN',
       'Davidson County, TN', 'Dickson County, TN', 'Maury County, TN',
       'Robertson County, TN', 'Sumner County, TN',
       'Rutherford County, TN'], dtype=object)

In [None]:
# 