In [19]:
import pandas as pd

In [21]:
df = pd.read_csv("Assessor_Parcel_Sales.csv")

In [23]:
print("Dataset preview:")
print(df.head())

Dataset preview:
            pin  year  township_code  neighborhood_code class  \
0  3.101210e+13  2000             32              32050   278   
1  1.429100e+13  2000             73              73150   299   
2  1.316410e+13  2000             71              71101   203   
3  2.423300e+13  2014             39              39250   100   
4  1.935400e+13  2016             72              72200   205   

          sale_date  is_mydec_date  sale_price  sale_document_num  \
0     April 01 2000          False      177500             317676   
1  February 01 2000          False      315000             326770   
2      June 01 2000          False      192000             519440   
3      June 01 2014          False         500         1427529079   
4    August 01 2016          False           1         1625129009   

  sale_deed_type mydec_deed_type sale_seller_name  is_multisale  \
0        Trustee             NaN              NaN         False   
1       Warranty             NaN           

In [25]:
print(f"\nDataset has {df.shape[0]} rows and {df.shape[1]} columns.")


Dataset has 1048575 rows and 20 columns.


In [27]:
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
pin                                       0
year                                      0
township_code                             0
neighborhood_code                         0
class                                     0
sale_date                                 0
is_mydec_date                             0
sale_price                                0
sale_document_num                         0
sale_deed_type                           47
mydec_deed_type                     1048369
sale_seller_name                     101492
is_multisale                              0
num_parcels_sale                          0
sale_buyer_name                      101176
sale_type                              2776
sale_filter_same_sale_within_365          0
sale_filter_less_than_10k                 0
sale_filter_deed_type                     0
row_id                                    0
dtype: int64


In [29]:
print("\nPercentage of missing values per column:")
print((df.isnull().mean() * 100).round(2))


Percentage of missing values per column:
pin                                  0.00
year                                 0.00
township_code                        0.00
neighborhood_code                    0.00
class                                0.00
sale_date                            0.00
is_mydec_date                        0.00
sale_price                           0.00
sale_document_num                    0.00
sale_deed_type                       0.00
mydec_deed_type                     99.98
sale_seller_name                     9.68
is_multisale                         0.00
num_parcels_sale                     0.00
sale_buyer_name                      9.65
sale_type                            0.26
sale_filter_same_sale_within_365     0.00
sale_filter_less_than_10k            0.00
sale_filter_deed_type                0.00
row_id                               0.00
dtype: float64


In [31]:
print("Columns in dataset:", df.columns.tolist())

Columns in dataset: ['pin', 'year', 'township_code', 'neighborhood_code', 'class', 'sale_date', 'is_mydec_date', 'sale_price', 'sale_document_num', 'sale_deed_type', 'mydec_deed_type', 'sale_seller_name', 'is_multisale', 'num_parcels_sale', 'sale_buyer_name', 'sale_type', 'sale_filter_same_sale_within_365', 'sale_filter_less_than_10k', 'sale_filter_deed_type', 'row_id']


In [35]:
top5_townships = (
    df.groupby("township_code")["sale_price"]
      .median()
      .sort_values(ascending=False)
      .head(5)
)

In [37]:
print(top5_townships)

township_code
23    675000.0
10    532000.0
25    435000.0
33    375500.0
74    360000.0
Name: sale_price, dtype: float64


In [39]:
lowest5_townships = (
    df.groupby("township_code")["sale_price"]
      .median()
      .sort_values(ascending=True)
      .head(5)
)

In [41]:
print(lowest5_townships)

township_code
37    107500.0
14    115000.0
12    120000.0
13    140000.0
70    140000.0
Name: sale_price, dtype: float64


In [45]:
top10_sales = df.nlargest(10, 'sale_price')

In [47]:
top10_years = top10_sales['year']

In [49]:
print(top10_years.tolist())

[2007, 2007, 2004, 2007, 2007, 2002, 2006, 2006, 2006, 2006]


In [51]:
min_year = df['year'].min()

In [53]:
max_year = df['year'].max()

In [55]:
print(f" {min_year} to {max_year}.")

 1971 to 2024.


In [57]:
township_mapping = {
    1: "Barrington", 2: "Berkeley", 3: "Berwyn", 4: "Bloom", 5: "Bremen",
    6: "Calumet", 7: "Cicero", 8: "Elk Grove", 9: "Evanston", 10: "Hanover",
    11: "Lemont", 12: "Leyden", 13: "Lyons", 14: "Maine", 15: "Norwood Park",
    16: "Northfield", 17: "Niles", 18: "New Trier", 19: "Palatine", 20: "Proviso",
    21: "Re-go (or Ridge)", 22: "River Forest", 23: "Riverside", 24: "Schaumburg",
    25: "Stickney", 26: "Thornton", 27: "Wheeling", 28: "Worth", 29: "Bloomingdale",
    30: "Addison", 31: "Elk Grove", 32: "Leyden", 33: "Maine", 34: "Palatine",
    35: "Hanover", 36: "New Trier", 37: "Northfield", 38: "Niles", 39: "Proviso",
    40: "River Forest"
}

In [61]:
df['Township_Name'] = df['township_code'].map(township_mapping)

In [65]:
top5_median = df.groupby('Township_Name')['sale_price'].median().sort_values(ascending=False).head(5)
print("Top 5 townships by median sale price:")
print(top5_median)

Top 5 townships by median sale price:
Township_Name
Riverside     675000.0
Stickney      435000.0
Schaumburg    315000.0
Thornton      310500.0
Palatine      297692.0
Name: sale_price, dtype: float64


In [69]:
unique_codes = df['township_code'].nunique()

In [71]:
print(f"{unique_codes}")

38
