In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
file_name = 'usa-real-estate-dataset/realtor-data.zip.csv'
data = pd.read_csv('realtor-data.zip.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   brokered_by     float64
 1   status          object 
 2   price           float64
 3   bed             float64
 4   bath            float64
 5   acre_lot        float64
 6   street          float64
 7   city            object 
 8   state           object 
 9   zip_code        float64
 10  house_size      float64
 11  prev_sold_date  object 
dtypes: float64(8), object(4)
memory usage: 203.8+ MB


In [13]:
for feature in data.columns:
    print(feature, data[feature].isna().value_counts("True"))

brokered_by brokered_by
False    0.997964
True     0.002036
Name: proportion, dtype: float64
status status
False    1.0
Name: proportion, dtype: float64
price price
False    0.999308
True     0.000692
Name: proportion, dtype: float64
bed bed
False    0.783812
True     0.216188
Name: proportion, dtype: float64
bath bath
False    0.770133
True     0.229867
Name: proportion, dtype: float64
acre_lot acre_lot
False    0.853759
True     0.146241
Name: proportion, dtype: float64
street street
False    0.995119
True     0.004881
Name: proportion, dtype: float64
city city
False    0.999368
True     0.000632
Name: proportion, dtype: float64
state state
False    0.999996
True     0.000004
Name: proportion, dtype: float64
zip_code zip_code
False    0.999866
True     0.000134
Name: proportion, dtype: float64
house_size house_size
False    0.74466
True     0.25534
Name: proportion, dtype: float64
prev_sold_date prev_sold_date
False    0.670184
True     0.329816
Name: proportion, dtype: float64


There is a lot of missing value for house_size, prev_sold_date, bed, bath, acre_lot. We will consider imputing strategy for these features.

The other features have very small proportion of missing values. We can consider dropping strategy for these features. 

In [15]:
data = data.dropna(subset=["brokered_by", "price", "street", "city", "state", "zip_code"])

In [5]:
data.head(20)

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,
5,103378.0,for_sale,179000.0,4.0,3.0,0.46,1850806.0,San Sebastian,Puerto Rico,612.0,2520.0,
6,1205.0,for_sale,50000.0,3.0,1.0,0.2,1298094.0,Ciales,Puerto Rico,639.0,2040.0,
7,50739.0,for_sale,71600.0,3.0,2.0,0.08,1048466.0,Ponce,Puerto Rico,731.0,1050.0,
8,81909.0,for_sale,100000.0,2.0,1.0,0.09,734904.0,Ponce,Puerto Rico,730.0,1092.0,
9,65672.0,for_sale,300000.0,5.0,3.0,7.46,1946226.0,Las Marias,Puerto Rico,670.0,5403.0,


In [6]:
for feature in data.columns:
    print(feature, data[feature].nunique())

brokered_by 110143
status 3
price 102137
bed 99
bath 86
acre_lot 16057
street 2001358
city 20098
state 55
zip_code 30334
house_size 12061
prev_sold_date 14954


Since there is so many unique values for bed and bath. We suspect that there must be some issues with it (mistyped, wrong format)

In [7]:
sorted(data.bed.unique())

[1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 nan,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 80.0,
 82.0,
 84.0,
 86.0,
 88.0,
 90.0,
 93.0,
 96.0,
 98.0,
 99.0,
 100.0,
 102.0,
 108.0,
 110.0,
 111.0,
 114.0,
 120.0,
 123.0,
 136.0,
 142.0,
 148.0,
 190.0,
 210.0,
 212.0,
 222.0,
 444.0,
 473.0]

In [8]:
data[data.bed == 444]

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
1379204,54823.0,for_sale,440000.0,444.0,222.0,0.34,118525.0,Sublimity,Oregon,97385.0,1700.0,1990-08-15
2188928,54823.0,sold,430000.0,444.0,222.0,0.34,118525.0,Sublimity,Oregon,97385.0,1700.0,2022-04-29
