In [1]:
import pandas as pd 
import numpy as np 
import requests
from bs4 import BeautifulSoup
import warnings 
warnings.filterwarnings('ignore')

# Scrape the "recently sold" search results for Jackson, MI from realtor.com
# and collect one row per listing card into `final_df`.
#
# One accumulator list per output column. The lists are filled across ALL
# result pages (they are not reset per page); every listing card appends
# exactly one entry to each list, so equal positions describe the same card.
sold_date = []
sold_price = []
bed_room = []
bath_room = []
property_sqft = []
property_alot = []
address = []
final_df = pd.DataFrame()  # Stays empty if the length check below fails

# Loop-invariant request settings, hoisted out of the page loop.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection hangs forever
FIRST_PAGE = 1
LAST_PAGE = 16


def _text_or_nan(tag, strip=False):
    """Return the text of a BeautifulSoup tag, or NaN when the tag is missing.

    Parameters
    ----------
    tag : bs4 Tag or None
        Result of a ``find`` call; ``None`` means the element was not found.
    strip : bool, default False
        When True, surrounding whitespace is removed from the text.

    Returns
    -------
    str or float
        The tag's text, or ``np.nan`` if ``tag`` is ``None``.
    """
    if tag is None:
        return np.nan
    return tag.text.strip() if strip else tag.text


for j in range(FIRST_PAGE, LAST_PAGE + 1):
    url = f'https://www.realtor.com/realestateandhomes-search/Jackson_MI/show-recently-sold/sby-6/pg-{j}'
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    if not response.ok:
        # An error page contains no listing cards; report it instead of
        # silently contributing zero rows.
        print(f"Warning: page {j} returned HTTP {response.status_code}; skipping.")
        continue
    webpage = response.text
    soup = BeautifulSoup(webpage, 'lxml')
    page = soup.find_all('div', class_="CardContent__StyledCardContent-rui__sc-7ptz1z-0 kDqsxy card-content card-content")
    for i in page:
        sold_date.append(_text_or_nan(
            i.find('div', class_="StatusBadgestyles__StyledStatusBadge-rui__sc-1wog16p-0 gEBEkT card-description")))
        sold_price.append(_text_or_nan(
            i.find('div', class_="price-wrapper")))
        bed_room.append(_text_or_nan(
            i.find('li', class_='PropertyBedMetastyles__StyledPropertyBedMeta-rui__a4nnof-0 jkAoUn')))
        bath_room.append(_text_or_nan(
            i.find('li', class_='PropertyBathMetastyles__StyledPropertyBathMeta-rui__sc-67m6bo-0 hGQdFx')))
        property_sqft.append(_text_or_nan(
            i.find('li', class_='PropertySqftMetastyles__StyledPropertySqftMeta-rui__sc-1gdau7i-0 cYyTDO')))
        property_alot.append(_text_or_nan(
            i.find('li', class_="PropertyLotSizeMetastyles__StyledPropertyLotSizeMeta-rui__sc-1cz4zco-0 fPkhUg")))
        # The address is taken from the last 'content-row' div of the card.
        # Guard the [-1] lookup: indexing an empty result list would raise
        # IndexError before any fallback to NaN could apply.
        content_rows = i.find_all('div', class_='content-row')
        address_container = content_rows[-1] if content_rows else None
        address.append(_text_or_nan(address_container, strip=True))

# Validate all lists have the same length before creating the DataFrame
if all(len(lst) == len(sold_date) for lst in [sold_price, bed_room, bath_room, property_sqft, property_alot, address]):
    # Build the frame in one step. DataFrame.append was removed in pandas 2.0,
    # and appending to the (always empty) initial frame was equivalent to this.
    final_df = pd.DataFrame({
        'sold_date': sold_date,
        'sold_price': sold_price,
        'bed_room': bed_room,
        'bath_room': bath_room,
        'property_sqft': property_sqft,
        'property_alot': property_alot,
        'address': address,
    })
else:
    print("Error: Lists have different lengths.")

# Preview the first rows of the scraped table
print(final_df.head())

             sold_date   sold_price bed_room bath_room  \
0  Sold - Oct 17, 2023     $275,000     4bed   2.5bath   
1   Sold - Aug 4, 2023      $46,200     2bed     1bath   
2  Sold - Nov 14, 2023     $161,000     3bed     1bath   
3  Sold - Sep 18, 2023     $255,000     4bed     1bath   
4  Sold - Aug 18, 2023  $69,500$500     2bed     1bath   

                property_sqft                       property_alot  \
0  1,980sqft1,980 square feet           0.34acre lot0.34 acre lot   
1      920sqft920 square feet  3,920sqft lot3,920 square foot lot   
2  1,876sqft1,876 square feet           0.24acre lot0.24 acre lot   
3  1,508sqft1,508 square feet  6,970sqft lot6,970 square foot lot   
4      720sqft720 square feet  4,356sqft lot4,356 square foot lot   

                                 address  
0    1710 Lochmoor BlvdJackson, MI 49201  
1       1402 E North StJackson, MI 49202  
2        333 N Bowen StJackson, MI 49202  
3  508 Gilletts Lake RdJackson, MI 49201  
4        506 Detroit 

In [2]:
# Show the scraped listings table as this cell's output
# (the rendered view elides the middle rows).
final_df

Unnamed: 0,sold_date,sold_price,bed_room,bath_room,property_sqft,property_alot,address
0,"Sold - Oct 17, 2023","$275,000",4bed,2.5bath,"1,980sqft1,980 square feet",0.34acre lot0.34 acre lot,"1710 Lochmoor BlvdJackson, MI 49201"
1,"Sold - Aug 4, 2023","$46,200",2bed,1bath,920sqft920 square feet,"3,920sqft lot3,920 square foot lot","1402 E North StJackson, MI 49202"
2,"Sold - Nov 14, 2023","$161,000",3bed,1bath,"1,876sqft1,876 square feet",0.24acre lot0.24 acre lot,"333 N Bowen StJackson, MI 49202"
3,"Sold - Sep 18, 2023","$255,000",4bed,1bath,"1,508sqft1,508 square feet","6,970sqft lot6,970 square foot lot","508 Gilletts Lake RdJackson, MI 49201"
4,"Sold - Aug 18, 2023","$69,500$500",2bed,1bath,720sqft720 square feet,"4,356sqft lot4,356 square foot lot","506 Detroit StJackson, MI 49201"
...,...,...,...,...,...,...,...
163,"Sold - Dec 5, 2023","$252,500",3bed,1.5bath,"1,456sqft1,456 square feet",1acre lot1 acre lot,"3760 Sargent RdJackson, MI 49201"
164,"Sold - Sep 15, 2023","$425,000$25k",5bed,3.5bath,"4,456sqft4,456 square feet",1.94acre lot1.94 acre lot,"3720 W Primilia LnJackson, MI 49201"
165,"Sold - Jul 31, 2023","$166,000",3bed,2bath,"1,506sqft1,506 square feet","5,227sqft lot5,227 square foot lot","616 Jefferson StJackson, MI 49202"
166,"Sold - Dec 1, 2023","$205,000",4bed,2bath,"1,956sqft1,956 square feet","6,534sqft lot6,534 square foot lot","206 S Grinnell StJackson, MI 49203"


In [3]:
# Per-column count of missing values: listing cards that lacked an element
# were recorded as NaN when the table was scraped.
final_df.isna().sum(axis="index")

sold_date         0
sold_price        0
bed_room         23
bath_room        25
property_sqft    24
property_alot    10
address           0
dtype: int64