# 1. Frame the problem
As the contractor, I must develop an AI model that can accurately predict housing prices within in the LA area using attributes of the property.

Use different ML models to predict the survivability chances for a passenger.

# 2. Get the Data 
I will be scraping a website like RedFin in order to create my own data set of LA houses.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from datetime import datetime

def convert_price_to_numeric(price_str):
    """
    Converts a price string (e.g., '$2.5M', '500K') to a numeric value.
    """
    if price_str is None:
        return None
    price_str = str(price_str).strip().upper()
    
    if isinstance(price_str, (int, float)):
        return price_str

    price_str = price_str.replace('$', '').replace(',', '')
    
    if 'M' in price_str:
        return int(float(price_str.replace('M', '')) * 1_000_000)
    elif 'K' in price_str:
        return int(float(price_str.replace('K', '')) * 1_000)
    
    try:
        return int(price_str)
    except (ValueError, TypeError):
        return None

def scrape_zillow_sold_data(city='los-angeles', state='ca'):
    """
    Scrapes sold housing data from Zillow for a given city and state,
    handling pagination and extracting key features from search result pages.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Sec-Ch-Ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    }
    
    all_properties = []
    page_number = 1

    while True:
        if page_number == 1:
            url = f"https://www.zillow.com/{city}-{state}/sold/"
        else:
            url = f"https://www.zillow.com/{city}-{state}/sold/{page_number}_p/"
        
        print(f"Requesting data from page {page_number}: {url}")

        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            print(f"Successfully fetched page {page_number}.")
        except requests.exceptions.RequestException as e:
            print(f"Error making request on page {page_number}: {e}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
        
        if not script_tag:
            print("Could not find the data script tag. The page structure may have changed.")
            break

        try:
            json_data = json.loads(script_tag.string)
            search_results = json_data['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']
            
            if not search_results:
                print("No more properties found. Reached the last page.")
                break
            
            page_properties = []
            for property_data in search_results:
                # The 'homeType' and 'parcelId' are nested deeper in the JSON. This accesses them safely.
                hdp_data = property_data.get('hdpData', {})
                home_info = hdp_data.get('homeInfo', {})
                property_type = home_info.get('homeType', 'N/A')
                # --- CHANGE START ---
                # Get the Assessor's Identification Number (AIN), which Zillow calls parcelId
                ain = home_info.get('parcelId')
                # --- CHANGE END ---

                page_properties.append({
                    'Address': property_data.get('address', 'N/A'),
                    # --- CHANGE START ---
                    # Added the new 'AIN' field to the dictionary
                    'AIN': ain,
                    # --- CHANGE END ---
                    'Sold Price': convert_price_to_numeric(property_data.get('soldPrice', property_data.get('price', 'N/A'))),
                    'Bedrooms': property_data.get('beds'),
                    'Bathrooms': property_data.get('baths'),
                    'Area (SqFt)': property_data.get('area'),
                    'Property Type': property_type,
                })
            
            all_properties.extend(page_properties)
            print(f"Found {len(page_properties)} properties on this page.")
            
            page_number += 1
            time.sleep(2) # Increased delay to be more respectful to the server

        except (KeyError, json.JSONDecodeError) as e:
            print(f"Error parsing JSON data: {e}. The data structure might have changed.")
            break

    if not all_properties:
        return pd.DataFrame()
    
    print(f"\nFinished scraping. Found a total of {len(all_properties)} properties.")
    return pd.DataFrame(all_properties)

if __name__ == "__main__":
    scraped_data = scrape_zillow_sold_data(city='los-angeles', state='ca')

    if not scraped_data.empty:
        output_filename = 'zillow_sold_los_angeles.csv'
        scraped_data.to_csv(output_filename, index=False)
        print(f"\nData successfully saved to '{output_filename}'")
    else:
        print("\nScraping failed or no data was found. No file was saved.")

Requesting data from page 1: https://www.zillow.com/los-angeles-ca/sold/
Successfully fetched page 1.
Found 41 properties on this page.
Requesting data from page 2: https://www.zillow.com/los-angeles-ca/sold/2_p/
Successfully fetched page 2.
Found 41 properties on this page.
Requesting data from page 3: https://www.zillow.com/los-angeles-ca/sold/3_p/
Successfully fetched page 3.
Found 41 properties on this page.
Requesting data from page 4: https://www.zillow.com/los-angeles-ca/sold/4_p/
Successfully fetched page 4.
Found 41 properties on this page.
Requesting data from page 5: https://www.zillow.com/los-angeles-ca/sold/5_p/
Successfully fetched page 5.
Found 41 properties on this page.
Requesting data from page 6: https://www.zillow.com/los-angeles-ca/sold/6_p/
Successfully fetched page 6.
Found 41 properties on this page.
Requesting data from page 7: https://www.zillow.com/los-angeles-ca/sold/7_p/
Successfully fetched page 7.
Found 41 properties on this page.
Requesting data from pag

# 3. Explore the Data
Gain insights into the data you have from step 2, making sure to identify any bias

It appears that sex has the largest tell for survivability, females being 75% and amle only being ~20%
Passenger class and age are close seconds with about ~63% for first class (gets lower as class decreases) and 60% for ages 10 and under.
The apparent bias is that Women, children, and wealthy passengers were prioritized.

# 4.Prepare the Data


Apply any data transformations and explain what and why


Filled in missing values for age, embarked, and fare using median, mode, and median respectively. This was because the model can only work with numbers.
Converted sex to numbers (male =0, f =1)

When i was doing research on the problem, i found that family size had a great effect, so I learned how to make a derived variable for familiy size and 
if thjey were alone or not.

Also created a Title variable to get iunformation that may be missing in the table such as if the person is married, or their estimated age.

Removed uselesss features like Cabin (too many missing), Name(useless....), and Ticket(formatting was different among the values so i couldnt use it).

# 5. Model the data
Using selected ML models, experment with your choices and describe your findings. Finish by selecting a Model to continue with


Didnt even try linear regression as there are too many features. 

Tried logistic regression in vscode. Got a 80% accuracy, but after doing more research i found that people had sucess with random forests, 
so I stuck with that.

# 6. Fine Tune the Model

With the select model descibe the steps taken to acheve the best rusults possiable 


when I began i started with only 50 tress, but i realized that was too few and it was overfitting the model, so I bumped it up to 400 trees and things were fine.
I asked chatgpt about it and it explained that with the amount of rows in the data, 200 trees was sufficient, so i stuck with that.

I read that with less than 5k rows, the max depth should be 5-10, so i just chose 7 and stuck with it.

I just started at 1 for the random state and started playing with it and hit 45 as the highest percentage of 82 and stuck with it, but really it doesnt
matter, its just something used to ensure the data is reproducible.

# 7. Present
In a customer faceing Document provide summery of finding and detail approach taken


Summary of Findings and Approach
We analyzed passenger survivability on the titanic using a dataset provided from Kaggle. Our analysis aimed to identify which factors most influenced survival and to develop a predictive model. From the data:
Sex is the strongest predictor: Females had a roughly 75% survival rate, while males were around 20%.


Passenger class and age are also significant: First-class passengers had a survival rate of ~63%, decreasing with lower classes, and children aged 10 and under had about a 60% chance of survival.


Bias in survival: The data reflects historical prioritization of women, children, and wealthier passengers.


Cleaning the data:
Missing values were filled in
Categorical variables were converted
New features were created, like family size and title.
Useless features were removed


For modeling:
Initial testing with logistic regression had 80% accuracy.
Moved to Random Forest model
Model optimization:


Number of trees: Increased from 50 (overfitting) to 200.


Max depth: Set to 7, as it is a small dataset
Set a random state to make sure the results are reproduced
Outcome: The Random Forest model gave an accuracy of approximately 82%. Therefore, it can be reliably used to predict whether a passenger will survive or not, given a new set of data..


# 8. Launch the Model System
Define your production run code, This should be self susficent and require only your model pramaters 


In [1]:
def inference(params):
    params = params.reindex(columns=X.columns, fill_value=0)
    results = model.predict(params)
    return results