In [None]:
%pip install scikit-learn
%pip install xgboost
%pip install shap
%pip install pandas

In [None]:
from google.colab import drive
# Colab-only: mount Google Drive at /content/drive. Later cells read the dataset from
# "/content/drive/MyDrive/csvData/", so this must run first in a fresh runtime.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import sklearn
# Load the listings dataset; malformed rows are skipped (on_bad_lines='skip') instead of raising.
# NOTE(review): "/csvData/..." is an absolute path at the filesystem root, whereas later cells
# read the same file name from "/content/drive/MyDrive/csvData/" — confirm this path is intended.
df = pd.read_csv("/csvData/floridaHomeDetailsV3.csv",on_bad_lines='skip')

# Function to extract zipcode from URL
def extract_zipcode_from_url(url):
    """Return the 5-digit ZIP code at the end of a Zillow listing URL's address slug.

    The URL is expected to look like
    ``.../homedetails/<street>-<city>-<state>-<zip>/<zpid>_zpid/``, i.e. the ZIP is the
    last dash-separated token of the path segment right after ``/homedetails/``.

    Parameters
    ----------
    url : object
        Listing URL. Anything that is not a ``str`` (NaN, None, numbers) is treated as missing.

    Returns
    -------
    str or float
        The ZIP code as a string (e.g. ``"33101"``), or ``np.nan`` when ``url`` is not a
        string, has no ``/homedetails/`` segment, or the slug does not end in a ZIP.
    """
    import re  # local import: `re` is not among this cell's imports

    if not isinstance(url, str):
        return np.nan

    _, marker, tail = url.partition('/homedetails/')
    if not marker:
        return np.nan

    # First path segment after /homedetails/ is the address slug; the ZIP closes it.
    # Taking the *first* dash token (as before) yields the house number, not the ZIP.
    address_slug = tail.split('/')[0]
    match = re.search(r'-(\d{5})(?:-\d{4})?$', address_slug)  # tolerate ZIP+4
    return match.group(1) if match else np.nan

# Function to recalculate Average Rental Price
def recalculate_average_rent(row):
    """Return the average time on market with the most recent listing removed.

    'Average Time on Market' is assumed to be the mean over all 'Times Rented' listings,
    including the most recent one. Since 'Most Recent Time on Market' is the prediction
    target, it is backed out so the feature does not leak the target:

        (old_average * times_rented - most_recent_time) / (times_rented - 1)

    Parameters
    ----------
    row : mapping (pandas Series or dict)
        Must provide 'Times Rented', 'Most Recent Time on Market' and
        'Average Time on Market'.

    Returns
    -------
    float
        The recomputed average in the same units as the inputs, or ``np.nan`` when any
        input is missing or there are fewer than two listings (nothing left to average).
    """
    times_rented = row['Times Rented']
    # Subtract a *time* from the summed times. The previous version subtracted
    # 'Last Rental Price', which mixes dollars into a quantity measured in days.
    most_recent_time = row['Most Recent Time on Market']
    old_average = row['Average Time on Market']

    if pd.isna(times_rented) or pd.isna(most_recent_time) or pd.isna(old_average):
        return np.nan
    if times_rented < 2:
        # Also covers the division-by-zero case (times_rented == 1).
        return np.nan

    return (old_average * times_rented - most_recent_time) / (times_rented - 1)

# NOTE: this cell overwrites `df` and 'Average Time on Market' in place, so it is not
# idempotent — reload the CSV before running it a second time.

# Fill missing zipcodes from the listing URL; rows that already have a zipcode are untouched.
zip_from_url = df['URL'].apply(extract_zipcode_from_url)
if pd.api.types.is_numeric_dtype(df['Zipcode']):
    # Keep a single dtype so the later groupby('Zipcode') does not treat 33101 and "33101"
    # as two different areas.
    zip_from_url = pd.to_numeric(zip_from_url, errors='coerce')
df['Zipcode'] = df['Zipcode'].fillna(zip_from_url)

# Recompute 'Average Time on Market' row by row.
df['Average Time on Market'] = df.apply(recalculate_average_rent, axis=1)

# Drop the slowest 20% of listings: quantile(0.8) keeps rows at or below the 80th percentile
# of 'Most Recent Time on Market'. Rows with a missing value are dropped as well.
cutoff = df['Most Recent Time on Market'].quantile(0.8)
print(f"Using a cutoff of {cutoff:.2f} days")

# .copy() so later cells add columns to an independent frame rather than a slice of the original.
df = df[df['Most Recent Time on Market'] <= cutoff].copy()

display(df.head())
df.info()  # prints its report itself and returns None, so it is not wrapped in display()
df['Most Recent Time on Market'].describe()

In [None]:
# Feature: Zillow's rent estimate divided by the listing price.
df['Rent Estimate to actual Price'] = df['Rent Zestimate'].div(df['Price'])

# Feature: listing price relative to the mean price of its zipcode.
# transform('mean') broadcasts each zipcode's mean back onto every row of that zipcode,
# so the result lines up with `df` row for row.
average_price_by_zipcode = df.groupby('Zipcode')['Price'].transform('mean')
df['price to area average by zipcode'] = df['Price'].div(average_price_by_zipcode)

# Function to extract the numerical month of the most recent 'Listed for rent' or 'Sold' event
import json

def get_most_recent_month(events_string):
    """Return the calendar month (1-12) of the latest 'Listed for rent' or 'Sold' event.

    Parameters
    ----------
    events_string : object
        JSON text encoding a list of event objects, each with a 'type' and a 'date' key.
        Non-string input (e.g. NaN for a missing cell) is treated as "no events".

    Returns
    -------
    int or float
        Month number of the most recent relevant event, or ``np.nan`` when the input
        cannot be parsed, an event lacks the expected keys, or no relevant event has a
        parseable date.
    """
    try:
        events = json.loads(events_string)
        # Compare parsed timestamps rather than raw strings: string order is only
        # chronological for ISO dates ("9/1/2022" would sort after "10/5/2023").
        event_dates = [
            pd.to_datetime(event['date'], errors='coerce')
            for event in events
            if event['type'] in ('Listed for rent', 'Sold')
        ]
        event_dates = [stamp for stamp in event_dates if not pd.isna(stamp)]
        if not event_dates:
            return np.nan
        return max(event_dates).month
    except (TypeError, ValueError, KeyError):
        # TypeError: non-string input or unexpected JSON shape; ValueError: invalid JSON
        # (json.JSONDecodeError is a subclass); KeyError: event without 'type'/'date'.
        return np.nan

# Derive the 'month listed' feature from each row's JSON event history.
df['month listed'] = df['Events'].map(get_most_recent_month)

# Preview the three engineered features side by side.
preview_columns = ['Rent Estimate to actual Price', 'price to area average by zipcode', 'month listed']
display(df[preview_columns].head())

## Filter Data

### Subtask:
Filter out the 1% most expensive houses based on the 'Price' column.

**Reasoning**:
Calculate the 99th percentile of the 'Price' column and filter the DataFrame to exclude houses with prices above this threshold. Display the head and info of the filtered DataFrame.

In [None]:

# Function to split and one-hot encode the 'Appliances' column
def encode_appliances(df):
    """Return a copy of ``df`` with one 0/1 ``Appliance_<name>`` column per appliance.

    The 'Appliances' column holds comma-separated appliance names. Missing or non-string
    cells count as "no appliances". Names are stripped of surrounding whitespace and empty
    tokens are ignored, so "A, B", "A,B" and "A, B, " all encode the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Appliances' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the indicator columns (int), appended in alphabetical order. Indicator
        columns already present under the same name are replaced, so re-running is safe.
    """
    def _split(value):
        # NaN/None (or any non-string) means nothing is listed for this row.
        if not isinstance(value, str):
            return []
        return [token.strip() for token in value.split(',') if token.strip()]

    appliances_list = df['Appliances'].apply(_split)
    all_appliances = sorted({item for items in appliances_list for item in items})

    # Build every indicator first and attach them in one concat; inserting columns one at
    # a time fragments the frame and mutates the caller's object.
    indicators = pd.DataFrame(
        {
            f'Appliance_{appliance}': [int(appliance in items) for items in appliances_list]
            for appliance in all_appliances
        },
        index=df.index,
    )
    stale = [column for column in indicators.columns if column in df.columns]
    return pd.concat([df.drop(columns=stale), indicators], axis=1)

# One-hot encode the appliance list.
# NOTE(review): the markdown above describes a 99th-percentile 'Price' filter, but no filter
# is applied in this cell — `df_filtered` here contains every row of `df`.
df_filtered = encode_appliances(df)

# One-hot encode the remaining categorical columns.
# dtype=int: pandas >= 2.0 emits bool dummies by default, and the training cells keep only
# np.number columns, which would silently drop every one of these features.
df_filtered = pd.get_dummies(
    df_filtered,
    columns=['Cooling', 'Heating', 'Parking', 'Laundry', 'Home Type'],
    dtype=int,
)

display(df_filtered.head())
df_filtered.info()  # prints its report itself and returns None, so it is not wrapped in display()

In [None]:
import pandas as pd
import numpy as np

def fill_nan_with_median(df):
    """Return a copy of ``df`` with NaNs replaced by each column's median.

    Columns are processed one at a time. A column is left untouched when its median
    cannot be computed (non-numeric data) or is itself NaN (column entirely missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        try:
            median_val = df_copy[col].median()
            if not pd.isna(median_val):
                df_copy[col] = df_copy[col].fillna(median_val)
        except (TypeError, ValueError):
            # Median is undefined for this column (strings, mixed objects, or a duplicated
            # column label): skip it. Narrower than a bare `except`, so interrupts and
            # genuine bugs are no longer swallowed.
            continue
    return df_copy


# Impute missing numeric values. fill_nan_with_median copies its input, so no extra .copy()
# is needed to protect df_filtered.
df_filled = fill_nan_with_median(df_filtered)

# Shuffle the rows with a fixed seed and renumber the index 0..n-1.
df_shuffled = df_filled.sample(frac=1, random_state=41).reset_index(drop=True)

display(df_shuffled.head())
df_shuffled.info()  # prints its report itself and returns None, so it is not wrapped in display()

## Train XGBoost Model

### Subtask:
Train an XGBoost model to predict the 'Most Recent Time on Market' column (days on market); 'Price' is excluded from the features.

**Reasoning**:
Separate the features (X) and target variable (y), split the data into training and testing sets, and train an XGBoost Regressor model.

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np

# Target: the most recent time on market.
y = df_shuffled['Most Recent Time on Market']

# Features: drop the target itself, 'Price', and the free-text / identifier columns,
# then keep only numeric dtypes.
X = df_shuffled.drop(columns=['Most Recent Time on Market', 'Street Address', 'City', 'State', 'URL', 'Appliances', "Image URLs", "Date Details Fetched", "Events", "Price"])
X = X.select_dtypes(include=np.number)

# Ratio features can contain +/-inf (division by zero); treat those as missing and
# impute every remaining NaN with the column median.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the first few held-out targets for a quick manual check.
y_test.head().to_csv("testOutput.csv")

# Fit the gradient-boosted regressor.
xgbr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=400, learning_rate=0.01, random_state=42)
xgbr.fit(X_train, y_train)

print("XGBoost model training complete.")

## Cross-evaluate Hyperparameters and Evaluate Model

### Subtask:
Cross-validate the XGBoost model with its current hyperparameters (no tuning is performed in this step) and calculate the R2 score on the test set.

**Reasoning**:
Use cross-validation to evaluate the model's performance across different subsets of the data and calculate the R2 score on the test set to assess the model's goodness of fit.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error

# 10-fold cross-validation on the training split, scored with R2.
# cross_val_score fits fresh clones of `xgbr`; the already-fitted model is left unchanged.
scores = cross_val_score(xgbr, X_train, y_train, cv=10, scoring='r2')

print(f"Cross-validation R2 scores: {scores}")
print(f"Mean cross-validation R2 score: {scores.mean()}")

# Predict on the held-out test set with the model fitted in the training cell.
y_pred = xgbr.predict(X_test)

# R2 on the test set.
r2 = r2_score(y_test, y_pred)

print(f"R2 score on the test set: {r2}")

# scikit-learn scorers follow "greater is better", so MAE comes back negated under
# 'neg_mean_absolute_error'. Flip the sign so the printed values are actual (positive) MAEs.
mae_scores = -cross_val_score(xgbr, X_train, y_train, cv=10, scoring='neg_mean_absolute_error')

print(f"Cross-validation MAE scores: {mae_scores}")
print(f"Mean cross-validation MAE score: {mae_scores.mean()}")



## Analyze Highest Errors

### Subtask:
Show a list of the highest errors between predicted and actual days on market.

In [None]:
# Persist the trained regressor (`xgbr`) to flDaysModel.json in the working directory.
xgbr.save_model("flDaysModel.json")

**Reasoning**:
Calculate the absolute errors between the predicted and actual days on market, sort the results by error in descending order, and display the instances with the highest errors.

In [None]:
# Absolute prediction error for every test-set row.
errors = (y_test - y_pred).abs()

# One row per test listing: actual value, prediction and absolute error.
error_df = pd.DataFrame({'Actual Days on Zillow': y_test, 'Predicted': y_pred, 'Error': errors})

# y_test carries df_shuffled's row labels, so an index join attaches each listing's URL.
error_df = error_df.join(df_shuffled['URL'], how='left')

# Sort once, largest error first; show the top rows and save the full ranking.
worst_first = error_df.sort_values(by='Error', ascending=False)
display(worst_first.head())
worst_first.to_csv('error_df.csv')

In [None]:
%pip install matplotlib

## SHAP Analysis

### Subtask:
Generate a SHAP summary plot to visualize feature importance.

**Reasoning**:
Use the `shap` library to calculate SHAP values for the test set and generate a summary plot to show the impact of each feature on the model's predictions.

In [None]:
import shap

# Create a SHAP explainer object for the fitted tree model (`xgbr`)
explainer = shap.TreeExplainer(xgbr)

# Calculate SHAP values for the test set (one value per row and feature of X_test)
shap_values = explainer.shap_values(X_test)

# Generate the SHAP summary plot; X_test supplies the feature values and column names
shap.summary_plot(shap_values, X_test)

# Summary statistics of the target column, for reference next to the plot
print(df_shuffled['Most Recent Time on Market'].describe())

## Calculate MAE, MSE, and RMSE

### Subtask:
Calculate and display the Mean Absolute Error (MAE), Mean Squared Error (MSE), and Root Mean Squared Error (RMSE) of the model's predictions in one cell.

**Reasoning**:
Use `sklearn.metrics.mean_absolute_error`, `sklearn.metrics.mean_squared_error`, and `numpy.sqrt` to calculate MAE, MSE, and RMSE using the actual and predicted values and display the results.

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Test-set error metrics for the current y_test / y_pred pair.
# RMSE is derived from MSE so both come from the same computation.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Task
Read "floridaHomeDetailsIncome.csv", sort it by Latitude then Longitude, calculate MAE and RMSE for the XGBoost model, generate a Shap summary plot, and create a function that takes the dataframe, the trained XGBoost model, and a dictionary representing a new house, predicts the price of the new house by filling missing values from the closest house in the dataframe and one-hot encoding the features, and returns the predicted price.

## Define the new house dictionary

### Subtask:
Define the new house dictionary.


**Reasoning**:
Define a dictionary for the new house with relevant features for prediction.



**Reasoning**:
Combine all the necessary steps for predicting the price of a new house into one code cell for clarity and to ensure consistent data handling.

In [None]:
# Persist `xgbr` again under a second file name.
# NOTE(review): an earlier cell already saves this same `xgbr` object as "flDaysModel.json" —
# confirm both files are needed.
xgbr.save_model("xgbModelFloridaV2.json")

# Task
Edit the copy to perform binary classification to determine whether a house will sell fast (< 30 days) or not, evaluate the model's performance using appropriate metrics, and analyze feature importance using SHAP.

## Modify data preparation

### Subtask:
Create a new target variable for binary classification based on 'Most Recent Time on Market' (e.g., 1 for < 30 days, 0 otherwise).


**Reasoning**:
Create the binary target variable 'is_fast_sale' based on the 'Most Recent Time on Market' column, display the head and info of the updated DataFrame, and show the value counts of the new target variable.



In [None]:
import pandas as pd
import numpy as np
import json

# Self-contained data preparation for the classification task: load the CSV, engineer
# features, filter outliers, encode categoricals, impute, shuffle and add `is_fast_sale`.
# Helpers are defined at cell level so they exist even when the data file is not found.

# Candidate locations for the dataset, tried in order until one exists.
DATA_PATHS = [
    "/content/drive/MyDrive/csvData/floridaHomeDetailsV3.csv",  # Google Drive mount
    "csvData/floridaHomeDetailsV3.csv",  # relative to the working directory
    # NOTE(review): different file name from the two above — confirm it has the same columns.
    "/content/amenity_analysis_results.csv",
]


def load_first_available(paths):
    """Return the first CSV in ``paths`` that exists, as a DataFrame, or None if none do."""
    for path in paths:
        try:
            return pd.read_csv(path, on_bad_lines='skip')
        except FileNotFoundError:
            continue
    return None


def extract_zipcode_from_url(url):
    """Return the 5-digit ZIP code at the end of a Zillow listing URL's address slug.

    Expects ``.../homedetails/<street>-<city>-<state>-<zip>/<zpid>_zpid/``. Returns the ZIP
    as a string, or NaN when ``url`` is not a string or no ZIP is found.
    """
    import re  # local import: `re` is not among this cell's imports

    if not isinstance(url, str):
        return np.nan
    _, marker, tail = url.partition('/homedetails/')
    if not marker:
        return np.nan
    # The ZIP is the last dash-separated token of the slug (optionally ZIP+4); the first
    # token is the house number.
    match = re.search(r'-(\d{5})(?:-\d{4})?$', tail.split('/')[0])
    return match.group(1) if match else np.nan


def recalculate_average_rent(row):
    """Return 'Average Time on Market' with the most recent listing backed out.

    Computes (old_average * times_rented - most_recent_time) / (times_rented - 1) so the
    feature no longer contains the target. Returns NaN when an input is missing or there
    are fewer than two listings.
    """
    times_rented = row['Times Rented']
    # A time is subtracted from the summed times; subtracting 'Last Rental Price' (as
    # before) mixed dollars into a quantity measured in days.
    most_recent_time = row['Most Recent Time on Market']
    old_average = row['Average Time on Market']
    if pd.isna(times_rented) or pd.isna(most_recent_time) or pd.isna(old_average):
        return np.nan
    if times_rented < 2:  # also rules out division by zero
        return np.nan
    return (old_average * times_rented - most_recent_time) / (times_rented - 1)


def get_most_recent_month(events_string):
    """Return the month (1-12) of the latest 'Listed for rent' or 'Sold' event, else NaN.

    ``events_string`` is JSON text for a list of objects with 'type' and 'date' keys.
    """
    try:
        events = json.loads(events_string)
        # Compare parsed timestamps: string order is only chronological for ISO dates.
        event_dates = [
            pd.to_datetime(event['date'], errors='coerce')
            for event in events
            if event['type'] in ('Listed for rent', 'Sold')
        ]
        event_dates = [stamp for stamp in event_dates if not pd.isna(stamp)]
        if not event_dates:
            return np.nan
        return max(event_dates).month
    except (TypeError, ValueError, KeyError):
        # Non-string input, invalid JSON, or events without the expected keys.
        return np.nan


def encode_appliances(df):
    """Return a copy of ``df`` with one 0/1 ``Appliance_<name>`` column per appliance.

    'Appliances' holds comma-separated names; missing cells count as no appliances.
    The input frame is not modified.
    """
    def _split(value):
        if not isinstance(value, str):
            return []
        return [token.strip() for token in value.split(',') if token.strip()]

    appliances_list = df['Appliances'].apply(_split)
    all_appliances = sorted({item for items in appliances_list for item in items})
    # Build all indicators, then attach them in a single concat.
    indicators = pd.DataFrame(
        {
            f'Appliance_{appliance}': [int(appliance in items) for items in appliances_list]
            for appliance in all_appliances
        },
        index=df.index,
    )
    stale = [column for column in indicators.columns if column in df.columns]
    return pd.concat([df.drop(columns=stale), indicators], axis=1)


def fill_nan_with_median(df):
    """Return a copy of ``df`` with NaNs in each column replaced by that column's median.

    Columns whose median cannot be computed (non-numeric) or is NaN are left as they are.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        try:
            median_val = df_copy[col].median()
            if not pd.isna(median_val):
                df_copy[col] = df_copy[col].fillna(median_val)
        except (TypeError, ValueError):
            continue  # median undefined for this column
    return df_copy


df = load_first_available(DATA_PATHS)
if df is None:
    print("Error: Data file not found in common paths.")

if df is not None:
    # Fill missing zipcodes from the URL, keeping the column's dtype consistent so the
    # groupby below does not split one area into a numeric and a string group.
    zip_from_url = df['URL'].apply(extract_zipcode_from_url)
    if pd.api.types.is_numeric_dtype(df['Zipcode']):
        zip_from_url = pd.to_numeric(zip_from_url, errors='coerce')
    df['Zipcode'] = df['Zipcode'].fillna(zip_from_url)

    df['Average Time on Market'] = df.apply(recalculate_average_rent, axis=1)

    # Drop the slowest 20% of listings (quantile(0.8) keeps values up to the 80th percentile).
    cutoff = df['Most Recent Time on Market'].quantile(0.8)
    df = df[df['Most Recent Time on Market'] <= cutoff].copy()

    # Ratio features.
    df['Rent Estimate to actual Price'] = df['Rent Zestimate'] / df['Price']
    average_price_by_zipcode = df.groupby('Zipcode')['Price'].transform('mean')
    df['price to area average by zipcode'] = df['Price'] / average_price_by_zipcode

    df['month listed'] = df['Events'].apply(get_most_recent_month)

    # Filter out the 1% most expensive houses.
    price_99th_percentile = df['Price'].quantile(0.99)
    df_filtered = df[df['Price'] <= price_99th_percentile].copy()

    df_filtered = encode_appliances(df_filtered)

    # dtype=int: pandas >= 2.0 emits bool dummies by default, and the training cell keeps
    # only np.number columns, which would silently drop these features.
    df_filtered = pd.get_dummies(
        df_filtered,
        columns=['Cooling', 'Heating', 'Parking', 'Laundry', 'Home Type'],
        dtype=int,
    )

    df_filled = fill_nan_with_median(df_filtered)

    # Shuffle with a fixed seed and renumber the index.
    df_shuffled = df_filled.sample(frac=1, random_state=41).reset_index(drop=True)

    # Binary target: 1 when the listing spent fewer than 30 days on the market.
    df_shuffled['is_fast_sale'] = (df_shuffled['Most Recent Time on Market'] < 30).astype(int)

    display(df_shuffled.head())
    df_shuffled.info()  # prints its report itself and returns None
    display(df_shuffled['is_fast_sale'].value_counts())
else:
    print("DataFrame not loaded. Skipping subsequent steps.")

**Reasoning**:
The variable `df_shuffled` was not defined because the previous code cell that created and shuffled the dataframe was not executed in the current runtime. To fix this, I will re-execute the code cell that defines and shuffles the dataframe before creating the new target variable.



In [None]:
import pandas as pd
import numpy as np

def fill_nan_with_median(df):
    """Return a copy of ``df`` with NaNs replaced by each column's median.

    Columns are processed one at a time. A column is left untouched when its median
    cannot be computed (non-numeric data) or is itself NaN (column entirely missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        try:
            median_val = df_copy[col].median()
            if not pd.isna(median_val):
                df_copy[col] = df_copy[col].fillna(median_val)
        except (TypeError, ValueError):
            # Median is undefined for this column (strings, mixed objects, or a duplicated
            # column label): skip it. Narrower than a bare `except`, so interrupts and
            # genuine bugs are no longer swallowed.
            continue
    return df_copy


# Impute missing numeric values. fill_nan_with_median copies its input, so no extra .copy()
# is needed to protect df_filtered.
df_filled = fill_nan_with_median(df_filtered)

# Shuffle the rows with a fixed seed and renumber the index 0..n-1.
df_shuffled = df_filled.sample(frac=1, random_state=41).reset_index(drop=True)

# Binary target: 1 when the listing spent fewer than 30 days on the market, else 0.
df_shuffled['is_fast_sale'] = (df_shuffled['Most Recent Time on Market'] < 30).astype(int)

# Preview the frame with the new column.
display(df_shuffled.head())

# Column dtypes and non-null counts; info() prints its report itself and returns None.
df_shuffled.info()

# Class balance of the new target.
display(df_shuffled['is_fast_sale'].value_counts())

**Reasoning**:
The variable `df_filtered` was not defined because the previous code cell that created and filtered the dataframe was not executed in the current runtime. To fix this, I will re-execute the code cell that filters and performs one-hot encoding on the dataframe before creating the new target variable. I will also include the code from the subsequent cells that create `df_shuffled` and the `is_fast_sale` column to complete the subtask.



In [None]:
# Function to split and one-hot encode the 'Appliances' column
def encode_appliances(df):
    """Return a copy of ``df`` with one 0/1 ``Appliance_<name>`` column per appliance.

    The 'Appliances' column holds comma-separated appliance names. Missing or non-string
    cells count as "no appliances". Names are stripped of surrounding whitespace and empty
    tokens are ignored, so "A, B", "A,B" and "A, B, " all encode the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Appliances' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the indicator columns (int), appended in alphabetical order. Indicator
        columns already present under the same name are replaced, so re-running is safe.
    """
    def _split(value):
        # NaN/None (or any non-string) means nothing is listed for this row.
        if not isinstance(value, str):
            return []
        return [token.strip() for token in value.split(',') if token.strip()]

    appliances_list = df['Appliances'].apply(_split)
    all_appliances = sorted({item for items in appliances_list for item in items})

    # Build every indicator first and attach them in one concat; inserting columns one at
    # a time fragments the frame and mutates the caller's object.
    indicators = pd.DataFrame(
        {
            f'Appliance_{appliance}': [int(appliance in items) for items in appliances_list]
            for appliance in all_appliances
        },
        index=df.index,
    )
    stale = [column for column in indicators.columns if column in df.columns]
    return pd.concat([df.drop(columns=stale), indicators], axis=1)

# One-hot encode the appliance list (uses `df` as left by the data-loading cell).
df_filtered = encode_appliances(df)

# One-hot encode the remaining categorical columns.
# dtype=int: pandas >= 2.0 emits bool dummies by default, and the training cell keeps only
# np.number columns, which would silently drop every one of these features.
df_filtered = pd.get_dummies(
    df_filtered,
    columns=['Cooling', 'Heating', 'Parking', 'Laundry', 'Home Type'],
    dtype=int,
)

# Function to fill NaN values with the median
def fill_nan_with_median(df):
    """Return a copy of ``df`` with NaNs replaced by each column's median.

    Columns are processed one at a time. A column is left untouched when its median
    cannot be computed (non-numeric data) or is itself NaN (column entirely missing).
    The input frame is not modified.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        try:
            median_val = df_copy[col].median()
            if not pd.isna(median_val):
                df_copy[col] = df_copy[col].fillna(median_val)
        except (TypeError, ValueError):
            # Median is undefined for this column (strings, mixed objects, or a duplicated
            # column label): skip it. Narrower than a bare `except`, so interrupts and
            # genuine bugs are no longer swallowed.
            continue
    return df_copy

# Impute missing numeric values. fill_nan_with_median copies its input, so no extra .copy()
# is needed to protect df_filtered.
df_filled = fill_nan_with_median(df_filtered)

# Shuffle the rows with a fixed seed and renumber the index 0..n-1.
df_shuffled = df_filled.sample(frac=1, random_state=41).reset_index(drop=True)

# Binary target: 1 when the listing spent fewer than 30 days on the market, else 0.
df_shuffled['is_fast_sale'] = (df_shuffled['Most Recent Time on Market'] < 30).astype(int)

# Preview the frame with the new column.
display(df_shuffled.head())

# Column dtypes and non-null counts; info() prints its report itself and returns None.
df_shuffled.info()

# Class balance of the new target.
display(df_shuffled['is_fast_sale'].value_counts())

**Reasoning**:
The variable `df` was not defined because the previous code cell that loaded the data was not executed in the current runtime. To fix this, I will re-execute the code cell that loads the data and performs initial processing before filtering, one-hot encoding, filling NaNs, shuffling, and creating the new target variable. This will ensure that all necessary data and variables are available.



**Reasoning**:
The previous attempt to load the data failed because the file path was incorrect. Based on the file listing, the data file is not directly in '/content/' or '/content/sample_data/'. Since the original notebook attempted to mount Google Drive, it's likely the data is located there. I will try loading the data from a common Google Drive path. I will include the rest of the code for the subtask as well, assuming the data load is successful.



In [None]:
import pandas as pd
import numpy as np
import json

# Self-contained data preparation for the classification task: load the CSV, engineer
# features on the full data, then filter slow listings, encode categoricals, impute,
# shuffle and add `is_fast_sale`.
# Helpers are defined at cell level so they exist even when the data file is not found.

# Candidate locations for the dataset, tried in order until one exists.
DATA_PATHS = [
    "/content/drive/MyDrive/csvData/floridaHomeDetailsV3.csv",  # Google Drive mount
    "csvData/floridaHomeDetailsV3.csv",  # relative to the working directory
]


def load_first_available(paths):
    """Return the first CSV in ``paths`` that exists, as a DataFrame, or None if none do."""
    for path in paths:
        try:
            return pd.read_csv(path, on_bad_lines='skip')
        except FileNotFoundError:
            continue
    return None


def extract_zipcode_from_url(url):
    """Return the 5-digit ZIP code at the end of a Zillow listing URL's address slug.

    Expects ``.../homedetails/<street>-<city>-<state>-<zip>/<zpid>_zpid/``. Returns the ZIP
    as a string, or NaN when ``url`` is not a string or no ZIP is found.
    """
    import re  # local import: `re` is not among this cell's imports

    if not isinstance(url, str):
        return np.nan
    _, marker, tail = url.partition('/homedetails/')
    if not marker:
        return np.nan
    # The ZIP is the last dash-separated token of the slug (optionally ZIP+4); the first
    # token is the house number.
    match = re.search(r'-(\d{5})(?:-\d{4})?$', tail.split('/')[0])
    return match.group(1) if match else np.nan


def recalculate_average_rent(row):
    """Return 'Average Time on Market' with the most recent listing backed out.

    Computes (old_average * times_rented - most_recent_time) / (times_rented - 1) so the
    feature no longer contains the target. Returns NaN when an input is missing or there
    are fewer than two listings.
    """
    times_rented = row['Times Rented']
    # A time is subtracted from the summed times; subtracting 'Last Rental Price' (as
    # before) mixed dollars into a quantity measured in days.
    most_recent_time = row['Most Recent Time on Market']
    old_average = row['Average Time on Market']
    if pd.isna(times_rented) or pd.isna(most_recent_time) or pd.isna(old_average):
        return np.nan
    if times_rented < 2:  # also rules out division by zero
        return np.nan
    return (old_average * times_rented - most_recent_time) / (times_rented - 1)


def get_most_recent_month(events_string):
    """Return the month (1-12) of the latest 'Listed for rent' or 'Sold' event, else NaN.

    ``events_string`` is JSON text for a list of objects with 'type' and 'date' keys.
    """
    try:
        events = json.loads(events_string)
        # Compare parsed timestamps: string order is only chronological for ISO dates.
        event_dates = [
            pd.to_datetime(event['date'], errors='coerce')
            for event in events
            if event['type'] in ('Listed for rent', 'Sold')
        ]
        event_dates = [stamp for stamp in event_dates if not pd.isna(stamp)]
        if not event_dates:
            return np.nan
        return max(event_dates).month
    except (TypeError, ValueError, KeyError):
        # Non-string input, invalid JSON, or events without the expected keys.
        return np.nan


def encode_appliances(df):
    """Return a copy of ``df`` with one 0/1 ``Appliance_<name>`` column per appliance.

    'Appliances' holds comma-separated names; missing cells count as no appliances.
    The input frame is not modified.
    """
    def _split(value):
        if not isinstance(value, str):
            return []
        return [token.strip() for token in value.split(',') if token.strip()]

    appliances_list = df['Appliances'].apply(_split)
    all_appliances = sorted({item for items in appliances_list for item in items})
    # Build all indicators, then attach them in a single concat.
    indicators = pd.DataFrame(
        {
            f'Appliance_{appliance}': [int(appliance in items) for items in appliances_list]
            for appliance in all_appliances
        },
        index=df.index,
    )
    stale = [column for column in indicators.columns if column in df.columns]
    return pd.concat([df.drop(columns=stale), indicators], axis=1)


def fill_nan_with_median(df):
    """Return a copy of ``df`` with NaNs in each column replaced by that column's median.

    Columns whose median cannot be computed (non-numeric) or is NaN are left as they are.
    """
    df_copy = df.copy()
    for col in df_copy.columns:
        try:
            median_val = df_copy[col].median()
            if not pd.isna(median_val):
                df_copy[col] = df_copy[col].fillna(median_val)
        except (TypeError, ValueError):
            continue  # median undefined for this column
    return df_copy


df = load_first_available(DATA_PATHS)
if df is None:
    print("Error: Data file not found in common paths.")

if df is not None:
    # Fill missing zipcodes from the URL, keeping the column's dtype consistent so the
    # groupby below does not split one area into a numeric and a string group.
    zip_from_url = df['URL'].apply(extract_zipcode_from_url)
    if pd.api.types.is_numeric_dtype(df['Zipcode']):
        zip_from_url = pd.to_numeric(zip_from_url, errors='coerce')
    df['Zipcode'] = df['Zipcode'].fillna(zip_from_url)

    df['Average Time on Market'] = df.apply(recalculate_average_rent, axis=1)

    # Ratio features, computed on all rows (the outlier filter is applied afterwards).
    df['Rent Estimate to actual Price'] = df['Rent Zestimate'] / df['Price']
    average_price_by_zipcode = df.groupby('Zipcode')['Price'].transform('mean')
    df['price to area average by zipcode'] = df['Price'] / average_price_by_zipcode

    df['month listed'] = df['Events'].apply(get_most_recent_month)

    # Drop the slowest 20% of listings: quantile(0.8) keeps rows at or below the 80th
    # percentile of 'Most Recent Time on Market'. No price filter is applied in this cell.
    cutoff = df['Most Recent Time on Market'].quantile(0.8)
    df_filtered = df[df['Most Recent Time on Market'] <= cutoff].copy()

    df_filtered = encode_appliances(df_filtered)

    # dtype=int: pandas >= 2.0 emits bool dummies by default, and the training cell keeps
    # only np.number columns, which would silently drop these features.
    df_filtered = pd.get_dummies(
        df_filtered,
        columns=['Cooling', 'Heating', 'Parking', 'Laundry', 'Home Type'],
        dtype=int,
    )

    df_filled = fill_nan_with_median(df_filtered)

    # Shuffle with a fixed seed and renumber the index.
    df_shuffled = df_filled.sample(frac=1, random_state=41).reset_index(drop=True)

    # Binary target: 1 when the listing spent fewer than 30 days on the market.
    df_shuffled['is_fast_sale'] = (df_shuffled['Most Recent Time on Market'] < 30).astype(int)

    display(df_shuffled.head())
    df_shuffled.info()  # prints its report itself and returns None
    display(df_shuffled['is_fast_sale'].value_counts())
else:
    print("DataFrame not loaded. Skipping subsequent steps.")

**Reasoning**:
The data file was not found in the previously attempted paths. Since the original notebook attempted to mount Google Drive, I will assume the file is accessible through that method and attempt to load it from the path specified in the notebook description: "/content/amenity_analysis_results.csv". I will then proceed with the steps for the current subtask, which is to create the binary target variable 'is_fast_sale'.



**Reasoning**:
The data file was not found at the specified path. I need to check the available files again to determine the correct file path.



In [None]:
import os
# Debugging aid: list the contents of /content/ and /content/sample_data/ to locate the data file.
# NOTE(review): these paths are hardcoded; os.listdir raises if a directory does not exist,
# so this cell only works where both are present.
print(os.listdir('/content/'))
print(os.listdir('/content/sample_data/'))

## Modify model training

### Subtask:
Change the XGBoost model to a classifier (`xgb.XGBClassifier`) and train it using the new binary target variable.


**Reasoning**:
Separate features and the new binary target, select numeric columns, handle missing values, split the data, initialize and train the XGBoost classifier.



In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np

# Separate features (X) from the binary target (y).
# 'Most Recent Time on Market' must be excluded because 'is_fast_sale' is derived
# directly from it (< 30); leaving it in would leak the label into the features.
# 'Price' and the listed text/metadata columns are excluded as well.
X = df_shuffled.drop(columns=['Most Recent Time on Market', 'Price', 'Street Address', 'City', 'State', 'URL', 'Appliances', "Image URLs", "Date Details Fetched", "Events", 'is_fast_sale'])
y = df_shuffled['is_fast_sale']

# Keep only numeric columns and map +/-inf to NaN so that infinite values are
# imputed the same way as ordinary missing values (no in-place mutation).
X = X.select_dtypes(include=np.number).replace([np.inf, -np.inf], np.nan)

# Split BEFORE imputing so that statistics from the test rows never influence
# the values the model is trained on. stratify=y keeps the class ratio equal
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Impute every partition with medians learned from the training rows only.
# X is filled too so that it stays NaN-free and consistent with X_train/X_test.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X = X.fillna(train_medians)

# Initialize and train the XGBoost classifier.
# objective='binary:logistic' -> binary classification with probability output
# eval_metric='logloss'       -> common metric for binary classification
# `use_label_encoder` is intentionally not passed: current XGBoost releases no
# longer accept it and emit an "unused parameter" warning when it is supplied.
xgbc = xgb.XGBClassifier(objective='binary:logistic', n_estimators=400, learning_rate=0.01, random_state=42, eval_metric='logloss')
xgbc.fit(X_train, y_train)

print("XGBoost classification model training complete.")

## Evaluate Classification Model

### Subtask:
Evaluate the model's performance using appropriate metrics for binary classification (e.g., accuracy, precision, recall, F1-score, ROC AUC).

**Reasoning**:
Calculate and display common classification metrics such as accuracy, precision, recall, F1-score, and ROC AUC to assess the performance of the trained XGBoost classifier.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Score the held-out split. Hard class labels feed the threshold-based metrics;
# the second column of predict_proba (positive class) feeds ROC AUC.
y_pred = xgbc.predict(X_test)
y_pred_proba = xgbc.predict_proba(X_test)[:, 1]

# Label-based metrics all share the (y_true, y_pred) call signature.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC AUC is computed from probabilities rather than hard labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report every scalar metric with four decimals, then show the confusion matrix.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    f"ROC AUC: {roc_auc:.4f}",
    "Confusion Matrix:",
    sep="\n",
)
display(conf_matrix)

## SHAP Analysis for Classification

### Subtask:
Generate a SHAP summary plot to visualize feature importance for the classification model.

**Reasoning**:
Use the `shap` library to calculate SHAP values for the test set and generate a summary plot to show the impact of each feature on the classification model's predictions.

In [None]:
import shap

# Create a SHAP explainer object for the trained classifier.
# NOTE(review): if this line raises
#   ValueError: could not convert string to float: '[3.8168877E-1]'
# the bracketed value does not look like feature data (X_test is numeric-only);
# it presumably comes from model metadata parsed by SHAP — check that the
# installed `shap` and `xgboost` versions are compatible. TODO confirm.
explainer = shap.TreeExplainer(xgbc)

# Calculate SHAP values for the test set.
shap_values = explainer.shap_values(X_test)

# Pick the SHAP values of the positive class (class 1: fast sale).
# The return layout depends on the model/SHAP version:
#   * list of per-class arrays        -> take element 1
#   * 3-D array (rows, features, cls) -> take the last-axis slice 1
#   * 2-D array (rows, features)      -> already the positive-class contributions
#     (this is what a binary XGBoost classifier returns; indexing it with [1]
#     would select the second *sample*, not the positive class)
if isinstance(shap_values, list):
    shap_values_positive = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    shap_values_positive = shap_values[..., 1]
else:
    shap_values_positive = shap_values

# Generate the SHAP summary plot for the positive class.
shap.summary_plot(shap_values_positive, X_test)

# Task
Generate a scatter plot of predicted vs actual values, fix the `ValueError: could not convert string to float: '[3.8168877E-1]'` in the SHAP graph, and create a correlation heatmap using the data in "/content/floridaHomeDetailsV3.csv".

## Fix shap error

### Subtask:
Identify and convert any remaining non-numeric data in the feature set `X_test` to a numeric type to resolve the `ValueError` in the SHAP analysis.


**Reasoning**:
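The SHAP analysis failed because of a `ValueError` indicating a string could not be converted to a float. This suggests there are non-numeric values remaining in the `X_test` DataFrame that the SHAP explainer cannot handle. I will iterate through the columns of `X_test`, attempt to convert them to numeric, and then inspect any columns that still have non-numeric data to identify the cause of the error and apply appropriate cleaning. Note, however, that `X_test` was built with `select_dtypes(include=np.number)`, so it should already be fully numeric; a bracketed value such as `'[3.8168877E-1]'` looks more like a serialized model parameter than a feature value, so a version mismatch between `shap` and `xgboost` should also be checked if no non-numeric column is found.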



**Reasoning**:
The scatter plot has been generated. The next subtask is to fix the ValueError in the SHAP graph.



## Generate Scatter Plot of Predicted vs. Actual

**Reasoning**:
Create a scatter plot to visualize the relationship between the actual and predicted 'is_fast_sale' values from the classification model. This will help assess the model's performance visually.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot of actual vs. predicted `is_fast_sale` class labels.
# Both axes are binary (0 / 1), so without help every sample would be drawn on
# one of at most four identical points. A small seeded jitter plus transparency
# makes the density of each (actual, predicted) combination visible while
# keeping the figure reproducible.
jitter_rng = np.random.default_rng(42)
actual_jittered = np.asarray(y_test) + jitter_rng.uniform(-0.15, 0.15, size=len(y_test))
predicted_jittered = np.asarray(y_pred) + jitter_rng.uniform(-0.15, 0.15, size=len(y_pred))

plt.figure(figsize=(8, 6))
sns.scatterplot(x=actual_jittered, y=predicted_jittered, alpha=0.3)
# Only the two class values are meaningful tick positions.
plt.xticks([0, 1])
plt.yticks([0, 1])
# The plotted quantity is the binary fast-sale label, not a number of days.
plt.title('Actual vs. Predicted is_fast_sale')
plt.xlabel('Actual is_fast_sale')
plt.ylabel('Predicted is_fast_sale')
plt.show()

## Generate Correlation Heatmap

**Reasoning**:
Calculate the correlation matrix for the relevant numeric columns in the DataFrame `df_shuffled` and generate a heatmap to visualize the correlations. This will help understand the relationships between the features used in the model.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Pairwise correlations across every numeric column of the shuffled frame
# (this includes the integer `is_fast_sale` column, not only model features).
df_shuffled_numeric = df_shuffled.select_dtypes(include=np.number)
correlation_matrix_shuffled = df_shuffled_numeric.corr()

# Draw the matrix on an 18x16 inch figure with a diverging palette; per-cell
# annotations are left off.
plt.figure(figsize=(18, 16))
heatmap_ax = sns.heatmap(correlation_matrix_shuffled, annot=False, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap of Numeric Features (Shuffled Data)')
plt.show()