In [1]:
# read data_cleaned.csv
import pandas as pd

data = pd.read_csv("data_cleaned.csv")

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59759 entries, 0 to 59758
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BOROUGH                         59759 non-null  object 
 1   NEIGHBORHOOD                    59759 non-null  object 
 2   LOT                             59759 non-null  int64  
 3   RESIDENTIAL UNITS               59759 non-null  int64  
 4   COMMERCIAL UNITS                59759 non-null  int64  
 5   TOTAL UNITS                     59759 non-null  int64  
 6   LAND SQUARE FEET                38571 non-null  float64
 7   GROSS SQUARE FEET               38020 non-null  float64
 8   TAX CLASS AT TIME OF SALE       59759 non-null  int64  
 9   SALE PRICE                      59759 non-null  float64
 10  SALE YEAR                       59759 non-null  int64  
 11  SALE YEAR_MONTH                 59759 non-null  object 
 12  BUILDING CLASS AT PRESENT       

In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

# Initialize the CatBoostRegressor with categorical features
model = CatBoostRegressor(iterations=100,
                         learning_rate=0.1,
                         depth=6,
                         loss_function='RMSE',
                         eval_metric='RMSE',
                         random_seed=42,
                         cat_features=categorical_cols)  # Specify categorical columns

X = data.drop('SALE PRICE', axis=1)  
y = data['SALE PRICE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Pool for training and testing data
train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
test_pool = Pool(X_test, cat_features=categorical_cols)

# Train the CatBoost model
model.fit(train_pool, verbose=100)

# Make predictions on the test set
y_pred = model.predict(test_pool)

# Calculate RMSE and R^2 scores for evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

0:	learn: 590113.9601698	total: 229ms	remaining: 22.6s
99:	learn: 392854.2250291	total: 1.25s	remaining: 0us
RMSE: 395888.4976105999
R^2: 0.5845559811319112


In [4]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Assuming `data` is your DataFrame and 'SALE PRICE' is the target variable
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
data[categorical_cols] = data[categorical_cols].astype('category')

X = data.drop('SALE PRICE', axis=1)
y = data['SALE PRICE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LGBMRegressor, specifying categorical_feature parameter is not needed as LightGBM will auto-detect them based on dtype
model = LGBMRegressor(num_leaves=31,
                      learning_rate=0.1,
                      n_estimators=100,
                      categorical_feature=categorical_cols)  # This line is optional, LightGBM uses dtype to detect

# Train the model
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='rmse')

# Make predictions
y_pred = model.predict(X_test)

# Calculate RMSE and R^2 scores for evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R^2: {r2}")


Please use categorical_feature argument of the Dataset constructor to pass this parameter.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1422
[LightGBM] [Info] Number of data points in the train set: 47807, number of used features: 14
[LightGBM] [Info] Start training from score 811724.945552
RMSE: 365038.0085748024
R^2: 0.6467819272906778


# Could not do the rest because of the hardware constraints, fyi

In [5]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Calculate the median values for 'LAND SQUARE FEET' and 'GROSS SQUARE FEET'
land_square_feet_median = data['LAND SQUARE FEET'].median()
gross_square_feet_median = data['GROSS SQUARE FEET'].median()

# Impute missing values using the calculated medians
data['LAND SQUARE FEET'].fillna(land_square_feet_median, inplace=True)
data['GROSS SQUARE FEET'].fillna(gross_square_feet_median, inplace=True)

# Assuming 'data' is your DataFrame and 'target_column' is the name of your target variable
target_column = 'SALE PRICE'  # Replace with your actual target column name

# Separate features and target. The target column is not included in the transformations.
X = data.drop(target_column, axis=1)
y = data[target_column]

# Identify categorical and numerical columns from the features only
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Define the ColumnTransformer to one-hot encode categorical variables
# and scale numerical variables. The target variable 'y' is not transformed.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Apply the transformations to the features only
X_transformed = preprocessor.fit_transform(X)

# Retrieve the feature names after transformations
feature_names = preprocessor.get_feature_names_out()

# Proceed with the train/test split or any other operations
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# X_train and X_test are now preprocessed and ready for model training and evaluation


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate RMSE and R^2 scores for evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R^2: {r2}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Create and train the XGBoost Regression model
model = xgb.XGBRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate RMSE and R^2 scores for evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

# Get feature importances
feature_importances = model.feature_importances_

# Check the lengths of feature_names and feature_importances
if len(feature_names) != len(feature_importances):
    print(f"Length of feature_names: {len(feature_names)}")
    print(f"Length of feature_importances: {len(feature_importances)}")
    # Raise an error or handle the mismatch accordingly

# Create a DataFrame to show feature importance
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import shap

numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Separate features and target
X = data_filtered[numerical_cols]
y = data_filtered['SALE PRICE']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (e.g., XGBoost) on the training data
model = xgb.XGBRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Initialize the SHAP explainer with your trained model and the background dataset (X_train)
explainer = shap.Explainer(model, X_train)

# Compute SHAP values for the test set
shap_values = explainer.shap_values(X_test)

# Visualize the SHAP summary plot to understand the impact of features on predictions
shap.summary_plot(shap_values, X_test)


# After all these, my plan was to optimize the selected model using RFE method for feature selection and then perform hyperparameter tuning which I could not manage due to hardware constraints; reason being simply taking too much time