In [87]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

class BoostClass:
    r2 =0
    rmse = 0
    n_estimators = 0
    max_depth = 0
    model = None

    def __init__(self, r2 ,rmse , n_estimators , max_depth,model ):
        self.r2 =r2
        self.rmse = rmse
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.model = model

    def toString(self):
        return "r2: " , self.r2 , ", rmse:" , self.rmse , ", n_estimators:" , self.n_estimators , ", max depth:" , self.max_depth

def fixData(housing):
    """Take out the column to predict before applying this method"""
    # Convert the date column to datetime format
    housing['date'] = pd.to_datetime(housing['date'], format='%Y%m%dT%H%M%S')

    housing['day'] = housing['date'].dt.day
    # Calculate the number of days in the current year
    housing['days_since_april_1st'] = (housing['date'].dt.dayofyear + 365 - 91) % 365

    housing = housing.drop(["price", "id", "date", "day"], axis=1)

    # Add total sqft to housing
    housing["total_sqft"] = housing["sqft_basement"] + housing["sqft_living"]
    # housing["age_since_renovation"] = 2025 - housing["yr_renovated"] if housing["yr_renovated"] != 0 else 2025 - housing["yr_built"]
    # Calculate age since renovation or age since built
    housing["age_since_renovation"] = housing.apply(lambda row: 2025 - row["yr_renovated"] if row["yr_renovated"] != 0 else 2025 - row["yr_built"], axis=1)

    # turn zip code into category
    housing["zipcode"] = housing["zipcode"].astype("category")

    scaler = MinMaxScaler(feature_range=(0, 1))

    for sqft in ['sqft_basement', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_living15', 'sqft_lot15', 'total_sqft']:
        housing[sqft] = scaler.fit_transform(housing[[sqft]])

    return housing

In [20]:
!pip install scikit-learn==1.5.2
# run this line without the '!' in your terminal to have it installed locally
# then you can skip running this



# create days_since_april_1 column
# convert dateTime to days out of 365
# for each value, add 365, subtract april 1st as a number of days
# for each value, mod by 365




[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [88]:
# Load dataset
housing = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
y = np.log1p(housing["price"])  # Log transform target

X = fixData(housing)
print(X.head())
X = pd.get_dummies(X)

   bedrooms  bathrooms  sqft_living  sqft_lot  floors  waterfront  view  \
0         4       3.25     0.261887  0.003569     2.0           0     0   
1         3       1.75     0.088302  0.006253     1.0           0     0   
2         4       1.00     0.079245  0.004535     1.5           0     2   
3         3       1.75     0.086792  0.006468     1.0           0     0   
4         2       1.50     0.112453  0.003633     1.0           0     0   

   condition  grade  sqft_above  ...  yr_built  yr_renovated  zipcode  \
0          3      8    0.268640  ...      2007             0    98038   
1          2      7    0.082237  ...      1979             0    98023   
2          3      7    0.115132  ...      1914             0    98116   
3          3      8    0.126096  ...      1985             0    98023   
4          4      7    0.086623  ...      1947             0    98117   

       lat     long  sqft_living15  sqft_lot15  days_since_april_1st  \
0  47.3862 -122.048       0.495784    

In [89]:
# DO NOT RUN MORE THAN ONCE, we want to keep these variables
boostClassList = []
bestClass = None
# best RMSE so far is ~127000
# 1700 n-estimators, 5 depth

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       20000 non-null  int64  
 1   bathrooms      20000 non-null  float64
 2   sqft_living    20000 non-null  int64  
 3   sqft_lot       20000 non-null  int64  
 4   floors         20000 non-null  float64
 5   waterfront     20000 non-null  int64  
 6   view           20000 non-null  int64  
 7   condition      20000 non-null  int64  
 8   grade          20000 non-null  int64  
 9   sqft_above     20000 non-null  int64  
 10  sqft_basement  20000 non-null  int64  
 11  yr_built       20000 non-null  int64  
 12  yr_renovated   20000 non-null  int64  
 13  zipcode        20000 non-null  int64  
 14  lat            20000 non-null  float64
 15  long           20000 non-null  float64
 16  sqft_living15  20000 non-null  int64  
 17  sqft_lot15     20000 non-null  int64  
dtypes: flo

In [97]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create XGBoost regressor
#1720, depth 6

best_l = None
best_cols = None

for l in range(1, 11, 1):
    l = l/1000.0
    for colsample in range(3,11,1):
        colsample = colsample/10
        n = 1720
        d = 6
        model = xgb.XGBRegressor(objective="reg:squarederror", eval_metric="rmse", subsample=0.6,
                            n_estimators=n, max_depth=d, learning_rate=l, colsample_bytree=colsample)

        # Train model
        model.fit(X_train, y_train)

        # Make predictions on test set
        y_pred = model.predict(X_test)
        y_non_log_test = np.expm1(y_test)
        y_non_log_pred = np.expm1(y_pred)

        if (best_l == None):
            best_l = l
        if (best_cols == None):
            best_cols = colsample


        rmse = root_mean_squared_error(y_non_log_test, y_non_log_pred)
        # return from a log of cost to cost
        # rmse = np.expm1(rmse)
        r2 = r2_score(y_test, y_pred)  # Compute R^2
        r2_non_log = r2_score(y_non_log_test, y_non_log_pred)

        print(f"R² Score: {r2:.4f} - {r2_non_log:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print("estimators: ",n,", depth: ",d)

        boostClass = BoostClass(r2, rmse, n, d, model)
        boostClassList.append(boostClass)
        if (bestClass != None):
            if (boostClass.rmse < bestClass.rmse):
                bestClass = boostClass
                best_l = l
                best_cols = colsample
        else:
            bestClass = boostClass
        #r2 =0,rmse = 0, n_estimators = 0, max_depth = 0,model = None
print("Done!")

R² Score: 0.7560 - 0.6177
RMSE: 240133.7821
estimators:  1720 , depth:  6
R² Score: 0.7807 - 0.6455
RMSE: 231225.0406
estimators:  1720 , depth:  6
R² Score: 0.7956 - 0.6623
RMSE: 225695.8794
estimators:  1720 , depth:  6
R² Score: 0.8050 - 0.6721
RMSE: 222381.1891
estimators:  1720 , depth:  6
R² Score: 0.8106 - 0.6788
RMSE: 220111.1730
estimators:  1720 , depth:  6
R² Score: 0.8145 - 0.6830
RMSE: 218651.0569
estimators:  1720 , depth:  6
R² Score: 0.8176 - 0.6858
RMSE: 217690.4189
estimators:  1720 , depth:  6
R² Score: 0.8195 - 0.6877
RMSE: 217015.6205
estimators:  1720 , depth:  6
R² Score: 0.8575 - 0.7991
RMSE: 174064.8262
estimators:  1720 , depth:  6
R² Score: 0.8695 - 0.8150
RMSE: 167057.5290
estimators:  1720 , depth:  6
R² Score: 0.8760 - 0.8243
RMSE: 162800.9947
estimators:  1720 , depth:  6
R² Score: 0.8796 - 0.8306
RMSE: 159842.5073
estimators:  1720 , depth:  6
R² Score: 0.8815 - 0.8339
RMSE: 158264.5429
estimators:  1720 , depth:  6
R² Score: 0.8826 - 0.8362
RMSE: 157159

KeyboardInterrupt: 

In [99]:
# Evaluate performance
print(bestClass.toString())
print(best_l, best_cols)

('r2: ', 0.9058674026993602, ', rmse:', np.float64(126122.8966237912), ', n_estimators:', 1700, ', max depth:', 5)
0.001 0.3


In [None]:
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y_test, y_pred)
# print(cm)


True Positives: 0
True Negatives: 4000


In [None]:
# Load the holdout dataset
holdout_data = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv')

# Drop unnecessary columns
X_holdout = holdout_data.drop(["id", "date"], axis=1)

# Predict using the trained model
y_holdout_pred = model.predict(X_holdout)

# Convert predictions back to original scale
y_holdout_pred_original = np.expm1(y_holdout_pred)

# Create DataFrame with correct format
predictions_df = pd.DataFrame({"price": y_holdout_pred_original})


In [None]:

# Save in the required format
team_name = "team8"  # Replace with your actual team name
filename = f"{team_name}-module3-predictions.csv"
predictions_df.to_csv(filename, index=False)

print(f"Predictions saved as {filename}")

Predictions saved as team8-module3-predictions.csv
