In [6]:
# Source for the code: https://www.kaggle.com/code/maarklam/gradient-boosted-regression-with-pipelines/notebook

# Group members:
# Joshua Brenzinger
# Pascal Breucker

In [7]:
import os
from warnings import simplefilter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

simplefilter(action='ignore', category=FutureWarning)

In [8]:
# Load the full dataset; the path is relative to the notebook's working directory.
ames_csv_path = '../ames.csv'
df = pd.read_csv(ames_csv_path)

In [9]:
# Split the loaded frame by row position: the first 2001 rows are used for
# training / cross-validation, the remaining rows are held out for evaluation.
n_train_rows = 2001
train_rows = df[:n_train_rows]
holdout_rows = df[n_train_rows:]

# Training target and features (target column removed from the features).
y = train_rows['Sale_Price'].copy()
X = train_rows.drop(columns=['Sale_Price'])

# Hold-out frame (target still included), its target, and its feature-only version.
X_test = holdout_rows.copy()
y_test = X_test['Sale_Price'].copy()
X_test2 = X_test.drop(columns=['Sale_Price'])

In [13]:
# Preview the first five rows of the raw frame (5 is the default for head()).
df.head(5)

Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,Utilities,Lot_Config,...,Fence,Misc_Feature,Misc_Val,Mo_Sold,Year_Sold,Sale_Type,Sale_Condition,Sale_Price,Longitude,Latitude
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141,31770,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,,0,5,2010,WD,Normal,215000,-93.619754,42.054035
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80,11622,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,6,2010,WD,Normal,105000,-93.619756,42.053014
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81,14267,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,Gar2,12500,6,2010,WD,Normal,172000,-93.619387,42.052659
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93,11160,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,No_Fence,,0,4,2010,WD,Normal,244000,-93.61732,42.051245
4,Two_Story_1946_and_Newer,Residential_Low_Density,74,13830,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,3,2010,WD,Normal,189900,-93.638933,42.060899


In [14]:
# Partition the feature columns by dtype so each group can be given its own
# preprocessing branch.
# Every numeric dtype counts as numerical (not only int64/float64), and every
# remaining column is treated as categorical. This way no column ends up in
# neither list, where a ColumnTransformer with the default remainder='drop'
# would discard it without warning.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
_numerical_lookup = set(numerical_cols)
categorical_cols = [col for col in X.columns if col not in _numerical_lookup]

In [15]:
# Numeric branch: fill missing values with SimpleImputer's default strategy (mean).
numerical_transformer = SimpleImputer()

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Gradient-boosted regressor with a fixed seed.
model = XGBRegressor(random_state=0)

# Preprocessing and model chained into a single estimator.
pipeline = Pipeline(steps=[('preprocess', preprocessor), ('model', model)])

# Hyper-parameter candidates; the 'model__' prefix addresses the 'model' step.
search_space = {
    'model__n_estimators': [2000, 3000],
    'model__learning_rate': [0.01, 0.05],
    'model__min_child_weight': [0, 1],
}

# Exhaustive search over all candidate combinations with 10-fold
# cross-validation, scored by R^2.
grid = GridSearchCV(pipeline, param_grid=search_space, cv=10, scoring='r2')

grid.fit(X, y)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(),
                                                                         ['Lot_Frontage',
                                                                          'Lot_Area',
                                                                          'Year_Built',
                                                                          'Year_Remod_Add',
                                                                          'Mas_Vnr_Area',
                                                                          'BsmtFin_SF_1',
                                                                          'BsmtFin_SF_2',
                                                                          'Bsmt_Unf_SF',
                                         

In [16]:
# Report the winning hyper-parameter combination and its cross-validated score.
print(
    f"Best model parameters: {grid.best_params_}",
    f"Best score: {grid.best_score_}",
    sep="\n",
)

Best model parameters: {'model__learning_rate': 0.05, 'model__min_child_weight': 0, 'model__n_estimators': 3000}
Best score: 0.8673040335499806


In [17]:
# Predict sale prices for the held-out rows with the fitted grid search.
# X_test2 is the hold-out frame without the target column.
holdout_features = X_test2
y_pred = grid.predict(holdout_features)

In [18]:
# Score the hold-out predictions against the true sale prices.
# The metric functions come from the notebook's import cell at the top, so
# all dependencies are declared in one place.
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 15648.011651473358
MSE: 768928095.4808434
R2: 0.8703462471681407


In [9]:
# Assemble the submission table: one row per hold-out record, identified by
# its index label in the hold-out frame, next to the predicted price.
submission_columns = {
    'Id': X_test2.index,
    'Sale_Price': y_pred,
}
output = pd.DataFrame(submission_columns)

In [11]:
# Write the submission table to disk without the DataFrame's row index.
submission_path = 'submission.csv'
output.to_csv(submission_path, index=False)
