In [29]:
import pandas as pd
import numpy as np
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import random
import os

In [30]:
def convert_train(df):
    les = {}
    for col in df.columns:
        if df[col].dtype == object:
            le = preprocessing.LabelEncoder()
            df[col] = le.fit_transform(df[col].values)  
            les[col] = le
        if df[col].dtype == float:
            df[col] = df[col].fillna(0)
            df[col] = df[col].apply(lambda x: round(x))
            df[col] = df[col].astype(int)      
    return df, les

In [31]:
def convert_test(df, les):
    for col, le in les.items():
        le = preprocessing.LabelEncoder()
        df[col] = le.fit_transform(df[col].values)  
        les[col] = le
    for col in df.columns:
        if df[col].dtype == float:
            df[col] = df[col].fillna(0)
            df[col] = df[col].apply(lambda x: round(x))
            df[col] = df[col].astype(int)  
    return df

In [66]:
import pandas as pd

def clean_dataframe(df):
    # Define mappings for categorical features
    mappings = {
        'MSZoning': {
            'A': 0,
            'C': 1,
            'FV': 2,
            'I': 3,
            'RH': 4,
            'RL': 5,
            'RP': 6,
            'RM': 7
        },
        'Street': {
            'Grvl': 0,
            'Pave': 1
        },
        'Alley': {
            'Grvl': 1,
            'Pave': 2,
            'NA': 0
        },
        'LotShape': {
            'Reg': 0,
            'IR1': 1,
            'IR2': 2,
            'IR3': 3
        },
        'LandContour': {
            'Lvl': 0,
            'Bnk': 1,
            'HLS': 2,
            'Low': 3
        },
        'Utilities': {
            'AllPub': 0,
            'NoSewr': 1,
            'NoSeWa': 2,
            'ELO': 3
        },
        'LotConfig': {
            'Inside': 0,
            'Corner': 1,
            'CulDSac': 2,
            'FR2': 3,
            'FR3': 4
        },
        'LandSlope': {
            'Gtl': 0,
            'Mod': 1,
            'Sev': 2
        },
        'Neighborhood': {
            'Blmngtn': 0,
            'Blueste': 1,
            'BrDale': 2,
            'BrkSide': 3,
            'ClearCr': 4,
            'CollgCr': 5,
            'Crawfor': 6,
            'Edwards': 7,
            'Gilbert': 8,
            'IDOTRR': 9,
            'MeadowV': 10,
            'Mitchel': 11,
            'Names': 12,
            'NoRidge': 13,
            'NPkVill': 14,
            'NridgHt': 15,
            'NWAmes': 16,
            'OldTown': 17,
            'SWISU': 18,
            'Sawyer': 19,
            'SawyerW': 20,
            'Somerst': 21,
            'StoneBr': 22,
            'Timber': 23,
            'Veenker': 24
        },
        'Condition1': {
            'Artery': 0,
            'Feedr': 1,
            'Norm': 2,
            'RRNn': 3,
            'RRAn': 4,
            'PosN': 5,
            'PosA': 6,
            'RRNe': 7,
            'RRAe': 8
        },
        'Condition2': {
            'Artery': 0,
            'Feedr': 1,
            'Norm': 2,
            'RRNn': 3,
            'RRAn': 4,
            'PosN': 5,
            'PosA': 6,
            'RRNe': 7,
            'RRAe': 8
        },
        'BldgType': {
            '1Fam': 0,
            '2FmCon': 1,
            'Duplx': 2,
            'TwnhsE': 3,
            'TwnhsI': 4
        },
        'HouseStyle': {
            '1Story': 0,
            '1.5Fin': 1,
            '1.5Unf': 2,
            '2Story': 3,
            '2.5Fin': 4,
            '2.5Unf': 5,
            'SFoyer': 6,
            'SLvl': 7
        },
        'RoofStyle': {
            'Flat': 0,
            'Gable': 1,
            'Gambrel': 2,
            'Hip': 3,
            'Mansard': 4,
            'Shed': 5
        },
        'RoofMatl': {
            'ClyTile': 0,
            'CompShg': 1,
            'Membran': 2,
            'Metal': 3,
            'Roll': 4,
            'Tar&Grv': 5,
            'WdShake': 6,
            'WdShngl': 7
        },
        'ExterQual': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4
        },
        'ExterCond': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4
        },
        'Foundation': {
            'BrkTil': 0,
            'CBlock': 1,
            'PConc': 2,
            'Slab': 3,
            'Stone': 4,
            'Wood': 5
        },
        'BsmtQual': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4,
            'NA': 5
        },
        'BsmtCond': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4,
            'NA': 5
        },
        'BsmtExposure': {
            'Gd': 0,
            'Av': 1,
            'Mn': 2,
            'No': 3,
            'NA': 4
        },
        'BsmtFinType1': {
            'GLQ': 0,
            'ALQ': 1,
            'BLQ': 2,
            'Rec': 3,
            'LwQ': 4,
            'Unf': 5,
            'NA': 6
        },
        'BsmtFinType2': {
            'GLQ': 0,
            'ALQ': 1,
            'BLQ': 2,
            'Rec': 3,
            'LwQ': 4,
            'Unf': 5,
            'NA': 6
        },
        'Heating': {
            'Floor': 0,
            'GasA': 1,
            'GasW': 2,
            'Grav': 3,
            'OthW': 4,
            'Wall': 5
        },
        'HeatingQC': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4
        },
        'CentralAir': {
            'N': 0,
            'Y': 1
        },
        'Electrical': {
            'SBrkr': 0,
            'FuseA': 1,
            'FuseF': 2,
            'FuseP': 3,
            'Mix': 4
        },
        'KitchenQual': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4
        },
        'Functional': {
            'Typ': 0,
            'Min1': 1,
            'Min2': 2,
            'Mod': 3,
            'Maj1': 4,
            'Maj2': 5,
            'Sev': 6,
            'Sal': 7
        },
        'FireplaceQu': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4,
            'NA': 5
        },
        'GarageType': {
            '2Types': 0,
            'Attchd': 1,
            'Basment': 2,
            'BuiltIn': 3,
            'CarPort': 4,
            'Detchd': 5,
            'NA': 6
        },
        'GarageFinish': {
            'Fin': 0,
            'RFn': 1,
            'Unf': 2,
            'NA': 3
        },
        'GarageQual': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4,
            'NA': 5
        },
        'GarageCond': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'Po': 4,
            'NA': 5
        },
        'PavedDrive': {
            'Y': 0,
            'P': 1,
            'N': 2
        },
        'PoolQC': {
            'Ex': 0,
            'Gd': 1,
            'TA': 2,
            'Fa': 3,
            'NA': 4
        },
        'Fence': {
            'GdPrv': 0,
            'MnPrv': 1,
            'GdWo': 2,
            'MnWw': 3,
            'NA': 4
        },
        'MiscFeature': {
            'Elev': 0,
            'Gar2': 1,
            'Othr': 2,
            'Shed': 3,
            'TenC': 4,
            'NA': 5
        },
        'SaleType': {
            'WD': 0,
            'CWD': 1,
            'VWD': 2,
            'New': 3,
            'COD': 4,
            'Con': 5,
            'ConLw': 6,
            'ConLI': 7,
            'ConLD': 8,
            'Oth': 9
        },
        'SaleCondition': {
            'Normal': 0,
            'Abnorml': 1,
            'AdjLand': 2,
            'Alloca': 3,
            'Family': 4,
            'Partial': 5
        }
    }
    
    # Handle missing values for remaining columns
    df.fillna(0, inplace=True)

    # Apply mappings to categorical columns
    for col, mapping in mappings.items():
        df[col] = df[col].apply(lambda x: mapping.get(x))
    
    # Convert specific columns to integers
    int_columns = [
        'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 
        'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
        'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 
        'MiscVal','MSSubClass'
    ]
    df[int_columns] = df[int_columns].fillna(0).astype(int)
    
    # Convert float columns
    float_columns = ['LotFrontage', 'LotArea', 'MasVnrArea']
    df[float_columns] = df[float_columns].astype(float)
    df.fillna(0, inplace=True)

    return df


In [67]:
def eval(y_test, y_pred):
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R²): {r2}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape}%")
    return rmse

In [68]:
def create_submission(model, df):
    str_date = datetime.today().strftime('%Y_%m_%d_%H_%M_%S')
    path_submission = os.path.join("/workspaces/ml_challenge/data", "submission/")
    filename = f"submission_{str_date}.csv"
    filepath_submission = os.path.join(path_submission, filename)
    ids_series = df["Id"]
    predicted_prices = model.predict(df)
    predicted_prices_series = pd.Series(predicted_prices, name="SalePrice")

    submission = pd.DataFrame({
        'id': ids_series,
        'SalePrice': predicted_prices_series
    }).set_index('id')
    submission.to_csv(path_or_buf=filepath_submission)

In [69]:
df = pd.read_csv("/workspaces/ml_challenge/data/train.csv")
df_test = pd.read_csv("/workspaces/ml_challenge/data/test.csv")

In [70]:
df = clean_dataframe(df)

In [72]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,5.0,65.0,8450.0,1,0.0,0,0,0,...,0,0.0,0.0,0.0,0,2,2008,0,0,208500
1,2,20,5.0,80.0,9600.0,1,0.0,0,0,0,...,0,0.0,0.0,0.0,0,5,2007,0,0,181500
2,3,60,5.0,68.0,11250.0,1,0.0,1,0,0,...,0,0.0,0.0,0.0,0,9,2008,0,0,223500
3,4,70,5.0,60.0,9550.0,1,0.0,1,0,0,...,0,0.0,0.0,0.0,0,2,2006,0,1,140000
4,5,60,5.0,84.0,14260.0,1,0.0,1,0,0,...,0,0.0,0.0,0.0,0,12,2008,0,0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,5.0,62.0,7917.0,1,0.0,0,0,0,...,0,0.0,0.0,0.0,0,8,2007,0,0,175000
1456,1457,20,5.0,85.0,13175.0,1,0.0,0,0,0,...,0,0.0,1.0,0.0,0,2,2010,0,0,210000
1457,1458,70,5.0,66.0,9042.0,1,0.0,0,0,0,...,0,0.0,0.0,3.0,2500,5,2010,0,0,266500
1458,1459,20,5.0,68.0,9717.0,1,0.0,0,0,0,...,0,0.0,0.0,0.0,0,4,2010,0,0,142125


In [36]:
df, les = convert_train(df)
df_test = convert_test(df_test, les)

In [37]:
X = df.loc[:, df.columns != 'SalePrice']
Y = df.loc[:, 'SalePrice']

seed = 42
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random.randint(0,100))

In [46]:
model = xgb.XGBRegressor(learning_rate =0.01,
n_estimators=3000,
max_depth=6,
subsample=0.6,
colsample_bytree=0.75,
nthread=4,
scale_pos_weight=1,
seed=random.randint(0,100))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
eval(y_test, y_pred)
create_submission(model, df_test)

Root Mean Squared Error (RMSE): 27669.886609984154
