In [1]:
from google.colab import files
files.upload()  # Upload your kaggle.json file here


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"ankancode","key":"80fafe2047aa07bb7db1421320fa6a7d"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the competition data
!kaggle competitions download -c house-prices-advanced-regression-techniques
!unzip house-prices-advanced-regression-techniques.zip


Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 266MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score


In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Save 'Id' columns
train_ID = train['Id']
test_ID = test['Id']

# Save target
y = train['SalePrice']

# Drop Id and SalePrice from train
train.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

# Combine for uniform preprocessing
all_data = pd.concat([train, test], axis=0).reset_index(drop=True)
print("Combined shape:", all_data.shape)


Combined shape: (2919, 79)


In [5]:
# Fill 'None' where feature is categorical with missing = no feature
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'MasVnrType']
for col in none_cols:
    all_data[col] = all_data[col].fillna("None")

# Fill 0 where missing means "not present"
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFullBath',
             'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'MasVnrArea']
for col in zero_cols:
    all_data[col] = all_data[col].fillna(0)

# Fill with mode (most frequent)
mode_cols = ['MSZoning', 'KitchenQual', 'Electrical', 'Functional', 'Exterior1st', 'Exterior2nd', 'SaleType']
for col in mode_cols:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Grouped median for LotFrontage
all_data['LotFrontage'] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)

# Drop Utilities - too dominated by 1 value
all_data.drop("Utilities", axis=1, inplace=True)


In [6]:
# Convert numeric to categorical
all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)
all_data['OverallCond'] = all_data['OverallCond'].astype(str)
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

# Label Encoding for ordinal variables
ordinal_cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
                'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure',
                'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley']
for col in ordinal_cols:
    lbl = LabelEncoder()
    all_data[col] = lbl.fit_transform(all_data[col].astype(str))

# Create new features
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = (all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'] +
                         all_data['FullBath'] + 0.5 * all_data['HalfBath'])
all_data['TotalPorchSF'] = (all_data['OpenPorchSF'] + all_data['EnclosedPorch'] +
                            all_data['3SsnPorch'] + all_data['ScreenPorch'])

# One-hot encode
all_data = pd.get_dummies(all_data)

# Feature Scaling
scaler = StandardScaler()
all_data = pd.DataFrame(scaler.fit_transform(all_data), columns=all_data.columns)


In [7]:
X = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]


In [8]:
def rmse_cv(model):
    rmse = -cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=5)
    return rmse.mean()

model = Ridge(alpha=10)
print("Ridge RMSE:", rmse_cv(model))

# Fit on full data
model.fit(X, y)


Ridge RMSE: 34485.3690061799


In [9]:
preds = model.predict(X_test)
submission = pd.DataFrame({
    "Id": test_ID,
    "SalePrice": preds
})
submission.to_csv("submission.csv", index=False)


In [10]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "My submission"


100% 33.6k/33.6k [00:00<00:00, 132kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques