Andreas Bjelland Berg *(ID: 767921)*, Jitka Polaskova *(ID: 566613)* and Andreas Brennsæter *(ID: 507400)*

**Kaggle competition name:** Moscow housing

**Kaggle team name:** Team 95

In [163]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', None)

%matplotlib inline

# Read files
We read the raw files provided by Kaggle directly from GitHub, to simplify running the notebook. These files have **not** been preprocessed, and are identical to the ones given on Kaggle.

In [164]:
def read_file(url):
  """Read a CSV file from a GitHub URL into a DataFrame.

  Args:
    url: URL of the CSV file, without a query string. "?raw=true" is
      appended before the file is read.

  Returns:
    The file contents parsed by pandas.read_csv.
  """
  return pd.read_csv(f"{url}?raw=true")

# Training set: apartment listings and the buildings they belong to.
url = "https://github.com/andbren/TDT-4173/blob/main/dataset/apartments_train.csv"
apartments = read_file(url)

url = "https://github.com/andbren/TDT-4173/blob/main/dataset/buildings_train.csv"
buildings = read_file(url)

# Sanity check before the left join: every apartment should reference a known building.
all_buildings_known = apartments["building_id"].isin(buildings["id"]).all()
print(f'All apartments have an associated building: {all_buildings_known}')

# Attach the building columns to each apartment, then index the result by apartment id.
buildings_by_id = buildings.set_index('id')
data = apartments.merge(buildings_by_id, how='left', left_on='building_id', right_index=True)
data = data.set_index("id")

All apartments have an associated building: True


In [165]:
# Test set: apartment listings and the buildings they belong to.
url = "https://github.com/andbren/TDT-4173/blob/main/dataset/apartments_test.csv"
apartments_test = read_file(url)

url = "https://github.com/andbren/TDT-4173/blob/main/dataset/buildings_test.csv"
buildings_test = read_file(url)

# Check the *test* frames here: every test apartment should reference a building
# that exists in buildings_test, otherwise the left join below yields NaN columns.
print(f'All apartments have an associated building: {apartments_test.building_id.isin(buildings_test.id).all()}')

# Attach the building columns to each test apartment, then index by apartment id.
data_test = pd.merge(apartments_test, buildings_test.set_index('id'), how='left', left_on='building_id', right_index=True)
data_test.set_index("id", inplace=True)

All apartments have an associated building: True


# Define target functions
RMSLE is missing from a lot of ML libraries, so we define it ourselves.

In [166]:
def root_mean_squared_log_error(y_true, y_pred):
    """Compute the root mean squared logarithmic error (RMSLE).

    Equivalent to sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5.

    Args:
      y_true: array-like of true values; all values must be >= 0.
      y_pred: array-like of predicted values; all values must be >= 0.

    Returns:
      The square root of the mean of (log1p(y_pred) - log1p(y_true)) ** 2.

    Raises:
      AssertionError: if any value in y_true or y_pred is negative.
    """
    for values in (y_true, y_pred):
        assert (values >= 0).all()
    # log1p(x) = log(1 + x), so zero-valued inputs are handled.
    residual = np.log1p(y_pred) - np.log1p(y_true)
    mean_squared_residual = np.mean(np.square(residual))
    return mean_squared_residual ** 0.5

def evaluate_predictions(predictions: pd.DataFrame, y_true: pd.DataFrame):
    """Score predictions with RMSLE, the same way as done when uploading to Kaggle.

    Args:
      predictions: predicted values, in the same order as y_true.
      y_true: the true values.

    Returns:
      The root mean squared logarithmic error between y_true and predictions.

    Example:
      >>> # model = a previously trained model
      >>> results = model.predict(X_valid)
      >>> score = evaluate_predictions(results, y_valid)
    """
    score = root_mean_squared_log_error(y_true=y_true, y_pred=predictions)
    return score

def lightgbm_feval(y_true, y_pred):
    """Custom evaluation metric passed to LightGBM as eval_metric.

    Both inputs are mapped through expm1 before scoring, undoing the log1p
    transform applied to the price target elsewhere in this notebook.

    Returns:
      A tuple ("RMSLE", score, False); the trailing False is presumably
      LightGBM's is_higher_better flag (verify against the LightGBM docs).
    """
    price_true = np.expm1(y_true)
    price_pred = np.expm1(y_pred)
    score = evaluate_predictions(price_pred, price_true)
    return "RMSLE", score, False

# Preprocessing

## Fill in missing values

In [167]:
# Missing categorical values get their own UNKNOWN category code.
unknown_category_codes = {
    "seller":         4,   # Add a new category to seller - UNKNOWN = 4
    "layout":         3,   # Add a new category to layout - UNKNOWN = 3
    "windows_court":  2,   # Change "windows_court" to categorical. New category - UNKNOWN = 2
    "windows_street": 2,   # Change "windows_street" to categorical. New category - UNKNOWN = 2
    "condition":      4,   # Add a new category to condition - UNKNOWN = 4
    "new":            2,   # Change "new" to be categorical. New category - UNKNOWN = 2
    "district":       12,  # Add new category to district - UNKNOWN = 12
    "material":       7,   # Add new category to material - UNKNOWN = 7
    "parking":        3,   # Add new category to parking - UNKNOWN = 3
    "heating":        4,   # Add new category to heating - UNKNOWN = 4
}

# Missing counts and flags are filled with zero.
zero_filled_columns = [
    "bathrooms_shared",
    "bathrooms_private",
    "balconies",
    "loggias",
    "phones",
    "elevator_without",
    "elevator_passenger",
    "elevator_service",
    "garbage_chute",
]

# Missing continuous values are filled with the median of the training data
# (Series.median skips NaN by default).
median_filled_columns = [
    "area_kitchen",
    "area_living",
    "ceiling",
    "constructed",
    "latitude",
    "longitude",
]

default_values = {
    **unknown_category_codes,
    **{column: 0 for column in zero_filled_columns},
    **{column: data[column].median() for column in median_filled_columns},
}

# The same defaults (derived from the training data) are applied to both frames.
for frame in (data, data_test):
    frame.fillna(value=default_values, inplace=True)

## Drop duplicates
(Not done for the CSV that gives good results, so the code below is left commented out.)

In [168]:
# print(f"Duplicated rows in data: ( {data.duplicated().sum()} ) of ( {data.shape[0]} )")
# data.drop_duplicates(inplace=True)
# print(f"Duplicated rows in data after removal: ( {data.duplicated().sum()} ) of ( {data.shape[0]} )")

## Remove outliers in data

In [169]:
_rows = len(data)
print(f"Data rows before removing outliers: ( {_rows} )")

# Each rule is a row-wise condition, so the rules can be combined into a
# single mask and the matching rows dropped in one go.
very_expensive = data["price"] > 0.5e9
extreme_price = data["price"] > 1.5e9
expensive_from_seller_1 = very_expensive & (data["seller"] == 1)
huge_living_area = data["area_living"] > 600
expensive_and_old = very_expensive & (data["constructed"] > 1900) & (data["constructed"] < 1925)

outlier_mask = extreme_price | expensive_from_seller_1 | huge_living_area | expensive_and_old
data.drop(data[outlier_mask].index, inplace=True)

print(f"Data rows after removing outliers: ( {len(data)} ), removed ( {_rows - len(data)} ) rows")

Data rows before removing outliers: ( 23285 )
Data rows after removing outliers: ( 23277 ), removed ( 8 ) rows


## Transform latitude/longitude-outliers in test data

In [170]:
# Transform extreme outliers in data_test
out_of_range = (data_test['latitude'] < 50) | (data_test['longitude'] < 30) | (data_test['longitude'] > 40)
print(
    "Test rows where latitude < 50 || longitude < 30 || longitude > 40: (",
    data_test[out_of_range].shape[0],
    ") of total rows ( ", data_test.shape[0], ")"
)

# Pull each out-of-range coordinate back to the nearest value that is within range.
lowest_valid_latitude = data_test.loc[data_test["latitude"] >= 50, "latitude"].min()
data_test.loc[data_test["latitude"] < 50, "latitude"] = lowest_valid_latitude

lowest_valid_longitude = data_test.loc[data_test["longitude"] >= 30, "longitude"].min()
data_test.loc[data_test["longitude"] < 30, "longitude"] = lowest_valid_longitude

highest_valid_longitude = data_test.loc[data_test["longitude"] <= 40, "longitude"].max()
data_test.loc[data_test["longitude"] > 40, "longitude"] = highest_valid_longitude

out_of_range = (data_test['latitude'] < 50) | (data_test['longitude'] < 30) | (data_test['longitude'] > 40)
print(
    "Test rows where latitude < 50 || longitude < 30 || longitude > 40 after transform: (",
    data_test[out_of_range].shape[0],
    ") of total rows ( ", data_test.shape[0], ")"
)

Test rows where latitude < 50 || longitude < 30 || longitude > 40: ( 7 ) of total rows (  9937 )
Test rows where latitude < 50 || longitude < 30 || longitude > 40 after transform: ( 0 ) of total rows (  9937 )


## Transform ceiling errors, likely from input errors

In [171]:
ceiling_too_high = data["ceiling"] > 10
print(f"Data rows where ceiling > 10: ( {data[ceiling_too_high].shape[0]} ) of total rows ( {data.shape[0]} )")

# Two passes: values above 100 are divided by 10 twice, values in (10, 100] once.
for threshold in (100, 10):
    data.loc[data["ceiling"] > threshold, "ceiling"] /= 10

ceiling_too_high = data["ceiling"] > 10
print(f"Data rows where ceiling > 10 after transform: ( {data[ceiling_too_high].shape[0]} ) of total rows ( {data.shape[0]} )")

Data rows where ceiling > 10: ( 23 ) of total rows ( 23277 )
Data rows where ceiling > 10 after transform: ( 0 ) of total rows ( 23277 )


In [172]:
ceiling_too_high = data_test["ceiling"] > 10
print(f"Test rows where ceiling > 10: ( {data_test[ceiling_too_high].shape[0]} ) of total rows ( {data_test.shape[0]} )")

# Two passes: values above 100 are divided by 10 twice, values in (10, 100] once.
for threshold in (100, 10):
    data_test.loc[data_test["ceiling"] > threshold, "ceiling"] /= 10

ceiling_too_high = data_test["ceiling"] > 10
print(f"Test rows where ceiling > 10 after transform: ( {data_test[ceiling_too_high].shape[0]} ) of total rows ( {data_test.shape[0]} )")

Test rows where ceiling > 10: ( 16 ) of total rows ( 9937 )
Test rows where ceiling > 10 after transform: ( 0 ) of total rows ( 9937 )


## Fix apartments where area_living + area_kitchen > area_total

In [173]:
# Set area_living and area_kitchen to be a ratio * area_total if their sum is higher than area_total
# Mean living/total and kitchen/total area ratios over train and test combined.
# pd.concat is used because Series.append was removed in pandas 2.0.
area_total_all = pd.concat([data["area_total"], data_test["area_total"]])
area_living_all = pd.concat([data["area_living"], data_test["area_living"]])
area_kitchen_all = pd.concat([data["area_kitchen"], data_test["area_kitchen"]])

living_to_total_ratio = np.mean(area_living_all / area_total_all)
kitchen_to_total_ratio = np.mean(area_kitchen_all / area_total_all)

In [174]:
exceeds_total = (data["area_living"] + data["area_kitchen"]) > data["area_total"]
print(
    "Data rows where area_living + area_kitchen > area_total: (",
    data[exceeds_total].shape[0],
    ") of total rows: (", data.shape[0], ")"
)

# First replace the kitchen area with the mean kitchen share of the total area.
data.loc[exceeds_total, "area_kitchen"] = kitchen_to_total_ratio * data["area_total"]

# The condition is re-evaluated with the corrected kitchen area, so the living
# area is only replaced in rows whose sum is still larger than the total area.
still_exceeds_total = (data["area_living"] + data["area_kitchen"]) > data["area_total"]
data.loc[still_exceeds_total, "area_living"] = living_to_total_ratio * data["area_total"]

exceeds_total = (data["area_living"] + data["area_kitchen"]) > data["area_total"]
print(
    "Data rows where area_living + area_kitchen > area_total after transform: (",
    data[exceeds_total].shape[0],
    ") of total rows: (", data.shape[0], ")"
)

Data rows where area_living + area_kitchen > area_total: ( 1095 ) of total rows: ( 23277 )
Data rows where area_living + area_kitchen > area_total after transform: ( 0 ) of total rows: ( 23277 )


In [175]:
exceeds_total = (data_test["area_living"] + data_test["area_kitchen"]) > data_test["area_total"]
print(
    "Test rows where area_living + area_kitchen > area_total: (",
    data_test[exceeds_total].shape[0],
    ") of total rows: (", data_test.shape[0], ")"
)

# First replace the kitchen area with the mean kitchen share of the total area.
data_test.loc[exceeds_total, "area_kitchen"] = kitchen_to_total_ratio * data_test["area_total"]

# The condition is re-evaluated with the corrected kitchen area, so the living
# area is only replaced in rows whose sum is still larger than the total area.
still_exceeds_total = (data_test["area_living"] + data_test["area_kitchen"]) > data_test["area_total"]
data_test.loc[still_exceeds_total, "area_living"] = living_to_total_ratio * data_test["area_total"]

exceeds_total = (data_test["area_living"] + data_test["area_kitchen"]) > data_test["area_total"]
print(
    "Test rows where area_living + area_kitchen > area_total after transform: (",
    data_test[exceeds_total].shape[0],
    ") of total rows: (", data_test.shape[0], ")"
)

Test rows where area_living + area_kitchen > area_total: ( 538 ) of total rows: ( 9937 )
Test rows where area_living + area_kitchen > area_total after transform: ( 0 ) of total rows: ( 9937 )


## Fix apartments where floor > stories in building

In [176]:
above_top_floor = data["floor"] > data["stories"]
print("Data rows where floor > stories: (", data[above_top_floor].shape[0], ") of total rows: (", data.shape[0], ")")

# Cap the floor at the number of stories in the building.
data.loc[above_top_floor, "floor"] = data.loc[above_top_floor, "stories"]

above_top_floor = data["floor"] > data["stories"]
print("Data rows where floor > stories after transform: (", data[above_top_floor].shape[0], ")")

Data rows where floor > stories: ( 321 ) of total rows: ( 23277 )
Data rows where floor > stories after transform: ( 0 )


In [177]:
above_top_floor = data_test["floor"] > data_test["stories"]
print("Test rows where floor > stories: (", data_test[above_top_floor].shape[0], ") of total rows: (", data_test.shape[0], ")")

# Cap the floor at the number of stories in the building.
data_test.loc[above_top_floor, "floor"] = data_test.loc[above_top_floor, "stories"]

above_top_floor = data_test["floor"] > data_test["stories"]
print("Test rows where floor > stories after transform: (", data_test[above_top_floor].shape[0], ")")

Test rows where floor > stories: ( 93 ) of total rows: ( 9937 )
Test rows where floor > stories after transform: ( 0 )


## Fix missing features in data_test

In [178]:
# Fill any coordinates still missing in the test set with the test-set median.
# NOTE(review): the default_values fill earlier in the notebook already covers
# latitude/longitude, so this looks like a no-op - confirm.
for coordinate in ('latitude', 'longitude'):
    coordinate_median = data_test[coordinate].median()
    data_test[coordinate] = data_test[coordinate].fillna(coordinate_median)

# Add new features

## Add meta-column "price_bin"
`price_bin` is a categorization of the (log) price. It is used as the weight when computing the center for the polar coordinates, and to stratify the train/validation split.

In [179]:
NUM_BUCKETS = 10
log_price = np.log10(data['price'])

price_bin_max = log_price.max()
price_bin_min = log_price.min()
price_bin_size = (price_bin_max - price_bin_min) / NUM_BUCKETS

# NUM_BUCKETS left edges give NUM_BUCKETS - 1 intervals; the range above the
# last edge is not covered by any interval.
price_bins = [
    i*price_bin_size + price_bin_min for i in range(NUM_BUCKETS)
]
labels = list(range(len(price_bins) - 1))

data['price_bin'] = pd.cut(log_price, bins=price_bins, labels=labels)

# Prices that fall outside every interval come back as NaN from pd.cut and are
# placed in the highest bin. The result is assigned back to the column instead
# of calling fillna(..., inplace=True) on data["price_bin"]: that chained
# in-place call operates on a temporary object and leaves the frame unchanged
# when pandas Copy-on-Write is active.
# NOTE(review): with pd.cut's defaults (right-closed intervals, no
# include_lowest) the single lowest price equals the first edge, so it is also
# NaN here and ends up in the highest bin - confirm whether that is intended.
data["price_bin"] = data["price_bin"].fillna(labels[-1])

## Add polar coordinates

In [180]:
# Convert latitude, longitude to polar coordinates
def cartesian_to_polar_coordinates(x, y):
    """Convert cartesian coordinates to polar coordinates.

    Args:
      x: first cartesian coordinate (scalar or array-like).
      y: second cartesian coordinate (scalar or array-like).

    Returns:
      A tuple (rho, phi): rho is sqrt(x**2 + y**2) and phi is arctan2(y, x).
    """
    radius = np.sqrt(np.square(x) + np.square(y))
    angle = np.arctan2(y, x)
    return radius, angle
    
# Center of the training apartments, weighted by price bin: apartments in
# higher price bins pull the center harder, and bin 0 carries no weight.
price_bin_weights = data["price_bin"]
geographical_weighted_center_latitude = np.average(data["latitude"], weights=price_bin_weights)
geographical_weighted_center_longitude = np.average(data["longitude"], weights=price_bin_weights)

In [181]:
# Offset of every training apartment from the price-weighted center.
delta_latitude = data["latitude"].sub(geographical_weighted_center_latitude)
delta_longitude = data["longitude"].sub(geographical_weighted_center_longitude)

polar_coordinates = cartesian_to_polar_coordinates(delta_latitude, delta_longitude)
data["distance_from_center"] = polar_coordinates[0]
data["angle"] = polar_coordinates[1]

In [182]:
# Offset of every test apartment from the center computed on the training data.
delta_latitude = data_test["latitude"].sub(geographical_weighted_center_latitude)
delta_longitude = data_test["longitude"].sub(geographical_weighted_center_longitude)

polar_coordinates = cartesian_to_polar_coordinates(delta_latitude, delta_longitude)
data_test["distance_from_center"] = polar_coordinates[0]
data_test["angle"] = polar_coordinates[1]

## Add average_room_size

In [183]:
# Total area divided by the number of rooms, for both train and test.
for frame in (data, data_test):
    frame["average_room_size"] = frame["area_total"] / frame["rooms"]

## Add bathroom_amount

In [184]:
# Total number of bathrooms (private + shared), for both train and test.
for frame in (data, data_test):
    frame["bathroom_amount"] = frame["bathrooms_private"] + frame["bathrooms_shared"]

## Add relative floor (how high up the building the apartment is)

In [185]:
# Floor as a fraction of the building height, for both train and test.
for frame in (data, data_test):
    frame["relative_floor"] = frame["floor"] / frame["stories"]

# Drop columns

In [186]:
# layout has lots of missing values - drop the column from data and data_test
# address and street have string values, so we lose them as well
dropped_columns = ["layout", "address", "street"]
for frame in (data, data_test):
    frame.drop(columns=dropped_columns, inplace=True)

In [187]:
# Move the apartment id from the index back to a regular "id" column.
data = data.reset_index()

## Convert dtypes

In [188]:
# Categorical columns with a fixed code set 0..n-1 (n includes the UNKNOWN
# code used when filling missing values).
fixed_category_sizes = {
    "seller": 5,
    "windows_court": 3,
    "windows_street": 3,
    "condition": 5,
    "new": 3,
    "district": 13,
    "material": 8,
    "parking": 4,
    "heating": 5,
}

# Count-like columns treated as categories; the category set is inferred from
# the values present in each frame.
inferred_category_columns = [
    "rooms",
    "bathrooms_shared",
    "bathrooms_private",
    "balconies",
    "loggias",
    "phones",
    "bathroom_amount",
]

boolean_columns = [
    "elevator_without",
    "elevator_passenger",
    "elevator_service",
    "garbage_chute",
]

needed_dtypes = {
    **{column: CategoricalDtype(categories=list(range(size))) for column, size in fixed_category_sizes.items()},
    **{column: "category" for column in inferred_category_columns},
    **{column: "bool" for column in boolean_columns},
    "floor": "uint8",
    "stories": "uint8",
    "constructed": "uint16",
}

data = data.astype(needed_dtypes)
data_test = data_test.astype(needed_dtypes)

# Train/validation split

In [189]:
# price_bin is only a helper column: it stratifies the split so that both parts
# get a similar price distribution, but it is not kept as a feature.
stratification_labels = data["price_bin"]
features_and_target = data.drop(columns=["price_bin"])

data_train, data_valid = train_test_split(
    features_and_target,
    test_size=0.3,
    stratify=stratification_labels,
    random_state=42,
)

# Model

In [190]:
# Train on log1p(price); predictions are mapped back with expm1 later on.
for split in (data_train, data_valid):
    split["price"] = np.log1p(split["price"])

In [191]:
log_transformed_columns = ("area_total", "area_kitchen", "area_living")
unused_columns = ["loggias", "balconies"]

for frame in (data_train, data_valid, data_test):
    # log1p-transform the area columns.
    for area_column in log_transformed_columns:
        frame[area_column] = np.log1p(frame[area_column])
    # loggias and balconies are not used as features.
    frame.drop(columns=unused_columns, inplace=True)

In [192]:
# Targets: log1p(price) for the training and validation parts.
y_train = data_train["price"]
y_valid = data_valid["price"]

# Features: everything except the apartment id and the target.
X_train = data_train.drop(["id", "price"], axis=1)
X_valid = data_valid.drop(["id", "price"], axis=1)

# Select the test features by name, in the training column order, instead of
# relying on data_test happening to have the same columns in the same order.
# A column missing from data_test now raises a KeyError here, before training.
X_test = data_test[X_train.columns]

In [193]:
# Apartment ids of the test rows (data_test is indexed by "id"); used as the
# "id" column of the submission file.
test_Id = data_test.index

In [194]:
# from sklearn import preprocessing

# categorical_features = ['rooms', 'bathrooms_shared', 'bathrooms_private', 'windows_court', 'windows_street',
# 'new', 'district', 'material', 'elevator_without', 'elevator_passenger', 'elevator_service', 'parking',
# 'garbage_chute', 'heating']

# lbl = [preprocessing.LabelEncoder() for _ in categorical_features]

# for i, col in enumerate(categorical_features):
#     X_train[col] = lbl[i].fit_transform(X_train[col].astype('str'))
#     X_valid[col] = lbl[i].transform(X_valid[col].astype('str'))
#     X_test[col] = lbl[i].transform(X_test[col].astype('str'))

In [199]:
# Feature columns passed to LightGBM as categorical_feature when fitting.
# balconies and loggias are not listed: both columns are dropped before training.
categorical_columns = [
    "seller",
    "rooms",
    "bathrooms_shared",
    "bathrooms_private",
    "windows_court",
    "windows_street",
    "condition",
    "phones",
    "new",
    "district",
    "material",
    "elevator_without",
    "elevator_passenger",
    "elevator_service",
    "parking",
    "garbage_chute",
    "heating",
    "bathroom_amount",
]

In [200]:
# Keyword arguments for lgb.LGBMRegressor.
hyper_params = {
    # NOTE(review): 'task' is echoed back in the fitted estimator's repr as a
    # plain extra parameter - confirm it is needed for the scikit-learn API.
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # Built-in metrics reported on the validation set next to the custom RMSLE.
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 6,
    "num_leaves": 60,  
    "max_bin": 256,
    # Upper bound only: fitting uses an early-stopping callback, which stopped
    # the recorded run well before this limit.
    "num_iterations": 100000
}

In [201]:
import lightgbm as lgb
# Scikit-learn style LightGBM regressor configured from hyper_params.
gbm = lgb.LGBMRegressor(**hyper_params)


In [202]:
# Fit on the training part and monitor the validation part. lightgbm_feval adds
# RMSLE to the reported metrics, and the early-stopping callback ends training
# once the validation scores have not improved for 1000 rounds.
gbm.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=lightgbm_feval,
        callbacks=[lgb.early_stopping(stopping_rounds=1000)],
        categorical_feature=categorical_columns
)

New categorical_feature is ['bathroom_amount', 'bathrooms_private', 'bathrooms_shared', 'condition', 'district', 'elevator_passenger', 'elevator_service', 'elevator_without', 'garbage_chute', 'heating', 'material', 'new', 'parking', 'phones', 'rooms', 'seller', 'windows_court', 'windows_street']


You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[45240]	valid_0's l1: 0.0776452	valid_0's l2: 0.0171058	valid_0's RMSLE: 0.130789


LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, feature_fraction=0.9,
              learning_rate=0.005, max_bin=256, max_depth=6,
              metric=['l1', 'l2'], num_iterations=100000, num_leaves=60,
              objective='regression', task='train', verbose=0)

In [203]:
# Predict with the best iteration found by early stopping, then undo the log1p
# transform so the score is computed on actual prices.
valid_log_predictions = gbm.predict(X_valid, num_iteration=gbm.best_iteration_)
y_pred_valid = np.expm1(valid_log_predictions)

validation_score = evaluate_predictions(y_pred_valid, np.expm1(y_valid))
print(f"Validation score: {validation_score}")

Validation score: 0.13078900702449067


In [204]:
# Predict log1p(price) for the test set and map the result back to prices.
test_log_predictions = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
y_pred_test = np.expm1(test_log_predictions)

In [205]:
submission = pd.DataFrame(data= {'id' : test_Id, 'price_prediction': y_pred_test})

# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails here, after the long training run.
submission_path = Path('submissions/short_notebook_submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)