In [11]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error as MSE, r2_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, ShuffleSplit
from sklearn.utils import shuffle
from scipy.stats import uniform, randint
import lightgbm as lgb


In [13]:
!pip install optuna



In [14]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from lightgbm.callback import early_stopping
from sklearn.model_selection import KFold


# Define the KNN imputation function
def impute_knn(df, n_neighbors=5):
    imputer = KNNImputer(n_neighbors=n_neighbors)
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Set the random seed for reproducibility
RS = 20170501
np.random.seed(RS)

# Load datasets
print("Loading data...")
path = '/content/drive/MyDrive/NTU/Courses/CZ4041/'
train_df = pd.read_csv(path+'train.csv', parse_dates=['timestamp'])
test_df = pd.read_csv(path+'test.csv', parse_dates=['timestamp'])
macro_df = pd.read_csv(path+'macro.csv', parse_dates=['timestamp'])

train_df.head()

Loading data...


Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [15]:
train_df=train_df[(train_df.price_doc>1e6) & (train_df.price_doc!=2e6) & (train_df.price_doc!=3e6)]
train_df.loc[(train_df.product_type=='Investment') & (train_df.build_year<2000),'price_doc']*=0.895
train_df.loc[train_df.product_type!='Investment','price_doc']*=0.96

In [16]:
import time
# Prepare the target variable
train_y = np.log1p(train_df['price_doc'].values)
train_ids = train_df['id']
test_ids = test_df['id']

# Concatenate train and test data for preprocessing
df = pd.concat([train_df.drop(['price_doc'], axis=1), test_df], sort=False).reset_index(drop=True)
# print(df.info())
# Merge with macro data
macro_cols = ["timestamp", "balance_trade", "balance_trade_growth", "usdrub", "average_provision_of_build_contract",
              "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate", "income_per_cap",
              "museum_visitis_per_100_cap", "apartment_build", "oil_urals"]
df = df.merge(macro_df[macro_cols], on='timestamp', how='left')
# print(df.info())
# Process date information and drop the timestamp
df['year'] = df.timestamp.dt.year
df['month'] = df.timestamp.dt.month
df['dow'] = df.timestamp.dt.dayofweek
df['quarter'] = df['timestamp'].dt.quarter
df.drop(['timestamp'], axis=1, inplace=True)

# Perform imputation only on numerical columns
# numeric_cols = df.select_dtypes(include=[np.number]).columns
# print("starting imputation")
# start_time = time.time()
# df[numeric_cols] = impute_knn(df[numeric_cols])
# end_time = time.time()
# print(f"Imputation took {end_time - start_time} seconds.")
# print("completed imputation")
print(df.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 36064 entries, 0 to 36063
Columns: 307 entries, id to quarter
dtypes: float64(132), int64(160), object(15)
memory usage: 84.7+ MB
None


In [17]:
print("Feature Engineering: ...")
# Feature engineering

# List to store new feature columns
new_features = []

if 'usdrub' in df.columns and 'oil_urals' in df.columns:
    new_features.append((df['usdrub'] * df['oil_urals']).rename('economic_stability'))

# Similarly, ensure 'build_year' is in the DataFrame before creating 'building_age'
if 'build_year' in df.columns:
    new_features.append((df['year'] - df['build_year']).rename('building_age'))
# Calculating new features and appending them to the list
new_features.append((df['kremlin_km'] <= 3).astype(int).rename('near_kremlin'))
new_features.append((df['full_sq'] / df['num_room']).rename('room_size'))
new_features.append((df['life_sq'] / df['num_room']).rename('area_per_room'))
new_features.append((df['kitch_sq'] / df['full_sq']).rename('kitchen_to_total'))
new_features.append((df['num_room'] / df['full_sq']).rename('num_rooms_to_total'))
new_features.append(pd.cut(df['year'] - df['build_year'], bins=[0, 5, 10, 20, 50, 100], labels=False, right=False).rename('property_age_group'))

# Combining original DataFrame with new features
df = pd.concat([df] + new_features, axis=1)
# Prepare the target variable and drop unused columns
df.drop(['id', 'build_year', 'usdrub', 'oil_urals'], axis=1, inplace=True)


# Encode categorical features
df_obj = df.select_dtypes(include=['object']).copy()
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]
df_num = df.select_dtypes(exclude=['object'])
df_values = pd.concat([df_num, df_obj], axis=1)

# Split back into train and test sets
pos = train_df.shape[0]
train_df = df_values[:pos]
test_df = df_values[pos:]
del df, df_num, df_obj, df_values

train_df.info()
test_df.info()

Feature Engineering: ...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28402 entries, 0 to 28401
Columns: 311 entries, full_sq to ecology
dtypes: float64(136), int64(175)
memory usage: 67.6 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7662 entries, 28402 to 36063
Columns: 311 entries, full_sq to ecology
dtypes: float64(136), int64(175)
memory usage: 18.2 MB


In [None]:
# Define the objective function for Optuna optimization
def objective(trial):
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'verbose': -1
    }

    # K-fold cross-validation
    kf = KFold(n_splits=7, shuffle=True, random_state=42)
    rmse_scores = []

    for train_index, valid_index in kf.split(train_df):
        X_train, X_valid = train_df.iloc[train_index], train_df.iloc[valid_index]
        y_train, y_valid = train_y[train_index], train_y[valid_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid)

        # Training a preliminary LightGBM model to get feature importances
        prelim_model = lgb.train(param, lgb_train, valid_sets=[lgb_valid], num_boost_round=500, callbacks=[early_stopping(stopping_rounds=50, verbose=False)])

        # Feature selection based on importance
        feature_importances = prelim_model.feature_importance(importance_type='gain')
        top_n_features = sorted(zip(prelim_model.feature_name(), feature_importances), key=lambda x: x[1], reverse=True)[:50]
        selected_features = [feature for feature, _ in top_n_features]

        # Reducing X_train and X_valid to selected features
        X_train_selected = X_train[selected_features]
        X_valid_selected = X_valid[selected_features]

        # Training the final model on reduced feature set
        final_model = lgb.train(param, lgb.Dataset(X_train_selected, y_train), valid_sets=lgb.Dataset(X_valid_selected, y_valid), num_boost_round=1000, callbacks=[early_stopping(stopping_rounds=50, verbose=False)])

        # Predicting and calculating RMSE
        preds = final_model.predict(X_valid_selected, num_iteration=final_model.best_iteration)
        rmse = mean_squared_error(y_valid, preds, squared=False)
        rmse_scores.append(rmse)

    # Average RMSE across all folds
    avg_rmse = np.mean(rmse_scores)
    return avg_rmse
ROUNDS = 1000
# Create a study object and specify the direction is 'minimize'.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Output the result
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')

[I 2023-11-24 09:51:12,554] A new study created in memory with name: no-name-0a81e258-3248-4ad0-ae85-b96c2d6de42a
[I 2023-11-24 09:55:35,556] Trial 0 finished with value: 0.2464411766133207 and parameters: {'num_leaves': 117, 'learning_rate': 0.028203210045010876, 'feature_fraction': 0.8422773948407818, 'bagging_fraction': 0.7155429031542473, 'min_child_samples': 48, 'max_depth': 6}. Best is trial 0 with value: 0.2464411766133207.
[I 2023-11-24 09:59:30,112] Trial 1 finished with value: 0.24564457345247095 and parameters: {'num_leaves': 102, 'learning_rate': 0.05058520634177205, 'feature_fraction': 0.6465955941940382, 'bagging_fraction': 0.5373552923126219, 'min_child_samples': 15, 'max_depth': 17}. Best is trial 1 with value: 0.24564457345247095.
[I 2023-11-24 10:01:53,745] Trial 2 finished with value: 0.2469284695963437 and parameters: {'num_leaves': 23, 'learning_rate': 0.07760209054981866, 'feature_fraction': 0.8872438240077075, 'bagging_fraction': 0.5769919700904993, 'min_child_sa

In [None]:

# trial = study.best_trial

# print('  Value: {}'.format(trial.value))
# print('  Params: ')
# for key, value in trial.params.items():
#     print('    {}: {}'.format(key, value))

# Train the final model with the best hyperparameters found
best_params = {'num_leaves': 135, 'learning_rate': 0.02332658702810065, 'feature_fraction': 0.4013146306422676, 'bagging_fraction': 0.7812052089280968, 'min_child_samples': 13, 'max_depth': 10}
best_params['verbose'] = -1
best_params['metric'] = 'rmse'
best_params['objective'] = 'regression'
best_params['boosting_type'] = 'gbdt'

# Split the dataset into training and validation sets for early stopping
X_train_prelim, X_valid_prelim, y_train_prelim, y_valid_prelim = train_test_split(
    train_df, train_y, test_size=0.15, random_state=42
)

# Creating LightGBM datasets for early stopping
lgb_train_prelim = lgb.Dataset(X_train_prelim, y_train_prelim)
lgb_valid_prelim = lgb.Dataset(X_valid_prelim, y_valid_prelim)

# Training the preliminary model with early stopping
prelim_model = lgb.train(
    best_params,
    lgb_train_prelim,
    num_boost_round=1000,
    valid_sets=[lgb_valid_prelim],
    callbacks=[early_stopping(stopping_rounds=50, verbose=False)]
)

# Get feature importances and select top N features
feature_importances = prelim_model.feature_importance(importance_type='gain')
top_n_features = sorted(zip(train_df.columns, feature_importances), key=lambda x: x[1], reverse=True)[:100]
selected_features = [feature for feature, _ in top_n_features]

print("Selected top 50 features based on importance")
# # You can re-split the data here if needed
# X_train, X_valid, y_train, y_valid = train_test_split(train_df, train_y, test_size=0.15, random_state=42)
# lgb_train = lgb.Dataset(X_train, y_train, free_raw_data= False)
# lgb_valid = lgb.Dataset(X_valid, y_valid, free_raw_data= False)

# Redefine K-Fold cross-validation
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# Array to store predictions for each fold
fold_preds = np.zeros(test_df.shape[0])
final_model = None  # This will store the model from the last fold

# Iterate over each fold using only selected features
for fold, (train_index, valid_index) in enumerate(kf.split(train_df[selected_features])):
    print(f"Training on fold {fold + 1}")

    # Splitting the data using only selected features
    X_train_fold = train_df.iloc[train_index][selected_features]
    X_valid_fold = train_df.iloc[valid_index][selected_features]
    y_train_fold = train_y[train_index]
    y_valid_fold = train_y[valid_index]

    # Creating LightGBM datasets
    lgb_train_fold = lgb.Dataset(X_train_fold, y_train_fold)
    lgb_valid_fold = lgb.Dataset(X_valid_fold, y_valid_fold)

    # Training the model on the current fold
    model = lgb.train(
        best_params,
        lgb_train_fold,
        num_boost_round=ROUNDS,
        valid_sets=[lgb_valid_fold],
        callbacks=[early_stopping(stopping_rounds=100, verbose=False)]
    )

    # Predicting on the test set for the current fold
    fold_preds += model.predict(test_df[selected_features]) / kf.n_splits

    # Storing the last model for feature importance
    if fold == kf.n_splits - 1:
        final_model = model

# Averaging predictions over all folds
final_preds = fold_preds

# Output final predictions
output_path = "lgb_{}_optimized_kfold.csv".format(RS)
out_df = pd.DataFrame({"id": test_ids, "price_doc": np.expm1(final_preds)})
out_df.to_csv(output_path, index=False)


# Feature importance from the model of the last fold
print("Features importance...")
gain = final_model.feature_importance('gain')
ft = pd.DataFrame({'feature': final_model.feature_name(), 'split': final_model.feature_importance('split'),
                   'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(ft.head(25))

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.title("LGBM Feature Importance")
ft[['feature', 'gain']].head(25).plot(kind='barh', x='feature', y='gain', legend=False, figsize=(10, 20))
plt.xlabel('Relative Importance')
plt.ylabel('Features')
plt.gca().invert_yaxis()
plt.savefig("features_importance.png")

