In [3]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
import pickle
import os
import re
import joblib
import sys


In [4]:
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", '{:.2f}'.format)

##### Utils Functions

In [3]:
## Imputation
# Fill missing values with Unknown
def FillWithUnknown(data, cols):
    if isinstance(cols, list):
        for col in cols:
            data[col] = data[col].fillna("Unknown")
        
    elif isinstance(cols, str):
        data[cols] = data[cols].fillna("Unknown")

# Sort and Ffill
def FillAfterSorting(data, cols, sorting_col=""):
    if sorting_col:
        data.sort_values(by=sorting_col, inplace=True)
        if isinstance(cols, list):
            for col in cols:
                data[col] = data[col].ffill().bfill()
        
        elif isinstance(cols, str):
            data[cols] = data[cols].ffill().bfill()


In [4]:
# Remove outliers using IQR
def RemoveOutliers(data, cols):
    for col in cols:
        q1, q2, q3 = data.loc[:, col].quantile([0.25, 0.5, 0.75])
        iqr = q3 - q1
        lower_threshold = round(q1 - (1.5 * iqr), 2)
        upper_threshold = round(q3 + (1.5 * iqr), 2)
        median = data.loc[:, col].median()
        data.loc[ (data[col] < lower_threshold) | (data[col] > upper_threshold), col] = median

In [5]:
## Feature Engineering 
def ConvertRoomsToNumber(value):
    
    if 'B/R' in value:
        return int(re.findall('\d+', value)[0])  # Extract the number before "B/R"
    elif 'STUDIO' in value or 'UNKNOWN' in value:
        return 0  # Represent Studio as 0 bedrooms
    elif value in ['OFFICE', 'SHOP', 'PENTHOUSE', 'SINGLE ROOM', 'HOTEL']:
        return -1  # Assign -1 or another placeholder for non-bedroom categories
    elif 'SINGLE ROOM' in value:
        return 1
    else:
        return np.nan  # Handle unexpected values

def TransactionAgeInDays(data, col):
    current_time = pd.Timestamp.now()
    data['transaction_age_in_days'] = round((current_time - pd.to_datetime(data[col])).dt.total_seconds() / (60 * 60 * 24))

def RoomsCleaner(df, col):
    df[col] = df[col].str.upper()
    df[col] = df[col].apply(ConvertRoomsToNumber)

In [6]:
## One Hot Encoding
def ApplyOneHotEncoder(df, cols, save_path="encoders"):
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data = ohe.fit_transform(df[cols])
    encoded_df = pd.DataFrame(encoded_data, columns=ohe.get_feature_names_out(df[cols].columns))
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, encoded_df], axis=1)
    df.reset_index(drop=True, inplace=True)
    os.makedirs(save_path, exist_ok=True)
    joblib.dump(ohe, f"{save_path}/OneHotEncoder.pkl")
    return df

## Encoding Features
def ApplyLabelEncoder(df, cols, save_path="encoders"):
    os.makedirs(save_path, exist_ok=True)

    label_encoders = {}
    for col in cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Save the label encoder
    for col, le in label_encoders.items():
        joblib.dump(le, f"{save_path}/LabelEncoder_{col}.pkl")


In [7]:
def ScaleData(df, columns_to_scale, save_path="scaler.pkl"):
    scaler = StandardScaler()
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
    with open(save_path, "wb") as f:
        pickle.dump(scaler, f)

#### Utils Transformers

In [121]:
class FillWithUnknown(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for col in self.cols:
            X[col] = X[col].fillna("Unknown")
        return X

class FillAfterSorting(BaseEstimator, TransformerMixin):
    def __init__(self, cols, sorting_col=""):
        self.cols = cols
        self.sorting_col = sorting_col
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if self.sorting_col:
            X.sort_values(by=self.sorting_col, inplace=True)
        for col in self.cols:
            X[col] = X[col].ffill().bfill()
        return X

class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for col in self.cols:
            q1, q3 = X[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            lower_threshold = q1 - 1.5 * iqr
            upper_threshold = q3 + 1.5 * iqr
            median = X[col].median()
            X.loc[ (X[col] < lower_threshold) | (X[col] > upper_threshold), col] = median
        return X       

class RoomsCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, col):
        self.col = col
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.col] = X[self.col].astype(str).str.upper()
        X[self.col] = X[self.col].apply(self.convert_rooms_to_number)
        return X                                

    @staticmethod
    def convert_rooms_to_number(value):
        if 'B/R' in value:
            return int(re.findall(r'\d+', value)[0])
        elif 'STUDIO' in value or 'UNKNOWN' in value:
            return 0
        elif value in ['OFFICE', 'SHOP', 'PENTHOUSE', 'SINGLE ROOM', 'HOTEL']:
            return -1
        else:
            return np.nan
        
# Custom Transformer for Transaction Age
class TransactionAgeInDays(BaseEstimator, TransformerMixin):
    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        current_time = pd.Timestamp.now()
        X['transaction_age_in_days'] = (current_time - pd.to_datetime(X[self.col])).dt.days
        X.drop(columns=[self.col], inplace=True)
        return X

# Custom Transformer for OneHotEncoder
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, save_path="encoders"):
        self.cols = cols
        self.save_path = save_path
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    def fit(self, X, y=None):
        self.encoder.fit(X[self.cols])
        os.makedirs(self.save_path, exist_ok=True)
        joblib.dump(self.encoder, f"{self.save_path}/OneHotEncoder.pkl")
        return self

    def transform(self, X):
        encoded = self.encoder.transform(X[self.cols])
        encoded_df = pd.DataFrame(encoded, columns=self.encoder.get_feature_names_out(self.cols))
        X.drop(columns=self.cols, inplace=True)
        X.reset_index(drop=True, inplace=True)
        return pd.concat([X, encoded_df], axis=1)

# Custom Transformer for LabelEncoder
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, save_path="encoders"):
        self.cols = cols
        self.save_path = save_path
        self.encoders = {}

    def fit(self, X, y=None):
        for col in self.cols:
            le = LabelEncoder()
            le.fit(X[col])
            self.encoders[col] = le
            os.makedirs(self.save_path, exist_ok=True)
            joblib.dump(le, f"{self.save_path}/LabelEncoder_{col}.pkl")
        return self

    def transform(self, X):
        for col in self.cols:
            X[col] = self.encoders[col].transform(X[col])
        return X

# Custom Transformer for Scaling
class ScaleData(BaseEstimator, TransformerMixin):
    def __init__(self, target_col, save_path="scaler.pkl"):
        self.target_col = target_col
        self.save_path = save_path
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.cols = [col for col in X.columns if col != self.target_col]
        self.scaler.fit(X[self.cols])
        with open(self.save_path, "wb") as f:
            pickle.dump(self.scaler, f)
        return self

    def transform(self, X):
        X[self.cols] = self.scaler.transform(X[self.cols])
        return X  
        

#### Feature Selection

In [None]:
def FeatureSelection(df_, target_column, mode="train"):

    df = df_.copy()
    corr_feats = CorrelationAnalysis(df)
    randomforest_feats = FeatureImportance(df,  target_column )
    rfe_feats = RecursiveFeatureElimination(df, target_column)
    univariate_feats = UnivariateFeatureSelection(df, target_column)
    final_selected_features = CombineSelectedFeatures(
        corr_feats,
        randomforest_feats,
        rfe_feats,
        univariate_feats)
    
    if target_column in final_selected_features:
        final_selected_features.remove(target_column)
    return final_selected_features

target_column = 'amount'
Selected_Features = FeatureSelection(preprocessed_train_df, target_column)

In [None]:
# base_models.py

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import joblib
import os

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Evaluate the model on train and test data, and return evaluation metrics.
    """
    # Training the model
    model.fit(X_train, y_train)
    
    # Predicting on train and test data
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    # Evaluating model
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)

    train_mae = mean_absolute_error(y_train, train_pred)
    test_mae = mean_absolute_error(y_test, test_pred)

    print(f"Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")
    print(f"Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
    print(f"Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
    
    return train_rmse, test_rmse, train_r2, test_r2

# XGBoost model
def xgboost_model(X_train, y_train, X_test, y_test, param_space=None):
    """
    Train and evaluate the XGBoost model with or without hyperparameter optimization.
    """
    # Initialize XGBoost Regressor
    model = xgb.XGBRegressor(objective="reg:squarederror")
    
    # Bayesian Optimization for Hyperparameters
    if param_space:
        opt = BayesSearchCV(model, param_space, n_iter=50, cv=3, verbose=0, n_jobs=-1)
        opt.fit(X_train, y_train)
        model = opt.best_estimator_
        print(f"Best parameters for XGBoost: {opt.best_params_}")
    
    # Evaluate model performance
    evaluate_model(model, X_train, y_train, X_test, y_test)
    
    return model

# Random Forest model
def random_forest_model(X_train, y_train, X_test, y_test, param_space=None):
    """
    Train and evaluate the Random Forest model with or without hyperparameter optimization.
    """
    model = RandomForestRegressor(random_state=42)
    
    # Bayesian Optimization for Hyperparameters
    if param_space:
        opt = BayesSearchCV(model, param_space, n_iter=50, cv=3, verbose=0, n_jobs=-1)
        opt.fit(X_train, y_train)
        model = opt.best_estimator_
        print(f"Best parameters for Random Forest: {opt.best_params_}")
    
    # Evaluate model performance
    evaluate_model(model, X_train, y_train, X_test, y_test)
    
    return model

# Support Vector Regression (SVR) model
def svr_model(X_train, y_train, X_test, y_test, param_space=None):
    """
    Train and evaluate the Support Vector Regression model with or without hyperparameter optimization.
    """
    model = SVR(kernel='rbf')
    
    # Bayesian Optimization for Hyperparameters
    if param_space:
        opt = BayesSearchCV(model, param_space, n_iter=50, cv=3, verbose=0, n_jobs=-1)
        opt.fit(X_train, y_train)
        model = opt.best_estimator_
        print(f"Best parameters for SVR: {opt.best_params_}")
    
    # Evaluate model performance
    evaluate_model(model, X_train, y_train, X_test, y_test)
    
    return model

# Hyperparameter Search Spaces for Bayesian Optimization
def get_param_spaces():
    """
    Define the parameter search spaces for each model.
    """
    xgboost_params = {
        'learning_rate': Real(0.01, 0.2, prior='uniform'),
        'max_depth': Integer(3, 10),
        'n_estimators': Integer(50, 300),
        'subsample': Real(0.5, 1.0, prior='uniform'),
        'colsample_bytree': Real(0.5, 1.0, prior='uniform'),
        'gamma': Real(0, 0.5, prior='uniform')
    }
    
    rf_params = {
        'n_estimators': Integer(50, 300),
        'max_depth': Integer(5, 20),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 4),
        'bootstrap': [True, False]
    }
    
    svr_params = {
        'C': Real(0.1, 1000, prior='uniform'),
        'epsilon': Real(0.01, 0.1, prior='uniform'),
        'kernel': ['linear', 'poly', 'rbf']
    }
    
    return xgboost_params, rf_params, svr_params

# Train and evaluate models with hyperparameter optimization
def train_and_evaluate_models(X, y):
    """
    Split the data, train models with hyperparameter tuning, and evaluate their performance.
    """
    # Split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    
    # Get parameter spaces for each model
    xgboost_params, rf_params, svr_params = get_param_spaces()
    
    # Train and evaluate XGBoost model
    print("Training and Evaluating XGBoost Model:")
    xgb_model = xgboost_model(X_train, y_train, X_test, y_test, param_space=xgboost_params)
    
    # Train and evaluate Random Forest model
    print("Training and Evaluating Random Forest Model:")
    rf_model = random_forest_model(X_train, y_train, X_test, y_test, param_space=rf_params)
    
    # Train and evaluate SVR model
    # print("Training and Evaluating Support Vector Regression Model:")
    # svr_model(X_train, y_train, X_test, y_test, param_space=svr_params)

    return (xgb_model, rf_model)



In [None]:
base_models = train_and_evaluate_models(preprocessed_train_df.drop(['amount'], axis=1), preprocessed_train_df['amount'])

Training and Evaluating XGBoost Model:


#### Exploratory Data Analysis

In [6]:
# Explore Datasets
# rents_df = pd.read_csv("../data/snp_dld_2024_rents.csv", low_memory=False)
# transactions_df = pd.read_csv("../data/snp_dld_2024_transactions.csv",)

file_path = "../data/snp_dld_2024_transactions.csv" 

train_df, test_df = train_test_split(
    pd.read_csv(file_path),
    test_size=0.1,           
    random_state=42          
)

# Check the resulting splits
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")


Training set shape: (146525, 46)
Testing set shape: (16281, 46)


##### EDA snp_dld_2024_transactions.csv

In [7]:
# transactions_df.info(show_counts=True)
# transactions_df.filter(regex="^(?!.*_ar$).*$").head(5)

# Add the project root directory to the Python path
project_root = r"d:\Work\Smart Brick Assessment Update\Smart-Brick-Assessment"
sys.path.append(project_root)
# print(sys.path)
                
# from ..pipeline.preprocess import Preprocess
from Preprocessing.preprocess import Preprocess
# preprocess_df = Preprocess(transactions_df)

preprocessed_train_df = Preprocess(train_df, mode="train")
preprocessed_test_df = Preprocess(test_df, mode="test")

In [None]:
# Filter columns after manual analysis
ONE_HOT_COLUMNS = ['transaction_type_en', 'registration_type_en', 'is_freehold_text', 'property_usage_en', 'is_offplan',
                    'is_freehold', 'property_type_en', 'nearest_mall_en']

LE_COLUMNS = ['nearest_landmark_en', 'transaction_subtype_en', 'property_subtype_en', 'area_en', 'nearest_metro_en']

SCALINGFEATURES_COLUMNS = LE_COLUMNS + ['rooms_en', 'transaction_age_in_days'] 

target_col = ['amount']

other_cols = ["rooms_en", "transaction_datetime"]

df = transactions_df.loc[:, ONE_HOT_COLUMNS + LE_COLUMNS + other_cols + target_col]

In [None]:
# train_df = df.sample(frac=0.9, random_state=123).copy()
# test_df = df.drop(train_df.index).copy()

# from sklearn.compose import ColumnTransformer

# pipeline = Pipeline(steps=[
#     ('fill_with_unknown', FillWithUnknown(cols=['property_subtype_en', 'rooms_en'])),
#     ('fill_after_sorting', FillAfterSorting(cols=["is_freehold", "nearest_landmark_en", "nearest_mall_en", "nearest_metro_en"], sorting_col='area_en')),
#     ('rooms_cleaner', RoomsCleaner(col='rooms_en')),
#     ('transaction_age', TransactionAgeInDays(col='transaction_datetime')),
#     ('remove_outliers', RemoveOutliers(cols=["amount", "transaction_age_in_days"])),
#     ('label_encoder', CustomLabelEncoder(cols=LE_COLUMNS)),
#     ('one_hot_encoder', CustomOneHotEncoder(cols=ONE_HOT_COLUMNS)),
#     ('scaler', ScaleData(target_col='amount'))
# ])

# Fit the pipeline on the training data
# pipeline.fit(train_df)

# Transform the training and testing data using the fitted pipeline
# processed_train_df = pipeline.transform(train_df)
# processed_test_df = pipeline.transform(test_df)



In [14]:
# Imputation
FillAfterSorting(df, ["is_freehold", "nearest_landmark_en", "nearest_mall_en", "nearest_metro_en"], 'area_en')
FillWithUnknown(df, ['property_subtype_en', 'rooms_en'])

In [16]:
# Feature Engineering
RoomsCleaner(df, 'rooms_en')
TransactionAgeInDays(df, 'transaction_datetime')
df.drop(['transaction_datetime'], axis=1, inplace=True)


In [18]:
# Encoding Categorical Columns
# ONE_HOT_COLUMNS = []
# LE_COLUMNS = []
# for col in df.select_dtypes('object').columns:
#     if df[col].nunique() <= 10:
#         ONE_HOT_COLUMNS.append(col)
#     else:
#         LE_COLUMNS.append(col)

ApplyLabelEncoder(df, LE_COLUMNS)
final_df = ApplyOneHotEncoder(df, ONE_HOT_COLUMNS)


In [16]:
## Remove Outliers
RemoveOutliers(final_df, ["amount", "transaction_age_in_days"])

## Scale Dataset
ScaleData(final_df, df.drop('amount', axis=1).columns.tolist())


In [None]:
## Correlation Analysis
def FeatureSelectionCorr(final_df):
    corr_df = final_df.corr(method='pearson')
    COLUMNS = corr_df[(corr_df['amount']>0.01).round(2) | (corr_df['amount'].round(2)<-0.01)]['amount'].index.tolist()
    COLUMNS.remove('amount')
    return

COLUMNS = FeatureSelectionCorr(final_df)
TARGET = "amount"


In [18]:
X_train, X_test, y_train, y_test = train_test_split(final_df[COLUMNS], final_df[TARGET], random_state=104, test_size=0.1, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=104, test_size=0.1, shuffle=True)


In [19]:
def evaluate_model(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return {"RMSE" : round(rmse, 2), "R2 Score": round(r2, 2), "MAE": round(mae, 2)}

In [18]:
from xgboost import XGBRegressor

def TrainXGBRegressor(model_parameters):
    xgb = XGBRegressor(**model_parameters)
    xgb.fit(X_train, y_train)
    xgb_preds = xgb.predict(X_val)
    xgb_metrics = evaluate_model(y_val, xgb_preds)
    return xgb_metrics

{'RMSE': np.float64(590410.62), 'R2 Score': 0.72, 'MAE': np.float64(356801.66)}

In [19]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_val)
rf_metrics = evaluate_model(y_val, rf_preds)
rf_metrics

{'RMSE': np.float64(596917.61), 'R2 Score': 0.71, 'MAE': np.float64(321926.51)}

In [None]:
# NOTE: It is taking too much time to train
# from sklearn.svm import SVR
# from sklearn.pipeline import make_pipeline

# svr = make_pipeline(StandardScaler(), SVR())
# svr.fit(X_train, y_train)
# svr_preds = svr.predict(X_val)
# svr_metrics = evaluate_model(y_val, svr_preds)
# svr_metrics

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

def find_best_params(X_train, y_train, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2):

    xgb = XGBRegressor(random_state=42)
    grid_search = GridSearchCV(
        estimator=xgb,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        verbose=verbose
    )
    
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_

# Example usage:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}

best_params = find_best_params(X_train, y_train, param_grid)
print("Best parameters:", best_params)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.4s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.2s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.9s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.3s
[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=   0.3s
[CV] END ...learning_rate=0.01, max_depth=5, n_e

In [None]:
def TrainXGBRegressor(model_parameters):
    xgb = XGBRegressor(**model_parameters)
    xgb.fit(X_train, y_train)
    xgb_preds = xgb.predict(X_val)
    xgb_metrics = evaluate_model(y_val, xgb_preds)
    return xgb_metrics

params = {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, "random_state": 42}
xgb_metrics = TrainXGBRegressor(params)
print(xgb_metrics)

In [27]:
### Drop
    # [transaction_number, transaction_datetime, transaction_type_id, transaction_size_sqm, transaction_subtype_id, property_usage_id,
    #  property_id, property_type_ar, property_type_id, building_age, rooms_ar, project_name_en, property_subtype_ar,
    #  property_subtype_id, area_ar, area_id, nearest_landmark_ar, nearest_metro_ar, nearest_mall_ar, master_project_ar, req_from,
    #  req_to, entry_id, meta_ts, master_project_en, project_name_en]


### Encoding
    ## Label Encoding
    # ["transaction_subtype_en", "property_subtype_en", "area_en", "project_name_en", "nearest_metro_en"]

    ## One Hot Encoding
    ## ["transaction_type_en", "registration_type_en", "is_freehold_text", "property_usage_en", "is_offplan", "is_freehold",
    #  "nearest_landmark_en", "property_type_en", "rooms_en", "nearest_mall_en"]

### Engineered Features
    # [transaction_age_in_days] -> transactions_df.loc[:, 'transaction_age_in_days'] = round((pd.Timestamp.now() - pd.to_datetime(transactions_df['transaction_datetime'])).dt.total_seconds() / (60 * 60 * 24))

# transactions_df['property_size_sqm'].plot(kind='box', title='Property Size SQM')

157600

##### EDA snp_dld_2024_rents.csv

In [23]:
rents_df.head(10)

Unnamed: 0,ejari_contract_number,registration_date,contract_start_date,contract_end_date,version_number,version_text,contract_amount,annual_amount,is_freehold,is_freehold_text,property_size_sqm,parcel_id,property_id,land_property_id,property_type_en,property_type_ar,property_subtype_en,property_subtype_ar,property_usage_en,property_usage_ar,property_usage_id,total_properties,rooms,parking,project_name_en,project_name_ar,area_en,area_ar,area_id,nearest_landmark_en,nearest_landmark_ar,nearest_metro_en,nearest_metro_ar,nearest_mall_en,nearest_mall_ar,master_project_en,master_project_ar,ejari_property_type_id,ejari_property_sub_type_id,req_from,req_to,entry_id,meta_ts
0,120130625001365,2024-01-24 11:14:33,2024-01-01,2024-12-31,12,Renewed,58000.0,58000.0,f,Non Free Hold,120.76,2410596.0,0,0,Unit,وحدة,Flat,شقه,Residential,سكني,0,1,,,,,Al Nahda Second,النهده الثانيه,0,Dubai International Airport,مطار دبي الدولي,Al Nahda Metro Station,محطة مترو النهضة,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
1,120130626005726,2024-01-09 16:48:47,2024-03-18,2025-03-17,12,Renewed,88000.0,88000.0,t,Free Hold,159.03,2514114.0,0,0,Unit,وحدة,Flat,شقه,Residential,سكني,0,1,,,,,Mirdif,مردف,0,Dubai International Airport,مطار دبي الدولي,Rashidiya Metro Station,محطة مترو الراشدية,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
2,120130626007829,2024-01-16 11:39:26,2024-01-01,2024-12-31,12,Renewed,94000.0,94000.0,f,Non Free Hold,60.94,1230681.0,0,0,Unit,وحدة,Shop,محل,Commercial,تجاري,0,1,,,,,Al Muteena,المطينه,0,Dubai International Airport,مطار دبي الدولي,Salah Al Din Metro Station,محطة مترو صلاح الدين,Dubai Mall,مول دبي,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
3,120130627008661,2024-01-15 18:28:05,2024-01-01,2024-12-31,12,Renewed,66000.0,66000.0,f,Non Free Hold,108.88,3730405.0,0,0,Unit,وحدة,Flat,شقه,Residential,سكني,0,1,,,,,Al Barsha First,البرشاء الاولى,0,Burj Al Arab,برج العرب,Sharaf Dg Metro Station,محطة مترو شرف دي جي,Mall of the Emirates,مول الإمارات,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
4,120130627009439,2024-01-22 13:34:29,2024-01-01,2024-12-31,12,Renewed,31200.0,31200.0,f,Non Free Hold,57.6,2320537.0,0,0,Unit,وحدة,Flat,شقه,Residential,سكني,0,1,,,,,Al Qusais First,القصيص الاولى,0,Dubai International Airport,مطار دبي الدولي,Airport Free Zone,المنطقة الحرة بالمطار,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
5,120130630014940,2024-01-05 15:45:05,2024-01-08,2024-04-07,12,Renewed,10106.25,40425.0,f,Non Free Hold,106.74,2450405.0,0,0,Unit,وحدة,Flat,شقه,Residential,سكني,0,1,,,,,Muhaisanah Fourth,محيصنه الرابعه,0,Dubai International Airport,مطار دبي الدولي,Etisalat Metro Station,محطة مترو اتصالات,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
6,120130701019951,2024-01-09 16:49:15,2023-11-08,2024-02-07,14,Renewed,12750.0,51000.0,t,Free Hold,103.05,2514313.0,0,0,Unit,وحدة,Flat,شقه,Residential,سكني,0,1,,,,,Mirdif,مردف,0,Dubai International Airport,مطار دبي الدولي,Rashidiya Metro Station,محطة مترو الراشدية,City Centre Mirdif,سيتي سنتر مردف,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
7,120130701021228,2024-01-24 12:37:03,2023-12-21,2024-12-20,13,Renewed,96000.0,96000.0,f,Non Free Hold,148.0,3140136.0,0,0,Unit,وحدة,Flat,شقه,Residential,سكني,0,1,,,,,Um Hurair First,ام هرير الاولى,0,Dubai International Airport,مطار دبي الدولي,Burjuman Metro Station,محطة مترو برجمان,Dubai Mall,مول دبي,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
8,120130701021441,2024-01-04 12:27:24,2024-01-01,2024-12-31,12,Renewed,162350.0,162350.0,f,Non Free Hold,302.0,3640312.0,0,0,Unit,وحدة,Warehouse complex,مجمع مخازن,Industrial,صناعي,0,1,,,,,Al Goze Industrial First,القوز الصناعيه الاولى,0,Burj Al Arab,برج العرب,Noor Bank Metro Station,محطة مترو نور بنك,Mall of the Emirates,مول الإمارات,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859
9,120130702002774,2024-01-05 15:15:39,2024-01-01,2024-12-31,11,Renewed,70000.0,70000.0,f,Non Free Hold,211.82,3430471.0,0,0,Villa,فيلا,Complex Villas,مجمع فلل,Residential,سكني,0,1,,0.0,,,Al Wasl,الوصل,0,Burj Khalifa,برج خليفة,Business Bay Metro Station,محطة مترو الخليج التجاري,Dubai Mall,مول دبي,,,0,0,2024-01-01,2024-01-31,94368,2024-09-26 11:18:46.859


(747408, 43) (162806, 46)


In [33]:
# !pip install optuna

# tt = pd.read_csv("D:/Work/Telecomunication Project/xaa")
# tt.isna().sum().tolist()
tt.head()

Unnamed: 0,CONTRNO,MASKED_A_PARTY,MASKED_B_PARTY,TRANSDATE,ACT_DURATION,DURATION,BILLAMOUNT,GROSS_AMOUNT,BILLTEXT,AR_BILLTEXT,UPDDATE,DISC_TYPE,LAST_TRAFFIC_DATE,TARIFFCLASS,TARIFF_GROUP,FILE_ID,CALL_TYPE,CHARGETYPE,LAC,CELL_ID,RATE_TYPE,RATE_POS,PARTY_FLAG,TARIFF_PROFILE,DEST_CODE,IMEI_NUMBER,IMSI_NUMBER,THIRD_PARTY_NO,ROUTE_IN,ROUTE_OUT,RATE_PLAN,CALL_LINK,BILLAMOUNT1,BILLAMOUNT2,BILLAMOUNT3,BILLAMOUNT4,BILLAMOUNT5,GROSS_AMOUNT1,GROSS_AMOUNT2,GROSS_AMOUNT3,GROSS_AMOUNT4,GROSS_AMOUNT5,BILLCODE1,LOC,INT_FLAG,TARIFFCLASS1,TARIFF_GROUP1,SUBSCR_TYPE,AREA,FREE_DURATION,...,PROFILE,DURATION5,DURATION1,DURATION2,DURATION3,DURATION4,GROSS_AMOUNT_INT5,BILLAMOUNT_INT5,CURRENCY,BILLAMOUNT_INT,GROSS_AMOUNT_INT,BILLAMOUNT_INT1,BILLAMOUNT_INT2,BILLAMOUNT_INT3,BILLAMOUNT_INT4,GROSS_AMOUNT_INT1,GROSS_AMOUNT_INT2,GROSS_AMOUNT_INT3,GROSS_AMOUNT_INT4,VOLUME,UPLINK,DOWNLINK,BILLGROUP,ZONE_BAND,CHARGE_GROUP,CHARGE_SCHEMA,TIME_SEGMENT1,TIME_SEGMENT2,TIME_SEGMENT3,TIME_SEGMENT4,TIME_SEGMENT5,CDR_TYPE,GSM_FACTOR_IN,BILL_AMOUNT_EX1,BILL_AMOUNT_EX2,OPERATOR,UPD,NEWGROSS_AMOUNT,NEWTARIFFCLASS,UPD1,UPDATED,DRCID,INCHARGEINFO,IMSI_NO,CATEGORY,FILTER_RES_CODE,EXCID,DRCRN,IMEI,RECORD_TYPE
0,1000002821,373619**,373919**,8/1/2024 12:00:52 AM,22,60,0.0,0.0,ZAIN ONNET MOBILE,ZAIN ONNET MOBILE,8/1/2024 12:05:53 AM,0,7/31/2024,,SRONN_L,70297889,1,L,,700534031,,,,INDI,973,353915100959540,426021532106716.94,,,,PPKG4,0,0.0,0,0,0,0,0.0,0,0,0,0,20.0,,N,A,AIRT,G,0.0,0.0,...,1.0,0.0,60.0,0.0,0.0,0.0,,,BHD,,,,,,,,,,,,,,2.0,SR_ONNET_L,SRONN_L,SRONN_L,TIME_SR,,,,,SRVIMS,1.0,0.0,0.0,,,,,,,,,,,,,,,4
1,1000004383,366663**,366699**,8/1/2024 12:01:01 AM,17,60,0.0,0.0,ZAIN ONNET MOBILE,ZAIN ONNET MOBILE,8/1/2024 12:05:53 AM,0,7/31/2024,,SRONN_L,70297889,1,L,,700507031,,,,CORP,973,357926958791000,426021531126234.0,,,,PPKG4,0,0.0,0,0,0,0,0.0,0,0,0,0,20.0,,N,A,AIRT,G,0.0,0.0,...,1.0,0.0,60.0,0.0,0.0,0.0,,,BHD,,,,,,,,,,,,,,2.0,SR_ONNET_L,SRONN_L,SRONN_L,TIME_SR,,,,,SRVIMS,1.0,0.0,0.0,,,,,,,,,,,,,,,4
2,1000006018,394447**,388369**,8/1/2024 12:00:18 AM,39,60,0.01,0.01,BATELCO MOBILE,BATELCO MOBILE,8/1/2024 12:05:53 AM,0,8/9/2024,,SRBATM_L,70297889,1,L,,700674015,,,,INDI,973,353832101897630,426021531764763.06,,,,PPKG4,0,0.01,0,0,0,0,0.01,0,0,0,0,20.0,,N,A,AIRT,G,0.0,0.0,...,1.0,0.0,60.0,0.0,0.0,0.0,,,BHD,,,,,,,,,,,,,,3.0,SR_BATELCOMOBILE_L,SRBATM_L,SRBATM_L,TIME_SR,,,,,SRVIMS,1.0,0.0,0.0,,,,,,,,,,,,,,,4
3,1000007235,364413**,367069**,8/1/2024 12:00:50 AM,28,60,0.0,0.0,ZAIN ONNET MOBILE,ZAIN ONNET MOBILE,8/1/2024 12:05:53 AM,0,7/31/2024,,SRONN_L,70297889,1,L,,700577015,,,,CORP,973,355458120596910,426021532635744.0,,,,TBV2N,0,0.0,0,0,0,0,0.0,0,0,0,0,20.0,,N,A,AIRT,G,0.0,0.0,...,1.0,0.0,60.0,0.0,0.0,0.0,,,BHD,,,,,,,,,,,,,,2.0,SR_ONNET_L,SRONN_L,SRONN_L,TIME_SR,,,,,SRVIMS,1.0,0.0,0.0,,,,,,,,,,,,,,,4
4,1000007738,372282**,363692**,8/1/2024 12:00:03 AM,38,60,0.0,0.0,ZAIN ONNET MOBILE,ZAIN ONNET MOBILE,8/1/2024 12:05:53 AM,0,7/31/2024,,SRONN_L,70297889,1,L,,700168011,,,,INDI,973,357218096189750,426021529160584.0,,,,PPKG1,0,0.0,0,0,0,0,0.0,0,0,0,0,20.0,,N,A,AIRT,G,0.0,0.0,...,1.0,0.0,60.0,0.0,0.0,0.0,,,BHD,,,,,,,,,,,,,,2.0,SR_ONNET_L,SRONN_L,SRONN_L,TIME_SR,,,,,SRVIMS,1.0,0.0,0.0,,,,,,,,,,,,,,,4
