# Smart case study

## import libraly

In [None]:
import gdown
import pandas as pd
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import inflection
from catboost import CatBoostRegressor, Pool
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib


## Dowload dataset from google disk

In [2]:
# URL to the file on Google Drive
url_rents = 'https://drive.google.com/uc?id=1oc_RJRsQEiJutVdWjlRdbJ773YpE9h6x'
output_rents = '../data/snp_dld_2024_rents.csv'
url_trans = 'https://drive.google.com/uc?id=1liNykIOnfR5KRR4MXJISCZCKJYFnXQC1'
output_trans = '../data/snp_dld_2024_transactions.csv'
# Download the file
gdown.download(url_rents, output_rents, quiet=False)
gdown.download(url_trans, output_trans, quiet=False)

# Load data into DataFrame
snp_dld_2024_rents = pd.read_csv(output_rents)
snp_dld_2024_transactions = pd.read_csv(output_trans)
print(snp_dld_2024_rents.shape)
print(snp_dld_2024_transactions.shape)

Downloading...
From (original): https://drive.google.com/uc?id=1oc_RJRsQEiJutVdWjlRdbJ773YpE9h6x
From (redirected): https://drive.google.com/uc?id=1oc_RJRsQEiJutVdWjlRdbJ773YpE9h6x&confirm=t&uuid=c6656d2c-730c-465c-973a-e1cbfb60cf26
To: /workspaces/SB-Case-Study-Materials/data/snp_dld_2024_rents.csv
100%|██████████| 278M/278M [00:05<00:00, 49.6MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1liNykIOnfR5KRR4MXJISCZCKJYFnXQC1
To: /workspaces/SB-Case-Study-Materials/data/snp_dld_2024_transactions.csv
100%|██████████| 70.1M/70.1M [00:01<00:00, 48.0MB/s]
  snp_dld_2024_rents = pd.read_csv(output_rents)


(681447, 43)
(162806, 46)


## Preprocess

### Rentals

In [3]:
def load_data(path):
    """
    Загрузка данных из CSV файлов.
    
    :param rentals_path: Путь к файлу с данными по аренде.
    :param sales_path: Путь к файлу с данными по продажам.
    :return: Два DataFrame с данными по аренде и продажам.
    """
    df = pd.read_csv(path)

    return df

rentals = load_data(output_rents)

  df = pd.read_csv(path)


In [4]:
def preprocess_data_rents(df):
    """
    Data preprocessing: handling missing data, encoding categorical variables, and scaling numerical features.
    
    :param df: DataFrame with raw data.
    :return: DataFrame with preprocessed data.
    """

    # Convert date columns to datetime
    df[['registration_date', 'contract_start_date', 'contract_end_date', 'req_from', 'req_to']] = df[['registration_date', 'contract_start_date', 'contract_end_date', 'req_from', 'req_to']].apply(pd.to_datetime)
    
    # Replace 't'/'f' with True/False
    df['is_freehold'] = df['is_freehold'].replace({'t': True, 'f': False})
    
    # Calculate time deltas
    df['delta_time_registration'] = (df['registration_date'] - df['contract_start_date']).dt.days
    df['delta_time_reg_from'] = (df['registration_date'] - df['req_from']).dt.days
    df['time_reg'] = (df['req_to'] - df['req_from']).dt.days
    df['time_contract'] = (df['contract_end_date'] - df['contract_start_date']).dt.days

    # Sort by version number and drop duplicates

    df.sort_values(['contract_amount', 'version_number','registration_date'], ascending=False, inplace=True)
    
    # Filter out rows with non-positive contract amounts
    df = df.query('contract_amount > 10').copy()

    df = df.drop_duplicates(subset=['ejari_contract_number', 'version_number', 'contract_start_date', 'contract_end_date'], keep='first')

    # Drop unnecessary columns
    df.drop(['is_freehold_text', 'land_property_id', 'property_id', 'property_type_ar', 
             'property_subtype_ar', 'ejari_contract_number',
             'property_usage_ar', 'property_usage_id', 'project_name_ar', 'area_ar', 'nearest_landmark_ar', 
             'nearest_metro_ar', 'nearest_mall_ar', 'master_project_ar', 'ejari_property_type_id', 
             'ejari_property_sub_type_id', 'meta_ts', 'area_id', 'version_text','annual_amount',
             'contract_end_date', 'contract_start_date', 'registration_date', 'req_from', 'req_to'
            ], axis=1, inplace=True)
    
    # Identify numerical and categorical features
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

        # Replace NaN values in categorical features with 'missing'
    # Replace NaN values in categorical features with 'missing'
    df = df.apply(lambda x: x.fillna('missing') if x.dtype == 'object' else x)
        
    categorical_features = df.select_dtypes(include=['object']).columns

    return df, numerical_features, categorical_features



In [5]:
df_preprocess_data_rents, numerical_features_rents, categorical_features_rents = preprocess_data_rents(rentals)
df_preprocess_data_rents

  df['is_freehold'] = df['is_freehold'].replace({'t': True, 'f': False})


Unnamed: 0,version_number,contract_amount,is_freehold,property_size_sqm,parcel_id,property_type_en,property_subtype_en,property_usage_en,total_properties,rooms,...,area_en,nearest_landmark_en,nearest_metro_en,nearest_mall_en,master_project_en,entry_id,delta_time_registration,delta_time_reg_from,time_reg,time_contract
645663,1,4.410800e+08,True,47258.20,3920513.0,Building,Building,Residential,1,,...,Marsa Dubai,Burj Al Arab,Jumeirah Beach Resdency,Marina Mall,missing,94375,1852,25,30,3287
55150,1,1.963500e+08,False,33600.00,1290104.0,Building,Building,Commercial,1,,...,Port Saeed,Dubai International Airport,Deira City Centre,Dubai Mall,missing,94368,40,10,30,3651
475651,1,1.927818e+08,True,1854.36,,Unit,Office,Commercial,4,,...,Zaabeel First,Burj Khalifa,Al Jafiliya Metro Station,Dubai Mall,missing,94373,203,20,29,3651
630696,1,1.868167e+08,False,4337.87,3730917.0,Unit,Shop,Commercial,1,,...,Al Barsha First,Burj Al Arab,Sharaf Dg Metro Station,Mall of the Emirates,missing,94375,364,8,30,3652
55046,1,1.683000e+08,False,23178.00,1290104.0,Building,Building,missing,1,,...,Port Saeed,Dubai International Airport,Deira City Centre,Dubai Mall,missing,94368,40,10,30,3651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610334,3,2.500000e+01,True,84.18,3450663.0,Unit,Flat,Residential,1,,...,Business Bay,Downtown Dubai,Buj Khalifa Dubai Mall Metro Station,Dubai Mall,missing,94375,11,11,30,729
148224,1,2.200000e+01,False,22.00,,Unit,Office,Commercial,1,,...,Saih Shuaib 2,Al Makhtoum International Airport,missing,missing,missing,94369,0,14,28,365
646023,1,2.100000e+01,True,20.00,,Virtual Unit,Office,Commercial,1,,...,Dubai Investment Park First,Expo 2020 Site,Ibn Battuta Metro Station,Ibn-e-Battuta Mall,missing,94375,0,25,30,364
549785,1,2.100000e+01,False,18.59,1280540.0,Unit,Office,Commercial,1,,...,Al Khabeesi,Dubai International Airport,Al Rigga Metro Station,Dubai Mall,missing,94374,0,10,29,364


### Trans

In [6]:
trans = load_data(output_trans)

In [7]:
def preprocess_data_trans(df):
    """
    Data preprocessing: handling missing data, encoding categorical variables, and scaling numerical features.
    
    :param df: DataFrame with raw data.
    :return: DataFrame with preprocessed data.
    """

    df['is_offplan']= df['is_offplan'].replace({'t': True, 'f': False})
    df['is_freehold'] = df['is_freehold_text'].replace({'Free Hold': True, 'Non Free Hold': False})
    df.drop_duplicates(subset=['transaction_number', 'transaction_datetime', 'req_from', 'req_to', 'amount', 
                            'registration_type_en'], keep='last', inplace=True)

        # Filter out rows with non-positive contract amounts
    df = df.query('amount > 0').copy()
        # Identify numerical and categorical features
    # Calculate time deltas
    df[['req_to', 'transaction_datetime', 'req_from']] = df[[ 'transaction_datetime', 'req_from', 'req_to']].apply(pd.to_datetime)

    df['delta_time_reg_to'] = (df['req_to'] - df['transaction_datetime']).dt.days
    df['delta_time_reg_from'] = (df['req_from'] - df['transaction_datetime'] ).dt.days
    df['time_reg'] = (df['req_from'] - df['req_to']).dt.days

    # Sort by version number and drop duplicates
    df.sort_values(['transaction_datetime', 'amount'], ascending=False, inplace=True)
    df.drop_duplicates(subset=['transaction_number', 'transaction_datetime', 'req_from', 'req_to', 'amount', 
                            'registration_type_en'], keep='last', inplace=True)

    df.drop(['property_usage_id', 'transaction_subtype_id', 'transaction_type_id', 'property_id', 
            'transaction_subtype_en', 'is_freehold_text',
            'property_type_ar','property_type_id', 'property_subtype_ar',
            'property_subtype_id', 'building_age', 'rooms_ar', 'project_name_ar',
            'area_ar', 'area_id', 'nearest_landmark_ar', 'nearest_metro_ar', 'nearest_mall_ar',
            'master_project_ar', 'meta_ts', 'transaction_number',
            'req_from', 'transaction_datetime', 'req_to'
            ], axis=1, inplace=True)   

    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
    # Replace NaN values in categorical features with 'missing'
    df = df.apply(lambda x: x.fillna('missing') if x.dtype == 'object' else x)
        
    categorical_features = df.select_dtypes(include=['object']).columns
    return df, numerical_features, categorical_features

In [8]:
df_preprocess_data_trans, numerical_features_trans, categorical_features_trans = preprocess_data_trans(trans)
df_preprocess_data_trans

  df['is_offplan']= df['is_offplan'].replace({'t': True, 'f': False})
  df['is_freehold'] = df['is_freehold_text'].replace({'Free Hold': True, 'Non Free Hold': False})


Unnamed: 0,transaction_type_en,registration_type_en,property_usage_en,amount,total_buyer,total_seller,transaction_size_sqm,property_size_sqm,parcel_id,is_offplan,...,project_name_en,area_en,nearest_landmark_en,nearest_metro_en,nearest_mall_en,master_project_en,entry_id,delta_time_reg_to,delta_time_reg_from,time_reg
159419,Mortgage,Ready,Residential,2.228449e+09,0,0,7432243.24,7432243.24,,False,...,missing,Mugatrah,missing,missing,missing,missing,94367,24,24,-1
159658,Mortgage,Ready,Commercial,5.520000e+08,0,0,10703.66,10703.66,,False,...,missing,Al Thanyah Fifth,missing,missing,missing,missing,94367,23,24,0
162252,Mortgage,Ready,Residential,5.203507e+08,0,0,31384.41,31384.41,,False,...,missing,DUBAI INTERNATIONAL ACADEMIC CITY,missing,missing,City Centre Mirdif,missing,94367,5,24,18
157251,Mortgage,Ready,Residential,5.030000e+08,0,0,12897.82,13747.52,6008142.0,False,...,missing,CITY OF ARABIA,IMG World Adventures,missing,missing,missing,94367,4,24,19
159615,Mortgage,Ready,Residential,4.000000e+08,0,0,5416.09,5416.09,3450566.0,False,...,missing,BUSINESS BAY,Downtown Dubai,Business Bay Metro Station,Dubai Mall,missing,94367,12,24,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8669,Sales,Ready,Residential,6.195700e+04,1,1,205.22,205.22,,False,...,"THE ROYAL ATLANTIS,RESORT AND RESIDENCES",PALM JUMEIRAH,Burj Al Arab,Al Sufouh,Marina Mall,missing,94359,30,30,-1
14895,Gifts,Ready,Residential,5.995989e+04,0,0,1.00,1136.76,1260815.0,False,...,missing,Abu Hail,Dubai International Airport,Abu Baker Al Siddique Metro Station,missing,missing,94359,9,30,20
8888,Sales,Ready,Residential,5.804500e+04,1,1,8.55,225.66,1271225.0,False,...,missing,Hor Al Anz,Dubai International Airport,Abu Hail Metro Station,City Centre Mirdif,missing,94359,22,30,7
14684,Gifts,Ready,Residential,3.017709e+04,0,0,12.34,136.53,3260865.0,False,...,D1,JADDAF WATERFRONT,Dubai International Airport,Al Jadaf Metro Station,City Centre Mirdif,missing,94359,22,30,7


## Model creation

### ALL Features, gain computer

In [None]:
# Assuming 'contract_amount' is the target variable
target_column_rents = 'contract_amount'

In [10]:
def evaluate_model(y_true, y_pred):
    """
    Evaluate the model using RMSE, R2 Score, and MAE.
    
    :param y_true: True target values.
    :param y_pred: Predicted target values.
    :return: Dictionary with evaluation metrics.
    """
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return {"RMSE": rmse, "R2 Score": r2, "MAE": mae}

In [None]:
def cross_validate_model(X, y, cat_features, cv=5):
    """
    Perform cross-validation on the CatBoost model and return the best model.
    
    :param X: DataFrame with features.
    :param y: Series with target variable.
    :param cat_features: List of categorical feature names.
    :param cv: Number of cross-validation folds.
    :return: Mean and standard deviation of RMSE scores, best model.
    """
    model = CatBoostRegressor(loss_function='RMSE', verbose=0)
    param_grid = {
        'iterations': [100, 200],
        'depth': [4, 6, 8],
        'learning_rate': [0.01, 0.1]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X, y, cat_features=cat_features)
    
    best_model = grid_search.best_estimator_
    best_score = grid_search.best_score_
    rmse_scores = (-best_score) ** 0.5
    
    return rmse_scores, best_model

In [12]:
def train_and_evaluate_model(df, target_column, categorical_features):
    """
    Train and evaluate a CatBoost model.
    
    :param df: DataFrame with preprocessed data.
    :param target_column: Name of the target column.
    :param categorical_features: List of categorical feature names.
    :return: Trained model.
    """
    # Remove rows with NaN values in the target column
    df = df.dropna(subset=[target_column])
    
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Perform cross-validation and get the best model
    rmse, best_model = cross_validate_model(X, y, categorical_features)
    print(f"Cross-validated RMSE: {rmse:.2f}")
    
    # Train the best model on the entire dataset
    best_model.fit(X, y, cat_features=categorical_features)
    
    return best_model


In [13]:
def save_model(model, filename):
    model.save_model(f'../models/{filename}', format="cbm")

In [None]:

# Train and evaluate model
model_rents = train_and_evaluate_model(df_preprocess_data_rents, target_column_rents, categorical_features_rents.tolist())

# Save the model
save_model(model_rents, 'catboost_model_rents.cbm')