# Smart case study

## import libraly

In [1]:
import gdown
import pandas as pd
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import inflection


## Dowload dataset from google disk

In [None]:
# URL to the file on Google Drive
url_rents = 'https://drive.google.com/uc?id=1oc_RJRsQEiJutVdWjlRdbJ773YpE9h6x'
output_rents = '../data/snp_dld_2024_rents.csv'
url_trans = 'https://drive.google.com/uc?id=1liNykIOnfR5KRR4MXJISCZCKJYFnXQC1'
output_trans = '../data/snp_dld_2024_transactions.csv'
# Download the file
gdown.download(url_rents, output_rents, quiet=False)
gdown.download(url_trans, output_trans, quiet=False)

# Load data into DataFrame
snp_dld_2024_rents = pd.read_csv(output_rents)
snp_dld_2024_transactions = pd.read_csv(output_trans)
print(snp_dld_2024_rents.shape)
print(snp_dld_2024_transactions.shape)

Downloading...
From (original): https://drive.google.com/uc?id=1oc_RJRsQEiJutVdWjlRdbJ773YpE9h6x
From (redirected): https://drive.google.com/uc?id=1oc_RJRsQEiJutVdWjlRdbJ773YpE9h6x&confirm=t&uuid=a57831e4-947c-44b6-8a66-3c42ba87390b
To: /workspaces/SB-Case-Study-Materials/data/snp_dld_2024_rents.csv
100%|██████████| 278M/278M [00:04<00:00, 59.4MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1liNykIOnfR5KRR4MXJISCZCKJYFnXQC1
To: /workspaces/SB-Case-Study-Materials/data/snp_dld_2024_transactions.csv
100%|██████████| 70.1M/70.1M [00:04<00:00, 16.7MB/s]


## Preprocess

### Rentals

In [None]:
def load_data(path):
    """
    Загрузка данных из CSV файлов.
    
    :param rentals_path: Путь к файлу с данными по аренде.
    :param sales_path: Путь к файлу с данными по продажам.
    :return: Два DataFrame с данными по аренде и продажам.
    """
    df = pd.read_csv(path)

    return df

rentals = load_data(output_rents)

  df = pd.read_csv(path)


In [None]:
def preprocess_data_rents(df):
    """
    Data preprocessing: handling missing data, encoding categorical variables, and scaling numerical features.
    
    :param df: DataFrame with raw data.
    :return: DataFrame with preprocessed data.
    """

    # Convert date columns to datetime
    df[['registration_date', 'contract_start_date', 'contract_end_date', 'req_from', 'req_to']] = df[['registration_date', 'contract_start_date', 'contract_end_date', 'req_from', 'req_to']].apply(pd.to_datetime)
    
    # Replace 't'/'f' with True/False
    df['is_freehold'] = df['is_freehold'].replace({'t': True, 'f': False})
    
    # Calculate time deltas
    df['delta_time_registration'] = (df['registration_date'] - df['contract_start_date']).dt.days
    df['delta_time_reg_from'] = (df['registration_date'] - df['req_from']).dt.days
    df['time_reg'] = (df['req_to'] - df['req_from']).dt.days
    df['time_contract'] = (df['contract_end_date'] - df['contract_start_date']).dt.days

    # Sort by version number and drop duplicates

    df.sort_values(['contract_amount', 'version_number','registration_date'], ascending=False, inplace=True)
    
    # Filter out rows with non-positive contract amounts
    df = df.query('contract_amount > 10').copy()

    df = df.drop_duplicates(subset=['ejari_contract_number', 'version_number', 'contract_start_date', 'contract_end_date'], keep='first')

    # Drop unnecessary columns
    df.drop(['is_freehold_text', 'land_property_id', 'property_id', 'property_type_ar', 
             'property_subtype_ar', 'ejari_contract_number',
             'property_usage_ar', 'property_usage_id', 'project_name_ar', 'area_ar', 'nearest_landmark_ar', 
             'nearest_metro_ar', 'nearest_mall_ar', 'master_project_ar', 'ejari_property_type_id', 
             'ejari_property_sub_type_id', 'meta_ts', 'area_id', 'version_text','contract_amount',
             'contract_end_date', 'contract_start_date', 'registration_date', 'req_from', 'req_to'
            ], axis=1, inplace=True)
    
    # Identify numerical and categorical features
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

    
    categorical_features = df.select_dtypes(include=['object', 'bool']).columns

    return df, numerical_features, categorical_features



In [None]:
df_preprocess_data_rents, numerical_features, categorical_features = preprocess_data_rents(rentals)
df_preprocess_data_rents

  df['is_freehold'] = df['is_freehold'].replace({'t': True, 'f': False})


Unnamed: 0,version_number,contract_amount,annual_amount,is_freehold,property_size_sqm,parcel_id,property_type_en,property_subtype_en,property_usage_en,total_properties,...,area_en,nearest_landmark_en,nearest_metro_en,nearest_mall_en,master_project_en,entry_id,delta_time_registration,delta_time_reg_from,time_reg,time_contract
645663,1,4.410800e+08,49008888.89,True,47258.20,3920513.0,Building,Building,Residential,1,...,Marsa Dubai,Burj Al Arab,Jumeirah Beach Resdency,Marina Mall,,94375,1852,25,30,3287
55150,1,1.963500e+08,21823402.28,False,33600.00,1290104.0,Building,Building,Commercial,1,...,Port Saeed,Dubai International Airport,Deira City Centre,Dubai Mall,,94368,40,10,30,3651
475651,1,1.927818e+08,19283541.28,True,1854.36,,Unit,Office,Commercial,4,...,Zaabeel First,Burj Khalifa,Al Jafiliya Metro Station,Dubai Mall,,94373,203,20,29,3651
630696,1,1.868167e+08,18681673.25,False,4337.87,3730917.0,Unit,Shop,Commercial,1,...,Al Barsha First,Burj Al Arab,Sharaf Dg Metro Station,Mall of the Emirates,,94375,364,8,30,3652
55046,1,1.683000e+08,18705773.39,False,23178.00,1290104.0,Building,Building,,1,...,Port Saeed,Dubai International Airport,Deira City Centre,Dubai Mall,,94368,40,10,30,3651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536873,2,5.000000e-01,1.94,False,85.60,2450511.0,Unit,Flat,Residential,2,...,Muhaisanah Fourth,Dubai International Airport,Etisalat Metro Station,City Centre Mirdif,,94374,26,8,29,94
671589,12,1.000000e-02,0.01,True,118.20,2514313.0,Unit,Flat,Residential,1,...,Mirdif,Dubai International Airport,Rashidiya Metro Station,City Centre Mirdif,,94375,295,21,30,365
587543,10,1.000000e-02,0.01,True,66.44,2514114.0,Unit,Flat,Residential,1,...,Mirdif,Dubai International Airport,Rashidiya Metro Station,City Centre Mirdif,,94375,280,21,30,365
588961,7,1.000000e-02,0.01,True,71.67,2514114.0,Unit,Flat,Residential,1,...,Mirdif,Dubai International Airport,Rashidiya Metro Station,City Centre Mirdif,,94375,182,21,30,365


### Trans

In [None]:
trans = load_data(output_trans)

In [None]:
def preprocess_data_trans(df):
    """
    Data preprocessing: handling missing data, encoding categorical variables, and scaling numerical features.
    
    :param df: DataFrame with raw data.
    :return: DataFrame with preprocessed data.
    """

    df['is_offplan']= df['is_offplan'].replace({'t': True, 'f': False})
    df['is_freehold'] = df['is_freehold_text'].replace({'Free Hold': True, 'Non Free Hold': False})
    df.drop_duplicates(subset=['transaction_number', 'transaction_datetime', 'req_from', 'req_to', 'amount', 
                            'registration_type_en'], keep='last', inplace=True)

        # Filter out rows with non-positive contract amounts
    df = df.query('amount > 0').copy()
        # Identify numerical and categorical features
    # Calculate time deltas
    df[['req_to', 'transaction_datetime', 'req_from']] = df[[ 'transaction_datetime', 'req_from', 'req_to']].apply(pd.to_datetime)

    df['delta_time_reg_to'] = (df['req_to'] - df['transaction_datetime']).dt.days
    df['delta_time_reg_from'] = (df['req_from'] - df['transaction_datetime'] ).dt.days
    df['time_reg'] = (df['req_from'] - df['req_to']).dt.days

    # Sort by version number and drop duplicates
    df.sort_values(['transaction_datetime', 'amount'], ascending=False, inplace=True)
    df.drop_duplicates(subset=['transaction_number', 'transaction_datetime', 'req_from', 'req_to', 'amount', 
                            'registration_type_en'], keep='last', inplace=True)

    df.drop(['property_usage_id', 'transaction_subtype_id', 'transaction_type_id', 'property_id', 
            'transaction_subtype_en', 'is_freehold_text',
            'property_type_ar','property_type_id', 'property_subtype_ar',
            'property_subtype_id', 'building_age', 'rooms_ar', 'project_name_ar',
            'area_ar', 'area_id', 'nearest_landmark_ar', 'nearest_metro_ar', 'nearest_mall_ar',
            'master_project_ar', 'meta_ts', 'transaction_number',
            'req_from', 'transaction_datetime', 'req_to'
            ], axis=1, inplace=True)   

    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

        
    categorical_features = df.select_dtypes(include=['object', 'bool']).columns
    return df, numerical_features, categorical_features

In [None]:
df_preprocess_data_trans, numerical_features, categorical_features = preprocess_data_trans(trans)
df_preprocess_data_trans

  df['is_freehold'] = df['is_freehold_text'].replace({'Free Hold': True, 'Non Free Hold': False})


Unnamed: 0,transaction_type_en,registration_type_en,property_usage_en,amount,total_buyer,total_seller,transaction_size_sqm,property_size_sqm,parcel_id,is_offplan,...,project_name_en,area_en,nearest_landmark_en,nearest_metro_en,nearest_mall_en,master_project_en,entry_id,delta_time_reg_to,delta_time_reg_from,time_reg
159419,Mortgage,Ready,Residential,2.228449e+09,0,0,7432243.24,7432243.24,,False,...,,Mugatrah,,,,,94367,24,24,-1
159658,Mortgage,Ready,Commercial,5.520000e+08,0,0,10703.66,10703.66,,False,...,,Al Thanyah Fifth,,,,,94367,23,24,0
162252,Mortgage,Ready,Residential,5.203507e+08,0,0,31384.41,31384.41,,False,...,,DUBAI INTERNATIONAL ACADEMIC CITY,,,City Centre Mirdif,,94367,5,24,18
157251,Mortgage,Ready,Residential,5.030000e+08,0,0,12897.82,13747.52,6008142.0,False,...,,CITY OF ARABIA,IMG World Adventures,,,,94367,4,24,19
159615,Mortgage,Ready,Residential,4.000000e+08,0,0,5416.09,5416.09,3450566.0,False,...,,BUSINESS BAY,Downtown Dubai,Business Bay Metro Station,Dubai Mall,,94367,12,24,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8669,Sales,Ready,Residential,6.195700e+04,1,1,205.22,205.22,,False,...,"THE ROYAL ATLANTIS,RESORT AND RESIDENCES",PALM JUMEIRAH,Burj Al Arab,Al Sufouh,Marina Mall,,94359,30,30,-1
14895,Gifts,Ready,Residential,5.995989e+04,0,0,1.00,1136.76,1260815.0,False,...,,Abu Hail,Dubai International Airport,Abu Baker Al Siddique Metro Station,,,94359,9,30,20
8888,Sales,Ready,Residential,5.804500e+04,1,1,8.55,225.66,1271225.0,False,...,,Hor Al Anz,Dubai International Airport,Abu Hail Metro Station,City Centre Mirdif,,94359,22,30,7
14684,Gifts,Ready,Residential,3.017709e+04,0,0,12.34,136.53,3260865.0,False,...,D1,JADDAF WATERFRONT,Dubai International Airport,Al Jadaf Metro Station,City Centre Mirdif,,94359,22,30,7
