# **Mounting Google Drive**
**NOTE: This notebook was written for Google Colab; the `/content/...` paths used below assume that environment.**

In [None]:
# Mounting Google Drive to save our trained model.
# The drive is mounted at /content/Drive; the model-saving cell writes below that path.
# NOTE(review): force_remount=True presumably redoes the mount when this cell is re-run — confirm in the Colab docs.
from google.colab import drive
drive.mount("/content/Drive", force_remount=True)

Mounted at /content/Drive


# **Imports**

In [None]:
# Basic Imports for Data Preprocessing
import pandas as pd
import numpy as np

# **Preprocessing**

In [None]:
# Loading our Data: training set, test set and the sample submission file
TRAIN_CSV = "/content/train.csv"
TEST_CSV = "/content/test.csv"
SAMPLE_SUBMISSION_CSV = "/content/sample submission.csv"

df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)

In [None]:
# Viewing the Training Data (first three rows)
df.head(3)

Unnamed: 0,id,type,locality,activation_date,latitude,longitude,lease_type,gym,lift,swimming_pool,...,bathroom,facing,cup_board,floor,total_floor,amenities,water_supply,building_type,balconies,rent
0,ff8081815733a243015733b2876600a6,BHK2,"Cauvery Colony,Koramangala",21-04-2018 14:44,12.934471,77.634471,FAMILY,1,1,1,...,2,E,2,6,12,"{""LIFT"":true,""GYM"":true,""INTERNET"":true,""AC"":f...",CORP_BORE,AP,2,40000
1,ff8081815ee25e15015ee50004da2acd,BHK2,Bellandur,09-10-2017 12:48,12.929557,77.67228,ANYONE,0,1,0,...,2,NE,2,3,4,"{""LIFT"":true,""GYM"":false,""INTERNET"":false,""AC""...",CORPORATION,AP,2,22000
2,ff80818163f29c9e0163f46770b873e7,BHK3,Thiruvanmiyur,12-06-2018 22:02,12.98287,80.262012,FAMILY,0,1,0,...,3,E,3,1,5,"{""LIFT"":true,""GYM"":false,""INTERNET"":false,""AC""...",CORP_BORE,AP,3,28000


In [None]:
# Viewing the Testing Data (first three rows)
test_df.head(3)

Unnamed: 0,id,type,locality,activation_date,latitude,longitude,lease_type,gym,lift,swimming_pool,...,property_age,bathroom,facing,cup_board,floor,total_floor,amenities,water_supply,building_type,balconies
0,ff8081815df539bc015df947ce976cca,BHK2,Basavanagudi,22-08-2017 09:00,12.941603,77.568156,FAMILY,0,1,0,...,1,2,N,2,3,3,"{""LIFT"":true,""GYM"":false,""INTERNET"":false,""AC""...",CORP_BORE,AP,1
1,ff80818157288e9301572c05651853a6,BHK2,Rajaji Nagar,17-09-2017 16:33,12.998803,77.561887,ANYONE,0,0,0,...,7,1,S,2,0,3,"{""LIFT"":false,""GYM"":false,""INTERNET"":true,""AC""...",CORPORATION,IF,0
2,ff8081815f1afc58015f1b831fde166e,BHK1,Jeevan Bima Nagar,14-04-2018 17:27,12.966467,77.661063,ANYONE,0,0,0,...,10,1,S,1,0,1,"{""LIFT"":false,""GYM"":false,""INTERNET"":true,""AC""...",CORPORATION,IF,0


In [None]:
# Viewing the sample submission file (first three rows)
sample_submission.head(3)

Unnamed: 0,rent
0,10000
1,14800
2,18000


In [None]:
# Drop the columns judged to have little effect on the rent
low_signal_cols = ["id", "activation_date", "locality"]
df = df.drop(columns=low_signal_cols)
test_df = test_df.drop(columns=low_signal_cols)

In [None]:
# Pull the raw "amenities" values out of both frames as plain Python lists
df_amenities = df["amenities"].tolist()
test_df_amenities = test_df["amenities"].tolist()

In [None]:
# Removing curly brackets and double quotes from each string
def remove_chars(df):
    """Delete every '{', '}' and '"' from each string in ``df``, in place.

    ``df`` is a mutable sequence of strings (the notebook passes plain lists,
    not DataFrames). Each element is overwritten with its cleaned version and
    nothing is returned.
    """
    unwanted = str.maketrans("", "", '{}"')
    for position in range(len(df)):
        df[position] = df[position].translate(unwanted)

# Clean the training and testing amenity strings in place
for amenity_strings in (df_amenities, test_df_amenities):
    remove_chars(amenity_strings)

In [None]:
# Separating data based on ","
def clean_data(df):
    """Replace each string in ``df`` with the list of its comma-separated pieces.

    ``df`` is a mutable sequence of strings; it is modified in place and
    nothing is returned.
    """
    df[:] = [entry.split(",") for entry in df]

# Split the training and testing amenity strings in place
for amenity_strings in (df_amenities, test_df_amenities):
    clean_data(amenity_strings)

In [None]:
# Features stored in the amenities column.
# The names are taken from the part before ":" of each item in the first training row.
amenities_cols = [pair.partition(":")[0] for pair in df_amenities[0]]
amenities_cols

['LIFT',
 'GYM',
 'INTERNET',
 'AC',
 'CLUB',
 'INTERCOM',
 'POOL',
 'CPA',
 'FS',
 'SERVANT',
 'SECURITY',
 'SC',
 'GP',
 'PARK',
 'RWH',
 'STP',
 'HK',
 'PB',
 'VP']

In [None]:
# All these columns are in binary format so encoding them
# 0 -> False and 1 -> True
def get_amenities_values(df, columns=None):
    """Encode parsed amenity flags as 0/1 integers.

    Parameters
    ----------
    df : list of list of str
        One inner list per property; every item has the form "NAME:value".
        Items without a ":" (for example the empty string left over from an
        empty "{}") are ignored instead of raising IndexError.
    columns : sequence of str, optional
        Amenity names that fix the order of every output row. When given,
        each value is looked up by NAME, so rows whose keys are missing or
        ordered differently still line up with ``columns``; amenities absent
        from a row are encoded as 0. When omitted, values are emitted in the
        order in which they appear in each row.

    Returns
    -------
    list of list of int
        One list of 0/1 flags per property (1 only when the value is "true").
    """
    amenities = []

    for row in df:
        parsed = [item.split(":") for item in row]
        # Keep only well-formed "NAME:value" items
        parsed = [parts for parts in parsed if len(parts) > 1]

        if columns is None:
            encoded = [1 if parts[1] == "true" else 0 for parts in parsed]
        else:
            present = {parts[0]: 1 if parts[1] == "true" else 0 for parts in parsed}
            encoded = [present.get(name, 0) for name in columns]

        amenities.append(encoded)

    return amenities

# Align every row to amenities_cols by name so each flag lands in the right column
df_amenities = get_amenities_values(df_amenities, amenities_cols)
test_df_amenities = get_amenities_values(test_df_amenities, amenities_cols)

In [None]:
# Converting the encoded amenities to DataFrames so that we can later concat them with our original data
df_amenities = pd.DataFrame(data=df_amenities, columns=amenities_cols)
test_df_amenities = pd.DataFrame(data=test_df_amenities, columns=amenities_cols)

In [None]:
# Viewing the encoded amenities of the Training Data
# (the whole frame is displayed; the notebook shows only its first and last rows)
df_amenities

Unnamed: 0,LIFT,GYM,INTERNET,AC,CLUB,INTERCOM,POOL,CPA,FS,SERVANT,SECURITY,SC,GP,PARK,RWH,STP,HK,PB,VP
0,1,1,1,0,1,1,1,1,1,0,1,1,1.0,0.0,1.0,1.0,0.0,1.0,1.0
1,1,0,0,0,0,0,0,0,0,0,1,1,0.0,1.0,1.0,1.0,0.0,1.0,1.0
2,1,0,0,0,0,0,0,1,0,0,0,1,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,1,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20495,1,1,1,0,0,1,1,1,0,0,1,1,0.0,0.0,0.0,0.0,0.0,1.0,1.0
20496,0,0,1,0,0,0,0,0,0,0,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
20497,1,1,1,0,1,0,0,1,0,0,1,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
20498,1,0,1,0,0,0,0,1,1,0,1,1,0.0,1.0,0.0,0.0,1.0,1.0,1.0


In [None]:
# Viewing the encoded amenities of the Testing Data
# (the whole frame is displayed; the notebook shows only its first and last rows)
test_df_amenities

Unnamed: 0,LIFT,GYM,INTERNET,AC,CLUB,INTERCOM,POOL,CPA,FS,SERVANT,SECURITY,SC,GP,PARK,RWH,STP,HK,PB,VP
0,1,0,0,0,0,0,0,0,1,0,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,0,1,0,0,0,0,0,0,0,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0,0,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,0,0,1,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1,1,0,1,1,1,1,1,0,1,0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4495,0,0,1,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4496,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4497,0,0,0,0,0,0,0,1,0,0,0,0,,,,,,,
4498,0,1,0,1,0,1,1,1,1,1,1,1,,,,,,,


In [None]:
# Dropping repeated columns: the main frames already carry gym, lift and swimming_pool
repeated_flags = ["GYM", "LIFT", "POOL"]
df_amenities = df_amenities.drop(columns=repeated_flags)
test_df_amenities = test_df_amenities.drop(columns=repeated_flags)

In [None]:
# List of all binary columns: the three flags of the main frames plus the parsed amenities
cols_to_sum = ["gym", "lift", "swimming_pool", *df_amenities.columns]
cols_to_sum

['gym',
 'lift',
 'swimming_pool',
 'INTERNET',
 'AC',
 'CLUB',
 'INTERCOM',
 'CPA',
 'FS',
 'SERVANT',
 'SECURITY',
 'SC',
 'GP',
 'PARK',
 'RWH',
 'STP',
 'HK',
 'PB',
 'VP']

In [None]:
# Appending the amenity flag columns to the right of the original datasets
df = pd.concat([df, df_amenities], axis="columns")
test_df = pd.concat([test_df, test_df_amenities], axis="columns")

In [None]:
# List of columns in the training dataset after adding the amenity flags
df.columns

Index(['type', 'latitude', 'longitude', 'lease_type', 'gym', 'lift',
       'swimming_pool', 'negotiable', 'furnishing', 'parking', 'property_size',
       'property_age', 'bathroom', 'facing', 'cup_board', 'floor',
       'total_floor', 'amenities', 'water_supply', 'building_type',
       'balconies', 'rent', 'INTERNET', 'AC', 'CLUB', 'INTERCOM', 'CPA', 'FS',
       'SERVANT', 'SECURITY', 'SC', 'GP', 'PARK', 'RWH', 'STP', 'HK', 'PB',
       'VP'],
      dtype='object')

In [None]:
# Summing all the binary columns into a single count to reduce the number of features.
# If a property has a service -> True(1) -> most likely it would have a higher price.
# The count overwrites the original "amenities" column.
df["amenities"] = df[cols_to_sum].sum(axis="columns")
test_df["amenities"] = test_df[cols_to_sum].sum(axis="columns")

In [None]:
# The individual flags are now represented by their sum, so drop them
df = df.drop(columns=cols_to_sum)
test_df = test_df.drop(columns=cols_to_sum)

In [None]:
# Viewing the train data after the amenity flags were collapsed into one column
df.head()

Unnamed: 0,type,latitude,longitude,lease_type,negotiable,furnishing,parking,property_size,property_age,bathroom,facing,cup_board,floor,total_floor,amenities,water_supply,building_type,balconies,rent
0,BHK2,12.934471,77.634471,FAMILY,0,SEMI_FURNISHED,BOTH,1250,25,2,E,2,6,12,15.0,CORP_BORE,AP,2,40000
1,BHK2,12.929557,77.67228,ANYONE,1,SEMI_FURNISHED,BOTH,1400,4,2,NE,2,3,4,8.0,CORPORATION,AP,2,22000
2,BHK3,12.98287,80.262012,FAMILY,0,SEMI_FURNISHED,BOTH,1350,6,3,E,3,1,5,6.0,CORP_BORE,AP,3,28000
3,BHK1,12.955991,77.531634,FAMILY,1,SEMI_FURNISHED,TWO_WHEELER,600,3,1,E,1,1,2,0.0,CORPORATION,IH,0,8000
4,BHK3,12.963903,77.649446,FAMILY,1,SEMI_FURNISHED,BOTH,1500,15,3,E,4,0,0,3.0,CORPORATION,IH,1,45000


In [None]:
# Viewing the test data after the amenity flags were collapsed into one column
test_df.head()

Unnamed: 0,type,latitude,longitude,lease_type,negotiable,furnishing,parking,property_size,property_age,bathroom,facing,cup_board,floor,total_floor,amenities,water_supply,building_type,balconies
0,BHK2,12.941603,77.568156,FAMILY,0,SEMI_FURNISHED,BOTH,1300,1,2,N,2,3,3,4.0,CORP_BORE,AP,1
1,BHK2,12.998803,77.561887,ANYONE,1,SEMI_FURNISHED,TWO_WHEELER,600,7,1,S,2,0,3,4.0,CORPORATION,IF,0
2,BHK1,12.966467,77.661063,ANYONE,1,SEMI_FURNISHED,TWO_WHEELER,600,10,1,S,1,0,1,5.0,CORPORATION,IF,0
3,BHK1,12.941533,77.592606,ANYONE,0,NOT_FURNISHED,TWO_WHEELER,500,8,1,E,0,2,2,2.0,CORP_BORE,IF,0
4,BHK3,12.971083,77.751625,ANYONE,1,SEMI_FURNISHED,BOTH,1400,0,3,E,3,3,4,15.0,BOREWELL,AP,1


In [None]:
# Separating features and target (rent) from the training data
target_rent = df["rent"]
df = df.drop(columns="rent")

target_rent

0        40000
1        22000
2        28000
3         8000
4        45000
         ...  
20495    25000
20496    30000
20497    16000
20498    30000
20499     8500
Name: rent, Length: 20500, dtype: int64

In [None]:
# LabelEncoder for encoding the categorical features
from sklearn.preprocessing import LabelEncoder

# os for creating the output directory,
# pickle for storing the fitted encoders to be used in the web app
import os
import pickle

# Directory holding one encoder per categorical column.
# makedirs(exist_ok=True) targets the same absolute path the encoders are written to
# and keeps the cell re-runnable.
ENCODER_DIR = "/content/encoders"
os.makedirs(ENCODER_DIR, exist_ok=True)

for col in df.select_dtypes("object").columns:
    # Fit a fresh encoder on the TRAINING column only; this is the encoder that gets saved
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

    # Reuse the training mapping for the test column so the same category always
    # gets the same integer. Categories never seen in training are encoded as -1.
    codes = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = test_df[col].map(codes).fillna(-1).astype(int)

    # The context manager closes the file even if pickling fails
    with open(os.path.join(ENCODER_DIR, f"{col}_encoder.pkl"), "wb") as output:
        pickle.dump(le, output)

In [None]:
# Viewing the encoded training data (categorical columns now hold integer codes)
df.head()

Unnamed: 0,type,latitude,longitude,lease_type,negotiable,furnishing,parking,property_size,property_age,bathroom,facing,cup_board,floor,total_floor,amenities,water_supply,building_type,balconies
0,1,12.934471,77.634471,3,0,2,0,1250,25,2,0,2,6,12,15.0,2,0,2
1,1,12.929557,77.67228,0,1,2,0,1400,4,2,2,2,3,4,8.0,1,0,2
2,2,12.98287,80.262012,3,0,2,0,1350,6,3,0,3,1,5,6.0,2,0,3
3,0,12.955991,77.531634,3,1,2,3,600,3,1,0,1,1,2,0.0,1,3,0
4,2,12.963903,77.649446,3,1,2,0,1500,15,3,0,4,0,0,3.0,1,3,1


In [None]:
# Viewing the encoded testing data (categorical columns now hold integer codes)
test_df.head()

Unnamed: 0,type,latitude,longitude,lease_type,negotiable,furnishing,parking,property_size,property_age,bathroom,facing,cup_board,floor,total_floor,amenities,water_supply,building_type,balconies
0,1,12.941603,77.568156,3,0,2,0,1300,1,2,1,2,3,3,4.0,2,0,1
1,1,12.998803,77.561887,0,1,2,3,600,7,1,4,2,0,3,4.0,1,2,0
2,0,12.966467,77.661063,0,1,2,3,600,10,1,4,1,0,1,5.0,1,2,0
3,0,12.941533,77.592606,0,0,1,3,500,8,1,0,0,2,2,2.0,2,2,0
4,2,12.971083,77.751625,0,1,2,0,1400,0,3,0,3,3,4,15.0,0,0,1


In [None]:
# Holding out 20% of the labelled training data as a local test split (fixed seed)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df,
    target_rent,
    test_size=0.2,
    random_state=1,
)
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((16400, 18), (4100, 18), (16400,), (4100,))

# **AutoSklearnRegressor**
https://automl.github.io/auto-sklearn/master/api.html#regression

In [None]:
# Using Auto-Sklearn to test multiple models on our data
# Installing auto-sklearn
!pip install auto-sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Imports for auto-sklearn
import autosklearn
from autosklearn.regression import AutoSklearnRegressor

# tmp directory for auto-sklearn files
# (created relative to the current working directory; tmp_folder below points inside /content/tmp)
!mkdir tmp

# Initializing the AutoSklearnRegressor with a 2-hour (7200 s) budget and
# mean squared error as the metric, then training it on the training split
automl = AutoSklearnRegressor(
    time_left_for_this_task=7200,
    tmp_folder="/content/tmp/autosklearn_regression_example_tmp",
    n_jobs=-1,  # NOTE(review): presumably "use all available cores" — confirm in the auto-sklearn docs
    metric=autosklearn.metrics.mean_squared_error,
)
automl.fit(X_train, y_train)

AutoSklearnRegressor(metric=mean_squared_error, n_jobs=-1,
                     per_run_time_limit=1440, time_left_for_this_task=7200,
                     tmp_folder='/content/tmp/autosklearn_regression_example_tmp')

In [None]:
# Viewing the best performing models.
# The leaderboard is left as the cell's last expression so the notebook renders it
# as a table instead of plain printed text.
automl.leaderboard()

          rank  ensemble_weight                 type          cost    duration
model_id                                                                      
51           1             0.20    gradient_boosting  1.312592e+07   13.813656
10           2             0.34    gradient_boosting  1.313160e+07    4.392393
8            3             0.14    gradient_boosting  1.318030e+07   10.334520
57           4             0.04    gradient_boosting  1.328991e+07    8.789360
82           5             0.04    gradient_boosting  1.362980e+07   46.759067
71           6             0.04    gradient_boosting  1.368531e+07  292.600623
47           7             0.08    gradient_boosting  1.419175e+07   54.121039
5            8             0.08    gradient_boosting  1.425833e+07  147.570775
18           9             0.04  k_nearest_neighbors  2.392080e+07    6.817626


In [None]:
from pprint import pprint

# View the list of different trained models in auto-sklearn.
# show_models() is pretty-printed with a 4-space indent; the recorded output is keyed by model id.
pprint(automl.show_models(), indent=4)

{   5: {   'cost': 14258332.349417921,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f4d31a04a10>,
           'ensemble_weight': 0.08,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f4d3044c250>,
           'model_id': 5,
           'rank': 8,
           'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7f4d31197ed0>,
           'sklearn_regressor': HistGradientBoostingRegressor(l2_regularization=6.085630700044881e-10,
                              learning_rate=0.12392806728650493, max_iter=512,
                              min_samples_leaf=25, n_iter_no_change=7,
                              random_state=1, validation_fraction=None,
                              warm_start=True)},
    8: {   'cost': 13180302.023924885,
           'data_preprocessor': <autosklearn.pipeline.components.data_prepro

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Custom function to calculate RMSE, R2 and Adjusted R2
def get_scores(features, target, label="Train", model=None):
    """Print the RMSE, R2 and adjusted R2 of a fitted model on one data split.

    Parameters
    ----------
    features : DataFrame or 2-D array
        Feature matrix; its shape supplies the sample count ``n`` and the
        feature count ``p`` used by the adjusted R2.
    target : array-like
        True target values for the rows of ``features``.
    label : str, default "Train"
        Prefix of the printed lines. The default reproduces the original
        output; pass e.g. ``label="Test"`` when scoring a hold-out split.
    model : object with a ``predict`` method, optional
        Model to evaluate. Defaults to the notebook-level ``automl``.

    Returns
    -------
    None
        The three scores are printed, not returned.
    """
    if model is None:
        model = automl

    pred = model.predict(features)

    # Take the square root of the MSE explicitly: the ``squared=False`` argument
    # is deprecated and rejected by newer scikit-learn releases.
    RMSE = mean_squared_error(target, pred) ** 0.5
    R2 = r2_score(target, pred)

    n, p = features.shape[0], features.shape[1]
    # Adjusted R2 is undefined when there are not more samples than features + 1
    dof = n - p - 1
    Adj_R2 = 1 - (1 - R2) * (n - 1) / dof if dof > 0 else float("nan")

    print(f"{label} RMSE score\t:", RMSE)
    print(f"{label} R2 score\t\t:", R2)
    print(f"{label} Adjusted R2 score\t:", Adj_R2)

In [None]:
# Getting scores (RMSE, R2, adjusted R2) for the training split
get_scores(X_train, y_train)

Train RMSE score	: 2546.740334386172
Train R2 score		: 0.908777589610191
Train Adjusted R2 score	: 0.9086773513227228


In [None]:
# Getting scores (RMSE, R2, adjusted R2) for the held-out test split.
# NOTE: as called here, get_scores prefixes every printed line with "Train",
# so the values below are test-split scores despite that label.
get_scores(X_test, y_test)

Train RMSE score	: 3474.5382345491407
Train R2 score		: 0.8304656329598341
Train Adjusted R2 score	: 0.8297178704980054


In [None]:
# Saving our trained model
import os

filename = "/content/Drive/MyDrive/Hackathon/models/autosklearnregressor_model_2.sav"

# Create the target folder first so the dump cannot fail on a missing directory
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The context manager flushes and closes the file even if pickling raises
with open(filename, "wb") as model_file:
    pickle.dump(automl, model_file)

In [None]:
# Zipping the folder containing the pickled encoders into /content/encoders.zip
# (-r recurses into the folder so every *_encoder.pkl file is included)
!zip -r /content/encoders.zip /content/encoders

  adding: content/encoders/ (stored 0%)
  adding: content/encoders/parking_encoder.pkl (deflated 17%)
  adding: content/encoders/furnishing_encoder.pkl (deflated 19%)
  adding: content/encoders/water_supply_encoder.pkl (deflated 17%)
  adding: content/encoders/building_type_encoder.pkl (deflated 17%)
  adding: content/encoders/lease_type_encoder.pkl (deflated 16%)
  adding: content/encoders/facing_encoder.pkl (deflated 21%)
  adding: content/encoders/type_encoder.pkl (deflated 20%)
