In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.pipeline import make_pipeline, Pipeline

%matplotlib inline

In [2]:
os.getcwd()

'E:\\00_Learning\\01. Courses\\05. ML Pipeline\\01. Code'

In [3]:
os.chdir("..")

In [4]:
input_path = os.path.join(os.getcwd(),'02. Input') 

In [5]:
os.path.join(input_path,'airbnb_listings.csv')

'E:\\00_Learning\\01. Courses\\05. ML Pipeline\\02. Input\\airbnb_listings.csv'

In [6]:
data = pd.read_csv(os.path.join(input_path,'airbnb_listings.csv'))

In [7]:
data.shape

(2329, 75)

In [8]:
data.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,70820.0,https://www.airbnb.com/rooms/70820,20230300000000.0,28-03-2023,previous scrape,City View - Sarah's double room.,A good sized room with a comfy double bed and ...,The neighbourhood is friendly and welcoming; m...,https://a0.muscache.com/pictures/b33c30be-56a0...,360195,...,4.94,4.75,4.77,,f,5,0,5,0,1.56
1,117122.0,https://www.airbnb.com/rooms/117122,20230300000000.0,28-03-2023,city scrape,City Centre-Waterside Retreat,My Harbourside apartment centrally located min...,This area is on the water and has splendid vie...,https://a0.muscache.com/pictures/f6d39021-af07...,591555,...,4.99,4.97,4.88,,f,1,0,1,0,1.21
2,176306.0,https://www.airbnb.com/rooms/176306,20230300000000.0,28-03-2023,city scrape,The White Room - Central Bristol Art House Ga...,"A small double room in an artist's house, with...","Southville is pretty darn hip, frankly. North ...",https://a0.muscache.com/pictures/e1e7fdfc-9db5...,708175,...,4.93,4.74,4.7,,f,1,0,1,0,0.37


In [9]:
data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', '0',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_up

In [10]:
keep_columns = [
       'host_response_time', 'host_response_rate', 'host_acceptance_rate','host_is_superhost', 
       #'host_listings_count', 'host_total_listings_count', 
       'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       # 'latitude', 'longitude', 
       'property_type', 'room_type', 'accommodates',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities',
       #'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       #'maximum_minimum_nights', 'minimum_maximum_nights',
       #'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       #'maximum_nights_avg_ntm', 'calendar_updated', 
       #'has_availability','availability_30', 
       #'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 
       #'first_review', 'last_review', 
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 
       #'license', 
       'instant_bookable', 'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 
       #'reviews_per_month', 
       'price']

In [11]:
data = data[keep_columns]

In [12]:
data['neighbourhood'] = data['neighbourhood'].fillna('')

In [13]:
data = data[data['neighbourhood'].str.lower().str.contains('bristol')]

In [14]:
data['neighbourhood'].value_counts()

City of Bristol, England, United Kingdom           363
Bristol City, England, United Kingdom              320
Bristol, England, United Kingdom                   192
Bristol, United Kingdom                             97
Bristol, City of Bristol, United Kingdom            10
Bristol, Avon, United Kingdom                        8
Clifton, Bristol, United Kingdom                     6
Bristol, UK, United Kingdom                          6
City of Bristol, Bristol, United Kingdom             6
Redland, Bristol, United Kingdom                     4
Bristol City, Southville, United Kingdom             3
Bristol, Somerset, United Kingdom                    3
Bristol , England, United Kingdom                    2
City of Bristol, Avon, United Kingdom                2
Bristol, St Andrews, United Kingdom                  1
Bristol, south glos, United Kingdom                  1
Bristol, Westbury on Trym, United Kingdom            1
Easton, Bristol, United Kingdom                      1
Bristol, D

### 01. Understanding the data

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1032 entries, 0 to 2325
Data columns (total 30 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   host_response_time                            885 non-null    object 
 1   host_response_rate                            885 non-null    object 
 2   host_acceptance_rate                          982 non-null    object 
 3   host_is_superhost                             1032 non-null   object 
 4   host_verifications                            1032 non-null   object 
 5   host_has_profile_pic                          1032 non-null   object 
 6   host_identity_verified                        1032 non-null   object 
 7   neighbourhood                                 1032 non-null   object 
 8   property_type                                 1032 non-null   object 
 9   room_type                                     1032 non-null   o

#### 01.a Exploring columns with Object data type

In [16]:
obj_cols = data.select_dtypes(include=['object']).columns

In [17]:
obj_data = data[obj_cols]

In [18]:
obj_cols

Index(['host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_verifications', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood', 'property_type', 'room_type',
       'bathrooms_text', 'amenities', 'instant_bookable', 'price'],
      dtype='object')

In [19]:
obj_data['host_response_time'].isna().sum(), obj_data['host_response_time'].value_counts()

(147,
 within an hour        689
 within a few hours    135
 within a day           57
 a few days or more      4
 Name: host_response_time, dtype: int64)

In [20]:
obj_data['host_response_rate'].isna().sum(), obj_data['host_response_rate'].value_counts()

(147,
 100%    675
 98%      65
 99%      61
 90%      11
 50%       9
 97%       8
 88%       6
 94%       6
 91%       6
 0%        4
 89%       4
 86%       4
 93%       4
 80%       3
 83%       3
 92%       3
 96%       3
 67%       2
 63%       2
 75%       2
 70%       1
 95%       1
 87%       1
 60%       1
 Name: host_response_rate, dtype: int64)

In [21]:
obj_data['host_acceptance_rate'].isna().sum(), obj_data['host_acceptance_rate'].value_counts()

(50,
 100%    275
 98%     130
 99%      68
 97%      62
 72%      61
        ... 
 69%       1
 59%       1
 44%       1
 47%       1
 64%       1
 Name: host_acceptance_rate, Length: 62, dtype: int64)

In [22]:
obj_data['host_is_superhost'].isna().sum(), obj_data['host_is_superhost'].value_counts()

(0,
 f    650
 t    382
 Name: host_is_superhost, dtype: int64)

In [23]:
obj_data['host_verifications'].isna().sum(), obj_data['host_verifications'].value_counts()

(0,
 ['email', 'phone']                  810
 ['email', 'phone', 'work_email']    165
 ['phone']                            42
 ['phone', 'work_email']              12
 []                                    2
 ['email']                             1
 Name: host_verifications, dtype: int64)

In [24]:
obj_data['host_has_profile_pic'].isna().sum(), obj_data['host_has_profile_pic'].value_counts()

(0,
 t    1029
 f       3
 Name: host_has_profile_pic, dtype: int64)

In [25]:
obj_data['host_identity_verified'].isna().sum(), obj_data['host_identity_verified'].value_counts()

(0,
 t    988
 f     44
 Name: host_identity_verified, dtype: int64)

In [26]:
obj_data['neighbourhood'].isna().sum(), obj_data['neighbourhood'].value_counts()

(0,
 City of Bristol, England, United Kingdom           363
 Bristol City, England, United Kingdom              320
 Bristol, England, United Kingdom                   192
 Bristol, United Kingdom                             97
 Bristol, City of Bristol, United Kingdom            10
 Bristol, Avon, United Kingdom                        8
 Clifton, Bristol, United Kingdom                     6
 Bristol, UK, United Kingdom                          6
 City of Bristol, Bristol, United Kingdom             6
 Redland, Bristol, United Kingdom                     4
 Bristol City, Southville, United Kingdom             3
 Bristol, Somerset, United Kingdom                    3
 Bristol , England, United Kingdom                    2
 City of Bristol, Avon, United Kingdom                2
 Bristol, St Andrews, United Kingdom                  1
 Bristol, south glos, United Kingdom                  1
 Bristol, Westbury on Trym, United Kingdom            1
 Easton, Bristol, United Kingdom            

In [27]:
obj_data['property_type'].isna().sum(), obj_data['property_type'].value_counts()

(0,
 Entire rental unit                    234
 Private room in home                  177
 Entire home                           170
 Private room in rental unit            99
 Entire serviced apartment              65
 Private room in townhouse              63
 Entire condo                           62
 Entire townhouse                       36
 Private room in condo                  24
 Private room in bed and breakfast      15
 Entire guest suite                     14
 Entire loft                            10
 Tiny home                               9
 Room in serviced apartment              9
 Entire guesthouse                       8
 Entire cottage                          6
 Private room in guest suite             5
 Private room in bungalow                3
 Private room                            3
 Room in aparthotel                      3
 Private room in serviced apartment      2
 Private room in tiny home               2
 Private room in loft                    2
 Room i

In [28]:
obj_data['room_type'].isna().sum(), obj_data['room_type'].value_counts()

(0,
 Entire home/apt    619
 Private room       401
 Hotel room          11
 Shared room          1
 Name: room_type, dtype: int64)

In [29]:
obj_data['bathrooms_text'].isna().sum(), obj_data['bathrooms_text'].value_counts()

(1,
 1 bath              432
 1 shared bath       164
 1 private bath      136
 2 baths              90
 1.5 baths            78
 1.5 shared baths     43
 2.5 baths            32
 3 baths              22
 2 shared baths        9
 3.5 baths             6
 4.5 baths             4
 Shared half-bath      4
 2.5 shared baths      3
 5 baths               3
 3 shared baths        2
 7 baths               1
 6 baths               1
 Half-bath             1
 Name: bathrooms_text, dtype: int64)

In [30]:
obj_data['amenities'].isna().sum(), obj_data['amenities'].value_counts()

(0,
 ["Hot water", "Lock on bedroom door", "Wifi", "Dedicated workspace", "Essentials", "First aid kit", "Heating", "Elevator", "Bed linens", "Long term stays allowed", "Smoke alarm"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 30
 ["Hot water", "Lock on bedroom door", "Wifi", "Dedicated workspace", "Essentials", "First aid kit", "Hot water kettle", "Heating", "Elevator", "Bed linens", "Long term stays allowed", "Smoke alarm"]                      

In [31]:
# obj_data['has_availability'].isna().sum(), obj_data['has_availability'].value_counts()

In [32]:
obj_data['instant_bookable'].isna().sum(), obj_data['instant_bookable'].value_counts()

(0,
 f    792
 t    240
 Name: instant_bookable, dtype: int64)

#### 01.b Exploring columns with Numerical data type

In [33]:
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

In [34]:
numerical_cols

Index(['accommodates', 'bedrooms', 'beds', 'number_of_reviews_ltm',
       'number_of_reviews_l30d', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],
      dtype='object')

In [35]:
numerical_data = data[numerical_cols]

In [36]:
numerical_data.describe()

Unnamed: 0,accommodates,bedrooms,beds,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
count,1032.0,979.0,1005.0,1032.0,1032.0,920.0,918.0,918.0,918.0,918.0,918.0,918.0,1032.0,1032.0,1032.0,1032.0
mean,3.302326,1.642492,2.039801,16.679264,1.246124,4.734359,4.812015,4.761351,4.864357,4.834455,4.782614,4.672985,17.626938,12.415698,4.531008,0.001938
std,2.347931,1.082899,1.793194,22.841987,2.141793,0.417342,0.311637,0.317527,0.311846,0.359569,0.276998,0.363929,38.103073,33.725897,13.903611,0.044001
min,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,2.0,1.0,1.0,1.0,0.0,4.67,4.77,4.67,4.85,4.83,4.72,4.6,1.0,0.0,0.0,0.0
50%,2.0,1.0,1.0,8.0,0.0,4.85,4.9,4.85,4.94,4.94,4.84,4.77,2.0,1.0,0.0,0.0
75%,4.0,2.0,2.0,23.0,2.0,4.94,4.9675,4.96,5.0,5.0,4.94,4.88,6.0,2.0,2.0,0.0
max,16.0,9.0,21.0,170.0,15.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,148.0,134.0,60.0,1.0


In [37]:
numerical_data.shape

(1032, 16)

### 2. Cleaning the Data

#### 2.a Cleaning numerical data

In [38]:
for col in numerical_cols:
    if numerical_data[col].isna().sum():
        if numerical_data[col].dtype == 'int64':
            numerical_data[col] = numerical_data[col].fillna(numerical_data[col].median())
        elif numerical_data[col].dtype == 'float64':
            numerical_data[col] = numerical_data[col].fillna(numerical_data[col].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_data[col] = numerical_data[col].fillna(numerical_data[col].mean())


In [39]:
numerical_data['beds'] = numerical_data['beds'].astype('int64')
numerical_data['bedrooms'] = numerical_data['bedrooms'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_data['beds'] = numerical_data['beds'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numerical_data['bedrooms'] = numerical_data['bedrooms'].astype('int64')


In [40]:
numerical_data = numerical_data.reset_index(drop=True)

In [41]:
numerical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 16 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   accommodates                                  1032 non-null   int64  
 1   bedrooms                                      1032 non-null   int64  
 2   beds                                          1032 non-null   int64  
 3   number_of_reviews_ltm                         1032 non-null   int64  
 4   number_of_reviews_l30d                        1032 non-null   int64  
 5   review_scores_rating                          1032 non-null   float64
 6   review_scores_accuracy                        1032 non-null   float64
 7   review_scores_cleanliness                     1032 non-null   float64
 8   review_scores_checkin                         1032 non-null   float64
 9   review_scores_communication                   1032 non-null   f

#### 2.b Cleaning object type data

In [42]:
impute = SimpleImputer(strategy='most_frequent')

In [43]:
for col in obj_cols:
    if obj_data[col].isna().sum():
        print(col)
        obj_data[col] = impute.fit_transform(obj_data[[col]])

host_response_time
host_response_rate
host_acceptance_rate
bathrooms_text


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data[col] = impute.fit_transform(obj_data[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data[col] = impute.fit_transform(obj_data[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data[col] = impute.fit_transform(obj_data[[col]])
A value is trying to be set on a copy of a 

In [44]:
obj_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1032 entries, 0 to 2325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   host_response_time      1032 non-null   object
 1   host_response_rate      1032 non-null   object
 2   host_acceptance_rate    1032 non-null   object
 3   host_is_superhost       1032 non-null   object
 4   host_verifications      1032 non-null   object
 5   host_has_profile_pic    1032 non-null   object
 6   host_identity_verified  1032 non-null   object
 7   neighbourhood           1032 non-null   object
 8   property_type           1032 non-null   object
 9   room_type               1032 non-null   object
 10  bathrooms_text          1032 non-null   object
 11  amenities               1032 non-null   object
 12  instant_bookable        1032 non-null   object
 13  price                   1032 non-null   object
dtypes: object(14)
memory usage: 120.9+ KB


### 3. Creating Analytical Data Set (ADS)

In [45]:
ads = pd.DataFrame()

In [46]:
ohe = OneHotEncoder()

In [47]:
# host_response_time
host_response_time = pd.DataFrame(ohe.fit_transform(obj_data[['host_response_time']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, host_response_time], axis = 1)

In [48]:
# host_response_rate
def response_cat(response_rate):
    if response_rate == 100:
        return '100'
    elif response_rate >= 95 and response_rate <100:
        return 'over_95'
    elif response_rate >= 90 and response_rate <95:
        return 'over_90'
    elif response_rate >= 70 and response_rate <90:
        return 'over_70'
    else:
        return 'below_70'

In [49]:
obj_data['host_response_rate'] = obj_data['host_response_rate'].str.replace('%','').astype('int')
obj_data['host_response_rate_cat'] = obj_data['host_response_rate'].apply(response_cat)
obj_data['host_response_rate_cat'].value_counts()

host_response_rate_cat = pd.DataFrame(ohe.fit_transform(obj_data[['host_response_rate_cat']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, host_response_rate_cat], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data['host_response_rate'] = obj_data['host_response_rate'].str.replace('%','').astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data['host_response_rate_cat'] = obj_data['host_response_rate'].apply(response_cat)


In [50]:
# host_acceptance_rate
def acceptance_cat(acceptance_rate):
    if acceptance_rate == 100:
        return '100'
    elif acceptance_rate >= 95 and acceptance_rate <100:
        return 'over_95'
    elif acceptance_rate >= 90 and acceptance_rate <95:
        return 'over_90'
    elif acceptance_rate >= 70 and acceptance_rate <90:
        return 'over_70'
    else:
        return 'below_70'

obj_data['host_acceptance_rate'] = obj_data['host_acceptance_rate'].str.replace('%','').astype('int')
obj_data['host_acceptance_rate_cat'] = obj_data['host_acceptance_rate'].apply(acceptance_cat)
obj_data['host_acceptance_rate_cat'].value_counts()

host_acceptance_rate_cat = pd.DataFrame(ohe.fit_transform(obj_data[['host_acceptance_rate_cat']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, host_acceptance_rate_cat], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data['host_acceptance_rate'] = obj_data['host_acceptance_rate'].str.replace('%','').astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data['host_acceptance_rate_cat'] = obj_data['host_acceptance_rate'].apply(acceptance_cat)


In [51]:
# host_is_superhost

host_is_superhost = pd.DataFrame(ohe.fit_transform(obj_data[['host_is_superhost']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, host_is_superhost], axis = 1)

In [52]:
ads

Unnamed: 0,host_response_time_a few days or more,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour,host_response_rate_cat_100,host_response_rate_cat_below_70,host_response_rate_cat_over_70,host_response_rate_cat_over_90,host_response_rate_cat_over_95,host_acceptance_rate_cat_100,host_acceptance_rate_cat_below_70,host_acceptance_rate_cat_over_70,host_acceptance_rate_cat_over_90,host_acceptance_rate_cat_over_95,host_is_superhost_f,host_is_superhost_t
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1028,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1029,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1030,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [53]:
# host_verifications

host_verifications = obj_data['host_verifications'].apply(lambda x: len(x.replace('[','').replace(']','').split(',')))
host_verifications = host_verifications.reset_index(drop = True)
ads = pd.concat([ads, host_verifications], axis = 1)

In [54]:
# host_has_profile_pic

host_has_profile_pic = pd.DataFrame(ohe.fit_transform(obj_data[['host_has_profile_pic']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, host_has_profile_pic], axis = 1)

In [55]:
# host_identity_verified

host_identity_verified = pd.DataFrame(ohe.fit_transform(obj_data[['host_identity_verified']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, host_identity_verified], axis = 1)

In [56]:
# property_type

property_type = pd.DataFrame(ohe.fit_transform(obj_data[['property_type']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, property_type], axis = 1)

In [57]:
# room_type

room_type = pd.DataFrame(ohe.fit_transform(obj_data[['room_type']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, room_type], axis = 1)

In [58]:
bathroom_count = obj_data['bathrooms_text'].apply(lambda x: x.replace('Shared half-bath', '0.5 shared').replace('Half-bath', '0.5 bath').split(' ')[0]).astype('float')
bathroom_count = bathroom_count.reset_index(drop = True)
bathroom_count = bathroom_count.rename('bathroom_count')

In [59]:
ads = pd.concat([ads, bathroom_count], axis = 1)

In [60]:
obj_data['bathroom_shared_flag'] = obj_data['bathrooms_text'].apply(lambda x: 't' if 'shared' in x else 'f')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obj_data['bathroom_shared_flag'] = obj_data['bathrooms_text'].apply(lambda x: 't' if 'shared' in x else 'f')


In [61]:
bathroom_shared = pd.DataFrame(ohe.fit_transform(obj_data[['bathroom_shared_flag']]).toarray(),columns=ohe.get_feature_names_out())
ads = pd.concat([ads, bathroom_shared], axis=1)

In [62]:
# amenities

amenities = obj_data['amenities'].apply(lambda x: len(x.replace('[','').replace(']','').split(',')))
amenities = amenities.reset_index(drop = True)
ads = pd.concat([ads,amenities], axis = 1)

In [63]:
# instant_bookable

instant_bookable = pd.DataFrame(ohe.fit_transform(obj_data[['instant_bookable']]).toarray(), columns=ohe.get_feature_names_out())
ads = pd.concat([ads,instant_bookable], axis = 1)

In [64]:
ads = pd.concat([ads, numerical_data], axis=1)

In [65]:
price = obj_data['price'].str.replace('$','').str.replace(',','').astype('float').astype('int').reset_index(drop=True)
ads = pd.concat([ads, price], axis = 1)

  price = obj_data['price'].str.replace('$','').str.replace(',','').astype('float').astype('int').reset_index(drop=True)


In [67]:
ads.head()

Unnamed: 0,host_response_time_a few days or more,host_response_time_within a day,host_response_time_within a few hours,host_response_time_within an hour,host_response_rate_cat_100,host_response_rate_cat_below_70,host_response_rate_cat_over_70,host_response_rate_cat_over_90,host_response_rate_cat_over_95,host_acceptance_rate_cat_100,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,price
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.68,4.95,4.94,4.75,4.77,5,0,5,0,41
1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,4.98,5.0,4.99,4.97,4.88,1,0,1,0,96
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.42,5.0,4.93,4.74,4.7,1,0,1,0,32
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,4.72,4.93,4.76,4.79,4.62,3,1,2,0,38
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,4.86,4.96,4.96,4.83,4.73,1,1,0,0,571


In [69]:
ads.isna().sum().sum()

0

### Training the model

In [89]:
X = ads.loc[:, ads.columns!='price']
y = ads.loc[:, 'price']

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [92]:
lr = LinearRegression()

In [94]:
model = lr.fit(X_train, y_train)

In [95]:
pred = model.predict(X_test)

In [97]:
r2 = r2_score(y_test, pred)
mape = mean_absolute_percentage_error(y_test, pred)

In [98]:
r2, mape

(0.958054888036247, 0.45936808828116776)