In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor
from sklearn.metrics import accuracy_score , precision_score , recall_score , classification_report
from sklearn.svm import SVR
from scipy.stats import randint

In [None]:
# Load the raw listings CSV. `on_bad_lines="skip"` drops malformed rows
# instead of aborting the read.
# `low_memory=False` makes pandas infer every column's dtype from the whole
# file rather than chunk by chunk. The recorded output of this cell ends with a
# warning raised on the read_csv line -- presumably the mixed-dtype
# DtypeWarning (TODO confirm) -- which this setting avoids.
DATA_PATH = "/content/Airbnb_Open_Data.csv"  # Colab upload location
data = pd.read_csv(DATA_PATH, on_bad_lines="skip", low_memory=False)

# Quick structural overview: size, column names, dtypes / non-null counts.
n_rows, n_cols = data.shape
print("rows :" , n_rows)
print("columns :" , n_cols)
print("columns : ")
print(list(data.columns))
print("-"*65)
data.info()

rows : 102599
columns : 26
columns : 
['id', 'NAME', 'host id', 'host_identity_verified', 'host name', 'neighbourhood group', 'neighbourhood', 'lat', 'long', 'country', 'country code', 'instant_bookable', 'cancellation_policy', 'room type', 'Construction year', 'price', 'service fee', 'minimum nights', 'number of reviews', 'last review', 'reviews per month', 'review rate number', 'calculated host listings count', 'availability 365', 'house_rules', 'license']
-----------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102599 non-null  int64  
 1   NAME                            102349 non-null  object 
 2   host id                         102599 non-null  int64  
 3   host_identity_verified          102310 non-nu

  data = pd.read_csv("/content/Airbnb_Open_Data.csv", on_bad_lines="skip")


In [None]:
# Show the full column list, then remove the columns that are not used as
# model inputs.
print(data.columns.tolist())
cols_to_drop = ["license" , "id" , "NAME" , "host id" , "host name", "neighbourhood" ]
# Rebind instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
# Trade-off: a misspelt name in `cols_to_drop` is silently ignored as well.
data = data.drop(columns=cols_to_drop, errors="ignore")
print("data's columns : " , data.shape[1])

['id', 'NAME', 'host id', 'host_identity_verified', 'host name', 'neighbourhood group', 'neighbourhood', 'lat', 'long', 'country', 'country code', 'instant_bookable', 'cancellation_policy', 'room type', 'Construction year', 'price', 'service fee', 'minimum nights', 'number of reviews', 'last review', 'reviews per month', 'review rate number', 'calculated host listings count', 'availability 365', 'house_rules', 'license']
data's columns :  20


In [None]:
# Every column that pandas stored with the generic `object` dtype.
obj_feats = [name for name in data.columns if data[name].dtype == "object"]

# Number of distinct values per object column (NaN counts as one value).
for name in obj_feats:
    print(f"{name} : {data[name].nunique(dropna=False)}")
print("-" * 55)
# The distinct values themselves.
for name in obj_feats:
    print(f"{name} : {data[name].unique()}")

host_identity_verified : 3
neighbourhood group : 8
country : 2
country code : 2
instant_bookable : 3
cancellation_policy : 4
room type : 4
price : 1152
service fee : 232
last review : 2478
house_rules : 1977
-------------------------------------------------------
host_identity_verified : ['unconfirmed' 'verified' nan]
neighbourhood group : ['Brooklyn' 'Manhattan' 'brookln' 'manhatan' 'Queens' nan 'Staten Island'
 'Bronx']
country : ['United States' nan]
country code : ['US' nan]
instant_bookable : [False True nan]
cancellation_policy : ['strict' 'moderate' 'flexible' nan]
room type : ['Private room' 'Entire home/apt' 'Shared room' 'Hotel room']
price : ['$966 ' '$142 ' '$620 ' ... '$1,184 ' '$905 ' '$309 ']
service fee : ['$193 ' '$28 ' '$124 ' '$74 ' '$41 ' '$115 ' '$14 ' '$212 ' '$204 '
 '$58 ' '$64 ' '$121 ' '$143 ' '$116 ' '$30 ' nan '$56 ' '$95 ' '$27 '
 '$210 ' '$163 ' '$235 ' '$106 ' '$55 ' '$42 ' '$86 ' '$133 ' '$154 '
 '$102 ' '$172 ' '$109 ' '$38 ' '$222 ' '$219 ' '$202 ' '$1

In [None]:
# Object columns that are removed rather than encoded.
obj_to_drop = ['last review' , 'house_rules' , 'service fee' , "country" , "country code"]
# Rebind with errors="ignore" so the cell can be re-run after the columns are
# already gone (the in-place drop raised KeyError on a second run).
data = data.drop(columns=obj_to_drop, errors="ignore")

In [None]:
# Categorical columns that are kept and encoded later on.
colsobject = [
    'cancellation_policy',
    "host_identity_verified",
    "instant_bookable",
    "neighbourhood group",
    "room type",
]

# Missing-value count for every listed column except the last ("room type").
for name in colsobject[:-1]:
    missing = data[name].isnull().sum()
    print(f"{name} : {missing}")

def dropnan(data , features):
  """Drop, in place, every row of `data` with a missing value in any of `features`.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame to filter; it is modified in place.
  features : iterable of column labels
      Columns that must be non-null for a row to be kept. An empty iterable
      leaves `data` untouched.

  Returns
  -------
  pandas.DataFrame
      The same `data` object, so the call can be chained or re-assigned.

  Raises
  ------
  KeyError
      If a label in `features` is not a column of `data`.
  """
  features = list(features)
  if features:
    # One pass over the frame instead of one pass per column; the set of
    # surviving rows is the same.
    data.dropna(subset=features, inplace=True)
  return data
# Remove rows with a missing categorical value, then re-print the counts
# (again skipping the last entry, "room type") to confirm none remain.
dropnan(data, colsobject)
print()
for name in colsobject[:-1]:
    remaining = data[name].isnull().sum()
    print(f"{name} : {remaining}")

cancellation_policy : 76
host_identity_verified : 289
instant_bookable : 105
neighbourhood group : 29

cancellation_policy : 0
host_identity_verified : 0
instant_bookable : 0
neighbourhood group : 0


In [None]:
# Null count per categorical column, this time including "room type".
null_counts = data[colsobject].isnull().sum()
for name in colsobject:
    print(f"{name} : {null_counts[name]}")
print()
# Remaining distinct values per categorical column.
for name in colsobject:
    print(f"{name} : {data[name].unique()}")

cancellation_policy : 0
host_identity_verified : 0
instant_bookable : 0
neighbourhood group : 0
room type : 0

cancellation_policy : ['strict' 'moderate' 'flexible']
host_identity_verified : ['unconfirmed' 'verified']
instant_bookable : [False True]
neighbourhood group : ['Brooklyn' 'Manhattan' 'brookln' 'Queens' 'Staten Island' 'Bronx']
room type : ['Private room' 'Entire home/apt' 'Shared room' 'Hotel room']


In [None]:
# Columns expanded into dummy variables vs. columns replaced by integer codes.
onehotenc = ['room type', 'neighbourhood group']
labelenc = ["cancellation_policy", "instant_bookable", "host_identity_verified"]

# One shared encoder instance; it is refit for every column it is applied to.
lab = LabelEncoder()

# Replace each construction year by its integer label code.
# NOTE(review): this column was not part of the earlier NaN clean-up, so any
# missing years would be coded by LabelEncoder rather than imputed -- confirm
# that is intended.
year_codes = lab.fit_transform(data['Construction year'])
data['Construction year'] = year_codes

def encode(data , features):
    """Integer-code each column in `features` using the module-level `lab` encoder.

    The encoder is refit on every column in turn. `data` is modified in place
    and the same object is returned.
    """
    for name in features:
        codes = lab.fit_transform(data[name])
        data[name] = codes
    return data

def hotencode(data, feature_cols):
    """Return a copy of `data` with each column in `feature_cols` one-hot encoded.

    Each listed column is removed and its dummy columns, named
    ``<column>_<value>``, are appended at the end in the order the columns were
    listed. The first level of every column is dropped (`drop_first=True`).
    The frame passed in is left unmodified.
    """
    return pd.get_dummies(data, columns=list(feature_cols), drop_first=True)

data = encode(data , labelenc)

# Repair the misspelt borough names present in the raw data ('brookln',
# 'manhatan') before one-hot encoding; otherwise each misspelling becomes its
# own dummy column (e.g. 'neighbourhood group_brookln').
borough_fixes = {"brookln": "Brooklyn", "manhatan": "Manhattan"}
data["neighbourhood group"] = data["neighbourhood group"].replace(borough_fixes)

data = hotencode(data, onehotenc)

# Cast boolean columns (the dummies) to 0/1 integers.
bool_cols = [col for col in data.columns if data[col].dtype == 'bool']
data[bool_cols] = data[bool_cols].astype(int)
print(data.shape)

(102193, 21)


In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102193 entries, 0 to 102598
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   host_identity_verified             102193 non-null  int64  
 1   lat                                102185 non-null  float64
 2   long                               102185 non-null  float64
 3   instant_bookable                   102193 non-null  int64  
 4   cancellation_policy                102193 non-null  int64  
 5   Construction year                  102193 non-null  int64  
 6   price                              101956 non-null  object 
 7   minimum nights                     101801 non-null  float64
 8   number of reviews                  102010 non-null  float64
 9   reviews per month                  86366 non-null   float64
 10  review rate number                 101895 non-null  float64
 11  calculated host listings count     101883 no

# part 2 : **handling** numbers

In [None]:
# 'price' is still stored as text at this point (e.g. "$966 "); look at the
# first few values before converting them.
print(data['price'].head(5))
def clean_price(x):
    """Convert a price such as ``"$1,184 "`` to a float.

    Dollar signs, thousands separators and surrounding whitespace are removed
    before conversion. Missing input -- ``None``, NaN, or a blank string --
    yields ``nan`` instead of raising.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    if x is None:
        return float("nan")
    text = str(x).replace('$', '').replace(',', '').strip()
    if not text:
        # Blank cell: treat as missing rather than failing in float("").
        return float("nan")
    # str(nan) is "nan", which float() parses back to NaN.
    return float(text)

# Convert every price string to a float, then show the same rows again.
data['price'] = data['price'].map(clean_price)
print()
print(data['price'].head(5))

0    $966 
1    $142 
3    $368 
4    $204 
5    $577 
Name: price, dtype: object

0    966.0
1    142.0
3    368.0
4    204.0
5    577.0
Name: price, dtype: float64


In [None]:
# Missing-value count for every remaining column.
for name, missing in data.isnull().sum().items():
    print(f"{name} : {missing}")

host_identity_verified : 0
lat : 8
long : 8
instant_bookable : 0
cancellation_policy : 0
Construction year : 0
price : 237
minimum nights : 392
number of reviews : 183
reviews per month : 15827
review rate number : 298
calculated host listings count : 310
availability 365 : 424
room type_Hotel room : 0
room type_Private room : 0
room type_Shared room : 0
neighbourhood group_Brooklyn : 0
neighbourhood group_Manhattan : 0
neighbourhood group_Queens : 0
neighbourhood group_Staten Island : 0
neighbourhood group_brookln : 0


In [None]:
# Remove the two columns that are not used further. Rebinding with
# errors="ignore" keeps the cell re-runnable (the in-place drop raised KeyError
# on a second run).
data = data.drop(columns=['reviews per month' , 'availability 365'], errors="ignore")

# 'price' is the value being predicted (it is assigned to `target` further
# down). Filling missing targets with the mean fabricates labels, so rows
# without a price are dropped instead and 'price' is no longer imputed.
data.dropna(subset=["price"], inplace=True)
cols_to_impute = ["lat" , "long" ,"minimum nights" , "number of reviews", "review rate number", "calculated host listings count"]

imputer = SimpleImputer(strategy='mean')
def impute(data , columns):
  """Fill missing values in `columns` with each column's mean.

  Uses the module-level `imputer`, fitted once on all requested columns. The
  mean is still computed per column, but the fitted imputer keeps the
  statistics of every column instead of only the last one processed.
  `data` is modified in place and returned.

  NOTE(review): the means come from the full data set, before the train/test
  split further down -- consider fitting on the training rows only.
  """
  columns = list(columns)
  if not columns:
    return data
  filled = np.asarray(imputer.fit_transform(data[columns]))
  for position, name in enumerate(columns):
    data[name] = filled[:, position]
  return data
data = impute(data , cols_to_impute)

In [None]:
# Re-check the missing-value counts once imputation has run.
print("after : \n")
for name, missing in data.isnull().sum().items():
    print(f"{name} : {missing}")

after : 

host_identity_verified : 0
lat : 0
long : 0
instant_bookable : 0
cancellation_policy : 0
Construction year : 0
price : 0
minimum nights : 0
number of reviews : 0
review rate number : 0
calculated host listings count : 0
room type_Hotel room : 0
room type_Private room : 0
room type_Shared room : 0
neighbourhood group_Brooklyn : 0
neighbourhood group_Manhattan : 0
neighbourhood group_Queens : 0
neighbourhood group_Staten Island : 0
neighbourhood group_brookln : 0


In [None]:
# The `data.head()` that used to open this cell was removed: it was not the
# last expression of the cell, so its result was discarded and never shown.
feats_toscale = ["calculated host listings count" ,"review rate number", "lat",
                 "number of reviews", "minimum nights", "Construction year", "long"]
scaler = StandardScaler()

def scale(data , features):
  """Standardise the columns in `features` and return `data`.

  Uses the module-level `scaler`, fitted once on all requested columns.
  Scaling is still done per column, but the fitted scaler keeps the statistics
  of every column instead of only the last one processed, so it can be reused
  to transform new rows. `data` is modified in place.

  NOTE(review): the scaler is fitted on the full data set, before the
  train/test split further down -- consider fitting on the training rows only.
  """
  features = list(features)
  if not features:
    return data
  scaled = np.asarray(scaler.fit_transform(data[features]))
  for position, name in enumerate(features):
    data[name] = scaled[:, position]
  return data
data = scale(data , feats_toscale)
data.head()

Unnamed: 0,host_identity_verified,lat,long,instant_bookable,cancellation_policy,Construction year,price,minimum nights,number of reviews,review rate number,calculated host listings count,room type_Hotel room,room type_Private room,room type_Shared room,neighbourhood group_Brooklyn,neighbourhood group_Manhattan,neighbourhood group_Queens,neighbourhood group_Staten Island,neighbourhood group_brookln
0,0,-1.443199,-0.459172,0,2,1.296607,966.0,0.061474,-0.372797,0.561987,-0.060465,0,1,0,1,0,0,0,0
1,1,0.457046,-0.6893,0,1,-0.953483,142.0,0.717922,0.357631,0.561987,-0.18471,0,0,0,0,1,0,0,0
3,0,-0.76908,-0.204618,1,1,-1.299651,368.0,0.717922,4.922805,0.561987,-0.215771,0,0,0,1,0,0,0,0
4,1,1.260796,0.113727,0,1,-0.607315,204.0,0.061474,-0.372797,-0.217683,-0.215771,0,0,0,0,1,0,0,0
5,1,0.350512,-0.512263,1,0,0.08502,577.0,-0.168283,0.946031,-0.217683,-0.215771,0,0,0,0,1,0,0,0


In [None]:
# Separate the value being predicted from the predictor columns.
target = 'price'
y = data[target]
X = data.drop(target, axis=1)

# **splitting data**




In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10, shuffle=True)
# The test target was originally bound to the misspelt name `y_tets`; keep that
# name as an alias so any later cell still using it continues to work.
y_tets = y_test