In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
###! pip install scikit-learn

In [3]:
###! pip install klib

In [4]:
import klib

In [5]:
import pandas as pd

In [6]:
dataframe = pd.read_csv("listings (1).csv", encoding='utf-8')

In [7]:
dataframe.head(2)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,329172,"Hillside designer home,10 min.dwntn",1680871,Janet,,78746,30.30085,-97.80794,Entire home/apt,495,3,7,2022-08-07,0.05,1,363,1,
1,329306,"Urban Homestead, 5 minutes to downtown",880571,Angel,,78702,30.27232,-97.72579,Private room,63,2,570,2022-11-30,4.36,5,55,45,


In [8]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13808 entries, 0 to 13807
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              13808 non-null  int64  
 1   name                            13808 non-null  object 
 2   host_id                         13808 non-null  int64  
 3   host_name                       13806 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   13808 non-null  int64  
 6   latitude                        13808 non-null  float64
 7   longitude                       13808 non-null  float64
 8   room_type                       13808 non-null  object 
 9   price                           13808 non-null  int64  
 10  minimum_nights                  13808 non-null  int64  
 11  number_of_reviews               13808 non-null  int64  
 12  last_review                     

In [9]:
dataframe.isnull().sum()

id                                    0
name                                  0
host_id                               0
host_name                             2
neighbourhood_group               13808
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                        2860
reviews_per_month                  2860
calculated_host_listings_count        0
availability_365                      0
number_of_reviews_ltm                 0
license                           13808
dtype: int64

In [10]:
dataframe.drop(columns = {"neighbourhood_group"}, inplace=True)
dataframe.drop(columns = {"last_review"}, inplace=True)

In [11]:
dataframe.drop(columns = {"reviews_per_month"}, inplace=True)
dataframe.drop(columns = {"license"}, inplace=True)

In [12]:
print(dataframe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13808 entries, 0 to 13807
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              13808 non-null  int64  
 1   name                            13808 non-null  object 
 2   host_id                         13808 non-null  int64  
 3   host_name                       13806 non-null  object 
 4   neighbourhood                   13808 non-null  int64  
 5   latitude                        13808 non-null  float64
 6   longitude                       13808 non-null  float64
 7   room_type                       13808 non-null  object 
 8   price                           13808 non-null  int64  
 9   minimum_nights                  13808 non-null  int64  
 10  number_of_reviews               13808 non-null  int64  
 11  calculated_host_listings_count  13808 non-null  int64  
 12  availability_365                

In [13]:
dataframe = dataframe.fillna(method ='pad')

In [14]:
dataframe.isnull().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
calculated_host_listings_count    0
availability_365                  0
number_of_reviews_ltm             0
dtype: int64

In [15]:
dataframe = klib.convert_datatypes(dataframe) 

In [16]:
print(dataframe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13808 entries, 0 to 13807
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   id                              13808 non-null  int64   
 1   name                            13808 non-null  string  
 2   host_id                         13808 non-null  int32   
 3   host_name                       13808 non-null  string  
 4   neighbourhood                   13808 non-null  int32   
 5   latitude                        13808 non-null  float32 
 6   longitude                       13808 non-null  float32 
 7   room_type                       13808 non-null  category
 8   price                           13808 non-null  int32   
 9   minimum_nights                  13808 non-null  int16   
 10  number_of_reviews               13808 non-null  int16   
 11  calculated_host_listings_count  13808 non-null  int16   
 12  availability_365  

In [17]:
dataframe['room_type'] = dataframe['room_type'].astype('category').cat.codes

In [18]:
dataframe['host_name'] = dataframe['host_name'].astype('category').cat.codes
dataframe['name'] = dataframe['name'].astype('category').cat.codes

In [19]:
from sklearn.linear_model import Lasso

from sklearn.feature_selection import SelectFromModel

In [20]:
from sklearn.model_selection import train_test_split

training_data, testing_data = train_test_split(dataframe, test_size=0.2, random_state=25)

In [21]:
print(training_data.shape, testing_data.shape)

(11046, 14) (2762, 14)


In [22]:
from sklearn.linear_model import Lasso

from sklearn.feature_selection import SelectFromModel

In [23]:
model = SelectFromModel(Lasso(alpha=0.005,random_state=0))

In [24]:
X_train = training_data.drop(columns = "price")
X_test = testing_data.drop(columns = "price")

In [25]:
y_train = training_data["price"]
y_test = testing_data["price"]

In [26]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(11046, 13) (11046,)
(2762, 13) (2762,)


In [27]:
model.fit(X_train, y_train)

  positive)


SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [28]:
model.get_support()

array([False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [29]:
selected_features = X_train.columns[(model.get_support())]

In [30]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [31]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(11046, 11) (11046,)
(2762, 11) (2762,)


In [33]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [35]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [36]:
print(X_train.columns,y_train.columns)
print(X_test.columns,y_test.columns)

Index(['name', 'host_name', 'neighbourhood', 'latitude', 'longitude',
       'room_type', 'minimum_nights', 'number_of_reviews',
       'calculated_host_listings_count', 'availability_365',
       'number_of_reviews_ltm'],
      dtype='object') Index(['price'], dtype='object')
Index(['name', 'host_name', 'neighbourhood', 'latitude', 'longitude',
       'room_type', 'minimum_nights', 'number_of_reviews',
       'calculated_host_listings_count', 'availability_365',
       'number_of_reviews_ltm'],
      dtype='object') Index(['price'], dtype='object')


In [38]:
model_rf = RandomForestRegressor()

In [39]:
model_rf.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor()

In [41]:
y_pred_rf = model_rf.predict(X_test)

In [42]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [43]:
print("Mean Absolute Error",mean_absolute_error(y_test,y_pred_rf))
print("Mean Squared Error",mean_squared_error(y_test,y_pred_rf))
print("Mean Absolute Percentage Error",mean_absolute_percentage_error(y_test,y_pred_rf))

Mean Absolute Error 187.66913468501085
Mean Squared Error 311009.59030199127
Mean Absolute Percentage Error 1772839873589735.2


In [45]:
from sklearn.ensemble import GradientBoostingRegressor

In [46]:
model_gb = GradientBoostingRegressor()

In [47]:
model_gb.fit(X_train, y_train)

  return f(*args, **kwargs)


GradientBoostingRegressor()

In [48]:
y_pred_gb = model_gb.predict(X_test)

In [49]:
print("Mean Absolute Error",mean_absolute_error(y_test,y_pred_gb))
print("Mean Squared Error",mean_squared_error(y_test,y_pred_gb))
print("Mean Absolute Percentage Error",mean_absolute_percentage_error(y_test,y_pred_gb))

Mean Absolute Error 218.98877277115972
Mean Squared Error 1254432.0378269115
Mean Absolute Percentage Error 813975003452467.9
