In [423]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt


import ydata_profiling as dp

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from skmultilearn.problem_transform import BinaryRelevance

from sklearn.metrics import accuracy_score

from fancyimpute import IterativeImputer


# Removing warnings

import warnings
warnings.filterwarnings('ignore')

In [424]:
features = pd.read_csv("train_features.csv")
labels = pd.read_csv("train_label.csv")

In [425]:
features.head()

Unnamed: 0,facilities,rating,location
0,RestaurantBARSwimmingPools,7.8 Very GoodFrom 10 reviews,Stokol
1,intrnetRestaurantgym,5.6 GoodFrom 4 reviews,Machlessvile
2,restaurantgympoolBar,7.2 Very GoodFrom 38 reviews,Wanderland
3,BARRestaurant,7.3 Very GoodFrom 6 reviews,Uberlandia
4,InternetRestaurant,7.2 Very GoodFrom 30 reviews,Stokol


In [426]:
labels.head()

Unnamed: 0,Price
0,"13,500avg/night"
1,"13,000avg/night"
2,"19,000avg/night"
3,"6,000avg/night"
4,"20,000avg/night"


In [427]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066 entries, 0 to 3065
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   facilities  2765 non-null   object
 1   rating      2429 non-null   object
 2   location    3066 non-null   object
dtypes: object(3)
memory usage: 72.0+ KB


In [428]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066 entries, 0 to 3065
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   3066 non-null   object
dtypes: object(1)
memory usage: 24.1+ KB


In [429]:
# Convert to lowercase
features['facilities'] = features['facilities'].str.lower()

# Define facility patterns to extract
facility_patterns = ['restaurant', 'bar', 'swimmingpools', 'internet', 'gym', 'pool']

# Create a pattern for matching any facility pattern
facility_pattern = '|'.join(facility_patterns)

# Extract all facilities using the pattern
features['facilities'] = features['facilities'].str.extractall(f'({facility_pattern})')[0].groupby(level=0).apply(list)

In [430]:
features.head()

Unnamed: 0,facilities,rating,location
0,"[restaurant, bar, swimmingpools]",7.8 Very GoodFrom 10 reviews,Stokol
1,"[restaurant, gym]",5.6 GoodFrom 4 reviews,Machlessvile
2,"[restaurant, gym, pool, bar]",7.2 Very GoodFrom 38 reviews,Wanderland
3,"[bar, restaurant]",7.3 Very GoodFrom 6 reviews,Uberlandia
4,"[internet, restaurant]",7.2 Very GoodFrom 30 reviews,Stokol


In [431]:
features['facilities'] = features['facilities'].fillna('[unknown]')
features['facilities'].head(50)

0               [restaurant, bar, swimmingpools]
1                              [restaurant, gym]
2                   [restaurant, gym, pool, bar]
3                              [bar, restaurant]
4                         [internet, restaurant]
5                                     [internet]
6                                          [bar]
7                              [bar, restaurant]
8                              [bar, restaurant]
9                              [bar, restaurant]
10        [restaurant, gym, internet, bar, pool]
11                   [bar, internet, restaurant]
12                             [bar, restaurant]
13                        [internet, restaurant]
14                             [bar, restaurant]
15                                     [unknown]
16                                     [unknown]
17                                         [bar]
18                             [bar, restaurant]
19             [restaurant, pool, bar, internet]
20                  

In [432]:
import re
# Extract numerical rating using regular expression with str.extract()
features['rating'] = features['rating'].str.extract(r'(\d+\.\d+)').astype(float)


In [433]:
features['rating'].fillna(features['rating'].median(), inplace=True)

In [434]:
features['rating'].describe()

count    3066.000000
mean        7.309589
std         1.208299
min         2.000000
25%         6.900000
50%         7.400000
75%         8.000000
max        10.000000
Name: rating, dtype: float64

In [435]:
features['rating'].head(50)

0      7.8
1      5.6
2      7.2
3      7.3
4      7.2
5      5.4
6      7.9
7      7.3
8      7.4
9      7.7
10     6.9
11     9.6
12     7.4
13     7.4
14     7.4
15     5.4
16     7.4
17     7.4
18     9.6
19     2.4
20     7.8
21     9.6
22     7.8
23     7.2
24     8.1
25     7.4
26     8.8
27     8.0
28     7.4
29     6.0
30     7.4
31     7.2
32     7.4
33     8.0
34     7.4
35     8.8
36     8.1
37     7.2
38     7.7
39     7.9
40     6.7
41     7.4
42     7.4
43     5.6
44     7.4
45    10.0
46     8.6
47     7.0
48     6.4
49     5.6
Name: rating, dtype: float64

In [436]:
label_encoder = LabelEncoder()

# Perform label encoding on 'location' column
features['location'] = label_encoder.fit_transform(features['location'])

In [437]:
features['location']

0       3
1       2
2       6
3       4
4       3
       ..
3061    0
3062    4
3063    7
3064    1
3065    0
Name: location, Length: 3066, dtype: int64

In [438]:
facility_dummies = pd.get_dummies(features['facilities'].apply(pd.Series).stack()).sum(level=0)

In [439]:
features_encoded = pd.concat([features, facility_dummies], axis=1)
features_encoded = features_encoded.drop('facilities', axis=1)

In [440]:
features_encoded.head()

Unnamed: 0,rating,location,[unknown],bar,gym,internet,pool,restaurant,swimmingpools
0,7.8,3,0,1,0,0,0,1,1
1,5.6,2,0,0,1,0,0,1,0
2,7.2,6,0,1,1,0,1,1,0
3,7.3,4,0,1,0,0,0,1,0
4,7.2,3,0,0,0,1,0,1,0


In [441]:
labels['Price'] = labels['Price'].str.split('avg/night', expand=True)[0].str.replace(',', '').astype(int)
labels

Unnamed: 0,Price
0,13500
1,13000
2,19000
3,6000
4,20000
...,...
3061,31625
3062,30500
3063,14000
3064,8500


In [442]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features_encoded, labels, test_size=0.2, random_state=42)

In [443]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

In [444]:
# Create and fit the model
modelLR = LinearRegression()
modelLR.fit(X_train, y_train)

# Make predictions on the test set
y_pred = modelLR.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 10010.90152341883


In [445]:
# Create and fit the model
modelDT = DecisionTreeRegressor()
modelDT.fit(X_train, y_train)

# Make predictions on the test set
y_pred = modelDT.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 7908.91765391159


In [446]:
# Create and fit the model
modelRF = RandomForestRegressor()
modelRF.fit(X_train, y_train)

# Make predictions on the test set
y_pred = modelRF.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 7900.018768773712


In [447]:
# Create and fit the model
modelGB = GradientBoostingRegressor()
modelGB.fit(X_train, y_train)

# Make predictions on the test set
y_pred = modelGB.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 8869.853989796176


In [448]:
new_data = pd.read_csv('test_feature.csv')

In [449]:
new_data.head(10)

Unnamed: 0,ID,facilities,rating,location
0,0,GymrestaurantbarInternetSwimmingPools,8.0 ExcellentFrom 1 reviews,Stokol
1,1,Poolrestaurantgyminternetbar,7.4 Very GoodFrom 22 reviews,Hallerson
2,2,BARSwimmingPoolsInternetgym,0.0 FairFrom 4 reviews,Hallerson
3,3,gymSwimmingPoolsBARintrnetRestaurant,6.8 Very GoodFrom 13 reviews,Andeman
4,4,gymRestaurantpoolbarintrnet,0.0 FairFrom 9 reviews,Hallerson
5,5,swimmingpoolsbarInternetrestaurantgym,0.0 FairFrom 13 reviews,Andeman
6,6,intrnetPoolGymbarRestaurant,7.1 Very GoodFrom 23 reviews,Wanderland
7,7,RestaurantPoolBarintrnetgym,5.1 GoodFrom 5 reviews,Stokol
8,8,Restaurantswimmingpoolsgymbarintrnet,5.8 GoodFrom 13 reviews,Machlessvile
9,9,GymrestaurantintrnetSwimmingPoolsbar,7.3 Very GoodFrom 10 reviews,Willsmian


In [450]:
# Convert to lowercase
new_data['facilities'] = new_data['facilities'].str.lower()

# Define facility patterns to extract
facility_patterns = ['restaurant', 'bar', 'swimmingpools', 'internet', 'gym', 'pool']

# Create a pattern for matching any facility pattern
facility_pattern = '|'.join(facility_patterns)

# Extract all facilities using the pattern
new_data['facilities'] = new_data['facilities'].str.extractall(f'({facility_pattern})')[0].groupby(level=0).apply(list)

In [451]:
ID = new_data['ID']

In [452]:
new_data = new_data.drop('ID', axis=1)

In [453]:
new_data['[unknown]'] = 0

In [454]:
new_data['facilities'] = new_data['facilities'].fillna('[unknown]')

In [455]:
new_data['rating'] = new_data['rating'].str.extract(r'(\d+\.\d+)').astype(float)

In [456]:
new_data['rating'].fillna(new_data['rating'].median(), inplace=True)

In [457]:
new_data['location'] = label_encoder.fit_transform(new_data['location'])

In [458]:
newdata_dummies = pd.get_dummies(new_data['facilities'].apply(pd.Series).stack()).sum(level=0)

In [459]:
new_data_encoded = pd.concat([new_data, newdata_dummies], axis=1)
new_data_encoded = new_data_encoded.drop('facilities', axis=1)

In [460]:
new_data_encoded.head(10)

Unnamed: 0,rating,location,[unknown],bar,gym,internet,pool,restaurant,swimmingpools
0,8.0,3,0,1,1,1,0,1,1
1,7.4,1,0,1,1,1,1,1,0
2,0.0,1,0,1,1,1,0,0,1
3,6.8,0,0,1,1,0,0,1,1
4,0.0,1,0,1,1,0,1,1,0
5,0.0,0,0,1,1,1,0,1,1
6,7.1,6,0,1,1,0,1,1,0
7,5.1,3,0,1,1,0,1,1,0
8,5.8,2,0,1,1,0,0,1,1
9,7.3,7,0,1,1,0,0,1,1


In [470]:
predictions = modelDT.predict(new_data_encoded)

In [471]:
# Create a new DataFrame with the predictions
predictions = predictions.astype(int)

predictions_df = pd.DataFrame({'Price': predictions.flatten()})
predictions_df = pd.concat([ID, predictions_df], axis=1)

In [472]:
# Save the predictions to a new CSV file
predictions_df.to_csv('predictions.csv', index=False)