In [1]:
# Import all the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

### 1. Load and explore data 💾

#### Column information
- `id`: Identificador único de la propiedad.
- `log_price`: Logaritmo del precio de alquiler de la propiedad. VARIABLE A PREDECIR
- `property_type`: Tipo de propiedad (ej. apartamento, casa, etc.).
- `room_type`: Tipo de habitación (ej. habitación privada, casa/apartamento completo, etc.).
- `amenities`: Comodidades ofrecidas en la propiedad.
- `accommodates`: Número máximo de huéspedes que la propiedad puede acomodar.
- `bathrooms`: Número de baños en la propiedad.
- `bed_type`: Tipo de cama (ej. cama matrimonial, sofá cama, etc.).
- `cancellation_policy`: Política de cancelación para reservas.
- `cleaning_fee`: Si se cobra una tarifa de limpieza (True/False).
- `city`: Ciudad donde se encuentra la propiedad.
- `description`: Descripción de la propiedad.
- `first_review`: Fecha de la primera reseña.
- `host_has_profile_pic`: Si el anfitrión tiene una foto de perfil (True/False).
- `host_identity_verified`: Si la identidad del anfitrión está verificada (True/False).
- `host_response_rate`: Tasa de respuesta del anfitrión.
- `host_since`: Fecha en que el anfitrión se unió a Airbnb.
- `instant_bookable`: Si la propiedad puede ser reservada instantáneamente (True/False).
- `last_review`: Fecha de la última reseña.
- `latitude`: Latitud geográfica de la propiedad.
- `longitude`: Longitud geográfica de la propiedad.
- `name`: Nombre de la propiedad.
- `neighbourhood`: Barrio donde se encuentra la propiedad.
- `number_of_reviews`: Número total de reseñas de la propiedad.
- `review_scores_rating`: Puntuación general de las reseñas.
- `thumbnail_url`: URL de la miniatura de la propiedad.
- `zipcode`: Código postal de la propiedad.
- `bedrooms`: Número de dormitorios en la propiedad.
- `beds`: Número de camas en la propiedad.

In [2]:
# Read the training and test sets from the CSV files in the working directory
# (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Compare the (rows, columns) shape of the test frame and the training frame.
test_data.shape, train_data.shape

((14823, 28), (59288, 29))

In [4]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59288 entries, 0 to 59287
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      59288 non-null  int64  
 1   log_price               59288 non-null  float64
 2   property_type           59288 non-null  object 
 3   room_type               59288 non-null  object 
 4   amenities               59288 non-null  object 
 5   accommodates            59288 non-null  int64  
 6   bathrooms               59139 non-null  float64
 7   bed_type                59288 non-null  object 
 8   cancellation_policy     59288 non-null  object 
 9   cleaning_fee            59288 non-null  bool   
 10  city                    59288 non-null  object 
 11  description             59288 non-null  object 
 12  first_review            46601 non-null  object 
 13  host_has_profile_pic    59148 non-null  object 
 14  host_identity_verified  59148 non-null

### 2. Null handling

In [5]:
# Count the missing values in every training column.
train_data.isna().sum()

id                            0
log_price                     0
property_type                 0
room_type                     0
amenities                     0
accommodates                  0
bathrooms                   149
bed_type                      0
cancellation_policy           0
cleaning_fee                  0
city                          0
description                   0
first_review              12687
host_has_profile_pic        140
host_identity_verified      140
host_response_rate        14628
host_since                  140
instant_bookable              0
last_review               12660
latitude                      0
longitude                     0
name                          0
neighbourhood              5523
number_of_reviews             0
review_scores_rating      13366
thumbnail_url              6622
zipcode                     755
bedrooms                     77
beds                        100
dtype: int64

In [6]:
# Drop columns with a high number of missing values or those not relevant for now.
# errors='ignore' keeps this cell re-runnable: without it, a second execution raises
# KeyError because the columns have already been removed from train_data.
columns_to_drop = ['first_review', 'host_response_rate', 'last_review',
                   'review_scores_rating', 'thumbnail_url', 'neighbourhood',
                   'zipcode', 'id', 'description', 'host_since', 'name']
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')

# Discard every remaining row that still has a missing value. Reassignment is used
# instead of inplace=True; the original row index is kept (not reset).
train_data = train_data.dropna()

### 3. Encoding of categorical variables and scaling of numerical features

In [7]:
# Columns that are standardised before modelling.
numeric_features = [
    'accommodates',
    'bathrooms',
    'latitude',
    'longitude',
    'bedrooms',
    'beds',
]

# Columns that are one-hot encoded before modelling.
categorical_features = [
    'property_type',
    'room_type',
    'bed_type',
    'cancellation_policy',
    'city',
]

In [8]:
# Nothing to drop here: the columns with too many unique values (thumbnail_url, zipcode, id,
# description, host_since, name) are already removed in the null-handling step, and
# latitude / longitude are kept because they are listed in numeric_features.

### 4. Create and apply transformers

In [9]:
# Per-type preprocessing: standardise the numeric columns, one-hot encode the categorical ones.
# handle_unknown='ignore' lets the encoder accept categories at predict time that it did
# not see during fit.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns (selected by name) to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [10]:
# Peek at the raw amenities values before deriving a feature from them.
train_data['amenities'].head()

0    {TV,Internet,"Wireless Internet","Air conditio...
1    {Internet,"Wireless Internet","Air conditionin...
2    {"Wireless Internet","Air conditioning","Wheel...
4    {TV,"Wireless Internet","Air conditioning",Kit...
5    {TV,"Cable TV",Internet,"Wireless Internet","A...
Name: amenities, dtype: object

In [11]:
def _count_amenities(raw_amenities):
    """Return how many comma-separated entries a '{a,"b c",d}' amenities string holds.

    An empty set ('{}') yields 0. A plain ``len(x.split(','))`` would report 1 for it,
    because splitting an empty string still returns one (empty) element.
    """
    entries = raw_amenities.strip('{}')
    if not entries:
        return 0
    return len(entries.split(','))


# Create a new column called 'num_amenities' based on the existing 'amenities' column.
# NOTE(review): 'num_amenities' is not listed in numeric_features, so the preprocessor does
# not appear to pass it to the model - confirm whether it should be added (the test data
# would then need the same column).
train_data['num_amenities'] = train_data['amenities'].apply(_count_amenities)

In [12]:
# Separate the target from the features, then hold out 20% of the rows for validation.
# The fixed random_state makes the split reproducible.
y = train_data['log_price']
X = train_data.drop(columns=['log_price'])
X_train, X_val, y_train, y_val = tts(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Regression model: a 100-tree random forest with a fixed seed.
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

In [14]:
# Single estimator that runs the preprocessing step and then the regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [15]:
# Train the model: fit the preprocessing step and the regressor on the training split.
pipeline.fit(X_train, y_train)

In [16]:
# Evaluate the model on the held-out validation split.
# Predictions for the validation set
y_pred = pipeline.predict(X_val)

# Root mean squared error between the true and predicted log_price values
validation_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(validation_mse)
print(f'RMSE for the validation set: {rmse}')

RMSE for the validation set: 0.4195058870725698


In [17]:
# Inspect the validation predictions (the model is fit on log_price, so these are log prices).
y_pred

array([4.65206873, 4.50536329, 4.10091525, ..., 3.97938298, 4.0204029 ,
       4.92086066])

### Test Data Preprocessing

In [18]:
# Impute the missing values of the test set.
#
# The previous version called `test_data[col].fillna(..., inplace=True)`. That is chained
# assignment on a column selection: pandas 2.x warns about it and with Copy-on-Write enabled
# it no longer modifies test_data at all. All fills are therefore applied through a single
# DataFrame.fillna call whose result is assigned back.
#
# NOTE(review): of the imputed columns only bathrooms, bedrooms and beds are listed in
# numeric_features; the remaining ones are filled only so that no nulls are left.

# Transform host response rate to float and remove the %.
# Guarded on dtype so re-running the cell does not fail on the already-converted column.
if not pd.api.types.is_numeric_dtype(test_data['host_response_rate']):
    test_data['host_response_rate'] = (
        test_data['host_response_rate'].str.rstrip('%').astype('float') / 100.0
    )

test_fill_values = {
    # Numeric room counts: median
    'bathrooms': test_data['bathrooms'].median(),
    'bedrooms': test_data['bedrooms'].median(),
    'beds': test_data['beds'].median(),
    # Host response rate and review score: mean
    'host_response_rate': test_data['host_response_rate'].mean(),
    'review_scores_rating': test_data['review_scores_rating'].mean(),
    # Dates without information get a sentinel value
    'first_review': '1900-01-01',
    'last_review': '1900-01-01',
    # Host flags and host_since: most frequent value
    'host_has_profile_pic': test_data['host_has_profile_pic'].mode()[0],
    'host_identity_verified': test_data['host_identity_verified'].mode()[0],
    'host_since': test_data['host_since'].mode()[0],
}
test_data = test_data.fillna(value=test_fill_values)

# Drop neighbourhood, thumbnail URL and Zipcode.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
test_data = test_data.drop(columns=['neighbourhood', 'thumbnail_url', 'zipcode'],
                           errors='ignore')

# Check if there is any nulls left
test_data.isnull().sum()

id                        0
property_type             0
room_type                 0
amenities                 0
accommodates              0
bathrooms                 0
bed_type                  0
cancellation_policy       0
cleaning_fee              0
city                      0
description               0
first_review              0
host_has_profile_pic      0
host_identity_verified    0
host_response_rate        0
host_since                0
instant_bookable          0
last_review               0
latitude                  0
longitude                 0
name                      0
number_of_reviews         0
review_scores_rating      0
bedrooms                  0
beds                      0
dtype: int64

In [19]:
# Predictions for the test data using the fitted pipeline.
# NOTE(review): test_data still carries columns that are not in the feature lists; this
# presumably works because the preprocessor selects its columns by name - confirm.
test_pred = pipeline.predict(test_data)

In [20]:
# Number of predictions; the submission requires 14823 rows.
test_pred.shape[0]

14823

In [21]:
# Save the predicted log_price of test_data as a DataFrame keyed by the listing id.
# The ids are taken from the test file's own 'id' column; the previous version used
# range(len(test_data)), i.e. row positions 0..N-1, rather than the property identifiers.
# NOTE(review): confirm the expected id format against the competition's sample submission.
submission = pd.DataFrame({
    'id': test_data['id'].to_numpy(),
    'log_price': test_pred,
})

# Export the DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)