In [223]:
import pandas as pd
import numpy as np

train_airbnb_df = pd.read_csv("/Users/cha/Desktop/3A/code/ml-project/data/train_airbnb_berlin.csv")
test_airbnb_df = pd.read_csv("/Users/cha/Desktop/3A/code/ml-project/data/test_airbnb_berlin.csv")

# Deleting useless features
features = [
       'Listing ID',
       # 'Listing Name',
       'Host ID',
       # 'Host Name',
       'Host Since', # date -> données numériques a partir du plus récent
       # 'Host Response Time',
       # 'Host Response Rate',
       'Is Superhost',
       # 'neighbourhood',
       # 'Neighborhood Group',
       # 'City',  
       # 'Postal Code',
       'Latitude',
       'Longitude',
       # 'Country Code',
       # 'Country',
       'Is Exact Location',
       'Property Type',
       'Room Type',
       'Accomodates',
       'Bathrooms',
       'Bedrooms',
       'Beds',
       # 'Square Feet',
       'Guests Included',
       'Min Nights',
       'Reviews',
       # 'First Review',
       'Last Review', # -> Imputer les dates commes des chiffres a partir du plus récent
       'Overall Rating',
       'Accuracy Rating',
       'Cleanliness Rating',
       'Checkin Rating',
       'Communication Rating',
       'Location Rating',
       'Value Rating',
       'Instant Bookable',
       'Business Travel Ready',
       'Price'
]

train_airbnb_df = train_airbnb_df[features]
test_airbnb_df = test_airbnb_df[features[:-1]]

In [224]:
train_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15692 entries, 0 to 15691
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Listing ID             15691 non-null  float64
 1   Host ID                15692 non-null  float64
 2   Host Since             15671 non-null  object 
 3   Is Superhost           15669 non-null  object 
 4   Latitude               15692 non-null  float64
 5   Longitude              15692 non-null  float64
 6   Is Exact Location      15692 non-null  object 
 7   Property Type          15692 non-null  object 
 8   Room Type              15692 non-null  object 
 9   Accomodates            15692 non-null  object 
 10  Bathrooms              15678 non-null  object 
 11  Bedrooms               15687 non-null  object 
 12  Beds                   15684 non-null  object 
 13  Guests Included        15692 non-null  object 
 14  Min Nights             15692 non-null  object 
 15  Re

### Encoding categorical variables

OH for categorical variables with no hierarchy between values

In [225]:
OH_columns = [
    "Property Type",
    "Room Type",
] 

dummies = pd.get_dummies(train_airbnb_df[OH_columns])

# Drop the column with the independent and target variable (Salary), and columns for which we created dummy variables
train_airbnb_df = train_airbnb_df.drop(OH_columns, axis=1)

# Define the feature set X, our matrix data
train_airbnb_df = pd.concat([train_airbnb_df, dummies], axis=1)
print("Number of columns: ", len(train_airbnb_df.columns))
print(train_airbnb_df.columns)

Number of columns:  44
Index(['Listing ID', 'Host ID', 'Host Since', 'Is Superhost', 'Latitude',
       'Longitude', 'Is Exact Location', 'Accomodates', 'Bathrooms',
       'Bedrooms', 'Beds', 'Guests Included', 'Min Nights', 'Reviews',
       'Last Review', 'Overall Rating', 'Accuracy Rating',
       'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
       'Location Rating', 'Value Rating', 'Instant Bookable',
       'Business Travel Ready', 'Price', 'Property Type_*',
       'Property Type_Apartment', 'Property Type_Bed and breakfast',
       'Property Type_Boat', 'Property Type_Boutique hotel',
       'Property Type_Bungalow', 'Property Type_Condominium',
       'Property Type_Guest suite', 'Property Type_Guesthouse',
       'Property Type_Hostel', 'Property Type_Hotel', 'Property Type_House',
       'Property Type_Loft', 'Property Type_Other',
       'Property Type_Serviced apartment', 'Property Type_Townhouse',
       'Room Type_Entire home/apt', 'Room Type_Private r

Encoding for variables with hierarchy between values

In [226]:
encoding_columns = [
    "Is Superhost",
    "Instant Bookable",
    "Is Exact Location",
    "Business Travel Ready"
]
train_airbnb_df[encoding_columns] = train_airbnb_df[encoding_columns].replace("t", 1).replace("f", 0)

Encoding for dates

In [227]:
dates_columns = ["Host Since", "Last Review"]

for column in dates_columns:
    df = pd.to_datetime(train_airbnb_df[column], format="%Y-%m-%d")
    res = min(df)
    df = df - res
    df = df.dt.days.astype(np.float64)
    train_airbnb_df[column] = df

### Droping NaN values

In [228]:
train_airbnb_df = train_airbnb_df.replace('*', np.nan).dropna()

### Conversion to float

In [229]:
float_columns = [
    "Accomodates",
    "Bathrooms",
    "Bedrooms",
    "Beds",
    "Guests Included",
    "Min Nights"
]

train_airbnb_df[encoding_columns + dates_columns] = train_airbnb_df[encoding_columns + dates_columns].astype(np.uint8)
train_airbnb_df[float_columns] = train_airbnb_df[float_columns].astype(np.float64)
train_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12493 entries, 0 to 15690
Data columns (total 44 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Listing ID                        12493 non-null  float64
 1   Host ID                           12493 non-null  float64
 2   Host Since                        12493 non-null  uint8  
 3   Is Superhost                      12493 non-null  uint8  
 4   Latitude                          12493 non-null  float64
 5   Longitude                         12493 non-null  float64
 6   Is Exact Location                 12493 non-null  uint8  
 7   Accomodates                       12493 non-null  float64
 8   Bathrooms                         12493 non-null  float64
 9   Bedrooms                          12493 non-null  float64
 10  Beds                              12493 non-null  float64
 11  Guests Included                   12493 non-null  float64
 12  Min 