In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
from re import search

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer

# Import de la base de données nettoyée

In [3]:
df_airbnb = pd.read_csv('data_cleanedV4.csv', index_col=0)

In [4]:
pd.set_option('display.max_columns', None)
df_airbnb.head()

Unnamed: 0,ID,Name,Summary,Space,Neighborhood Overview,Notes,Transit,Interaction,Host Since,Host Verifications,Neighbourhood Cleansed,Property Type,Room Type,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Amenities,Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Calendar Updated,Availability 60,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Cancellation Policy,Reviews per Month,Geolocation,Features
0,5380461,"Roland Garros, Appart. 2 p. avec balcon, Auteuil.","Appart. ascenc. 1 ch. avec lit double, cuisine...",L'appartement est situé dans le quartier résid...,"J'apprécie le côté résidentiel du quartier, ca...",Nombreux commerces et restaurants à proximité.,Vous pourrez accéder à l'appartement par les l...,Je souhaite louer mon appartement à des person...,2015-02-17,"email,phone,reviews",Passy,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"TV,Internet,Wireless Internet,Kitchen,Elevator...",50.0,500.0,30.0,1,0,4 days ago,15,80.0,7.0,8.0,9.0,9.0,10.0,7.0,flexible,0.16,"48.8445603313,2.26336692547","Host Has Profile Pic,Is Location Exact"
1,12994171,APARTMENT 100 M2,location trocadero Passy very well located ans...,la situation et la tranquillité ainsi que l'ac...,situation à 5 min à pied de la tour Eiffel et ...,,métro à 3 min à pied,si vous avez des besoins particuliers ...cours...,2016-05-12,"email,phone,reviews",Passy,Apartment,Entire home/apt,5,2.0,2.0,2.0,Real Bed,"TV,Cable TV,Internet,Wireless Internet,Wheelch...",250.0,750.0,90.0,4,15,today,28,95.0,10.0,10.0,10.0,10.0,10.0,9.0,strict,0.56,"48.8580714183,2.28022090415","Host Has Profile Pic,Is Location Exact"
2,16270986,Appartement familial église d'auteuil,"Mon logement est proche de Beaugrenelle Paris,...",,,,,,2016-10-25,"email,phone,facebook,reviews",Passy,Apartment,Entire home/apt,7,1.5,2.0,5.0,Real Bed,"TV,Wireless Internet,Air conditioning,Kitchen,...",165.0,200.0,30.0,1,0,today,51,100.0,10.0,10.0,10.0,10.0,10.0,10.0,flexible,1.0,"48.8502032736,2.27083162889","Host Has Profile Pic,Is Location Exact,Instant..."
3,16267918,Studio 25m2 16e arrondissement,Petit studio situé dans le 16 e arrondissement...,,"Boulangerie, coiffeur, Franprix etc",,Bus 72 et 22 arrêt victorien Sardou Météo lig...,,2015-12-11,"email,phone,reviews",Passy,Apartment,Entire home/apt,2,1.0,0.0,1.0,Real Bed,"TV,Internet,Wireless Internet,Kitchen,Elevator...",35.0,94.0,0.0,1,0,4 months ago,0,93.0,10.0,10.0,10.0,10.0,9.0,10.0,moderate,1.38,"48.8420130083,2.26885375663","Host Has Profile Pic,Is Location Exact"
4,8531724,Cosy and bright flat,"Nice flat, recently renovated, bright and cosy...",The flat is in the 4th floor of a secure build...,The area is very pleasant and chic. It is both...,The bedsheets and towels are included in the r...,The flat is very well located for transportati...,I will welcome you in the flat to give you the...,2014-01-28,"email,phone,reviews",Passy,Apartment,Entire home/apt,4,1.0,1.0,2.0,Real Bed,"TV,Internet,Wireless Internet,Kitchen,Elevator...",90.0,500.0,30.0,2,10,6 months ago,39,100.0,10.0,10.0,10.0,10.0,10.0,10.0,moderate,0.35,"48.8410481577,2.25944805867","Host Has Profile Pic,Is Location Exact"


In [5]:
def Host_Since_toint(date_inscription) -> int:
    """Renvoit l'ancienneté de l'hôte à la date du 2020/04/15"""
    
    today = datetime.strptime('2020/04/15',"%Y/%m/%d") 
    delta = today - date_inscription
    return delta.days

In [6]:
def Calendar_Update_toint(sentence: str) -> int:
    """
    Renvoie le nombre de jours depuis la dernière mise à jour du calendrier
    """
    sentence = sentence.strip().lower()
    if sentence == "today":
        return 0
    if sentence == "yesterday":
        return 1
    try:
        days = int(sentence.split()[0])
    except ValueError:
        days = 1 if (sentence.split()[0] == "a") else None
    if "days" in sentence:
        return days
    if "week" in sentence:
        return days * 7
    if "month" in sentence:
        return days * 30
    if 'never' in sentence:
        return 9999
    else:
        return None

In [7]:
def Bed_type_toint(bed: str) -> int:
    """
    Binarisation sur la base d'un vrai lit ou non 
    """
    dico = {
        "Airbed" : 0,
        "Couch" : 0,
        "Futon" : 0,
        "Pull-out Sofa": 0,
        "Real Bed": 1
    }
    return dico[bed.strip()]

In [8]:
def Cancellation_policy_toint(policy: str) -> int:
    """
    renvoit sur une échelle de 0 à 3 la rigidité de la politique d'annulation de la réservation du logement
    """
    dico = {
        "flexible" : 0,
        "moderate" : 1,
        "strict" : 2,
        "super_strict_30": 3,

    }
    return dico[policy.strip()]    

In [9]:
def Property_type_encoding(property_type) -> str:
    """reencode les type de propriétés en un plus petit nombre de catégorie"""
    hostel = ['Dorm','Hostel']
    bed_breakfast = ['Bed & Breakfast','Guesthouse']
    apartment = ['Condominium','Apartment','Serviced apartment']
    house = ['House','Loft','Townhouse','Villa']
    other = ['Boat','Boutique hotel','Bungalow','Cabin','Camper/RV','Cave','Earth House','Igloo','Other','Timeshare','Tipi','Treehouse']
    if property_type in hostel:
        return "hoster"
    elif property_type in bed_breakfast:
        return "bed_breakfast"
    elif property_type in apartment:
        return "apartment"
    elif property_type in house:
        return "house"
    else:
        return "other"

In [10]:
def create_columns(X, column_name):
    """pour les colonnes avec une liste de features, sépare en autant de colonnes qu'il y a de features différentes
    et binarise selon que la feature est présente ou non"""
    X = X.copy()
    X[column_name] = X[column_name].fillna('')
    set_of_amenities = set()
    for amen in X[column_name].value_counts().index:
        amenities = [e for e in amen.split(',')]
        set_of_amenities = set_of_amenities | set(amenities)
    if '' in set_of_amenities:
        set_of_amenities.remove('')
    for amen in set_of_amenities:
        X[column_name + " : " + amen] = X[column_name].apply(lambda a : 1 if amen in a.split(',') else 0)
    X = X.drop(columns=column_name)
    return X

## Creation d'une fonction regroupant tout le preproccesing

In [11]:
def data_preprocessing(data):
    X = data.copy()
    X = X.drop(columns=['ID','Name','Summary','Space','Neighborhood Overview',
                        'Notes','Transit','Interaction'])
    
    X['Calendar Updated'] = X['Calendar Updated'].apply(Calendar_Update_toint)
    X['Host Since'] =  pd.to_datetime(X['Host Since']).apply(Host_Since_toint)
    X['Host Since'] =  X['Host Since'].fillna(X['Calendar Updated']) ## remplace les date manquantes par la dernière date de réservation
    X['Bed Type'] = X['Bed Type'].apply(Bed_type_toint)
    
    X['Cancellation Policy'] = X['Cancellation Policy'].apply(Cancellation_policy_toint)
    
    X = pd.concat([X, pd.get_dummies(X['Room Type'])], axis=1)
    X = X.drop(columns='Room Type')
    
    X['Property Type'] = X['Property Type'].apply(Property_type_encoding)
    X = pd.concat([X, pd.get_dummies(X['Property Type'])], axis=1)
    X = X.drop(columns='Property Type')
    X = create_columns(X, 'Amenities')
    X = create_columns(X, 'Host Verifications')
    X = create_columns(X, 'Features')
    X = create_columns(X, 'Neighbourhood Cleansed')
    X = X.drop(columns=['Review Scores Accuracy','Review Scores Cleanliness', 'Review Scores Checkin',
                        'Review Scores Communication', 'Review Scores Location','Review Scores Value', 
                       'Host Verifications : sesame_offline', 'Host Verifications : identity_manual', 
                        'Host Verifications : jumio', 'Private room','Shared room'])
    return X

In [25]:
preprocessor = FunctionTransformer(data_preprocessing)

In [12]:
A = preprocessor.transform(df_airbnb)
A

Unnamed: 0,Host Since,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Calendar Updated,Availability 60,Review Scores Rating,Cancellation Policy,Reviews per Month,Geolocation,Entire home/apt,apartment,bed_breakfast,hoster,house,other,Amenities : Fire extinguisher,Amenities : Baby bath,Amenities : Washer,Amenities : Free parking on premises,Amenities : Cat(s),Amenities : Private living room,Amenities : Pool,Amenities : Cable TV,Amenities : Essentials,Amenities : Children’s dinnerware,Amenities : Washer / Dryer,Amenities : Keypad,Amenities : Crib,Amenities : Changing table,Amenities : First aid kit,Amenities : Internet,Amenities : Babysitter recommendations,Amenities : Gym,Amenities : 24-hour check-in,Amenities : Buzzer/wireless intercom,Amenities : translation missing: en.hosting_amenity_50,Amenities : Hangers,Amenities : Children’s books and toys,Amenities : Wireless Internet,Amenities : Fireplace guards,Amenities : Pets live on this property,Amenities : Air conditioning,Amenities : Kitchen,Amenities : Smoking allowed,Amenities : Paid parking off premises,Amenities : Shampoo,Amenities : Smoke detector,Amenities : Pack ’n Play/travel crib,Amenities : Other pet(s),Amenities : Table corner guards,Amenities : Game console,Amenities : Lock on bedroom door,Amenities : Free parking on street,Amenities : Dog(s),Amenities : Window guards,Amenities : Private entrance,Amenities : Self Check-In,Amenities : Stair gates,Amenities : Safety card,Amenities : High chair,Amenities : Lockbox,Amenities : Elevator in building,Amenities : Laptop friendly workspace,Amenities : Carbon monoxide detector,Amenities : TV,Amenities : Indoor fireplace,Amenities : Hair dryer,Amenities : Room-darkening shades,Amenities : Pets allowed,Amenities : Baby monitor,Amenities : Smartlock,Amenities : translation missing: en.hosting_amenity_49,Amenities : Family/kid friendly,Amenities : Dryer,Amenities : Heating,Amenities : Bathtub,Amenities : Wheelchair accessible,Amenities : Doorman Entry,Amenities : Suitable for events,Amenities : Hot tub,Amenities : Outlet covers,Amenities : Breakfast,Amenities : Doorman,Amenities : Iron,Host Verifications : phone,Host Verifications : government_id,Host Verifications : google,Host Verifications : selfie,Host Verifications : None,Host Verifications : sesame,Host Verifications : linkedin,Host Verifications : sent_id,Host Verifications : offline_government_id,Host Verifications : weibo,Host Verifications : reviews,Host Verifications : email,Host Verifications : manual_offline,Host Verifications : photographer,Host Verifications : facebook,Host Verifications : kba,Host Verifications : manual_online,Host Verifications : amex,Host Verifications : work_email,Features : Host Is Superhost,Features : Instant Bookable,Features : Require Guest Phone Verification,Features : Host Has Profile Pic,Features : Is Location Exact,Features : Require Guest Profile Picture,Features : Host Identity Verified,Neighbourhood Cleansed : Palais-Bourbon,Neighbourhood Cleansed : Observatoire,Neighbourhood Cleansed : Vaugirard,Neighbourhood Cleansed : Entrepôt,Neighbourhood Cleansed : Ménilmontant,Neighbourhood Cleansed : Bourse,Neighbourhood Cleansed : Hôtel-de-Ville,Neighbourhood Cleansed : Opéra,Neighbourhood Cleansed : Luxembourg,Neighbourhood Cleansed : Buttes-Montmartre,Neighbourhood Cleansed : Gobelins,Neighbourhood Cleansed : Popincourt,Neighbourhood Cleansed : Élysée,Neighbourhood Cleansed : Batignolles-Monceau,Neighbourhood Cleansed : Passy,Neighbourhood Cleansed : Temple,Neighbourhood Cleansed : Panthéon,Neighbourhood Cleansed : Louvre,Neighbourhood Cleansed : Buttes-Chaumont,Neighbourhood Cleansed : Reuilly
0,1884.0,2,1.0,1.0,1.0,1,50.0,500.0,30.0,1,0,4,15,80.0,0,0.16,"48.8445603313,2.26336692547",1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1434.0,5,2.0,2.0,2.0,1,250.0,750.0,90.0,4,15,0,28,95.0,2,0.56,"48.8580714183,2.28022090415",1,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,1268.0,7,1.5,2.0,5.0,1,165.0,200.0,30.0,1,0,0,51,100.0,0,1.00,"48.8502032736,2.27083162889",1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,1587.0,2,1.0,0.0,1.0,1,35.0,94.0,0.0,1,0,120,0,93.0,1,1.38,"48.8420130083,2.26885375663",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,2269.0,4,1.0,1.0,2.0,1,90.0,500.0,30.0,2,10,180,39,100.0,1,0.35,"48.8410481577,2.25944805867",1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40023,2072.0,2,1.0,1.0,1.0,1,40.0,300.0,19.0,1,5,7,10,100.0,1,0.71,"48.8619682733,2.40198612937",1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40024,1983.0,2,1.0,0.0,2.0,1,26.0,200.0,25.0,1,0,240,0,80.0,2,0.13,"48.8664429191,2.39643395508",1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40025,1863.0,2,1.0,1.0,1.0,0,50.0,0.0,0.0,1,0,7,10,100.0,1,0.55,"48.8663673103,2.39473441637",0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40026,2674.0,2,1.0,1.0,1.0,1,40.0,200.0,0.0,1,30,0,0,98.0,1,0.47,"48.8680706349,2.39800286126",0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [82]:
def data_preprocessing_bis(data):
    X = data.copy()
    X = X.drop(columns=['ID','Name','Summary','Space','Neighborhood Overview',
                        'Notes','Transit','Interaction'])
    
    X['Calendar Updated'] = X['Calendar Updated'].apply(Calendar_Update_toint)
    X['Host Since'] =  pd.to_datetime(X['Host Since']).apply(Host_Since_toint)
    X['Host Since'] =  X['Host Since'].fillna(X['Calendar Updated']) ## remplace les date manquantes par la dernière date de réservation
    X['Bed Type'] = X['Bed Type'].apply(Bed_type_toint)
    
    X['Cancellation Policy'] = X['Cancellation Policy'].apply(Cancellation_policy_toint)
    X['Property Type'] = X['Property Type'].apply(Property_type_encoding)
    """
    X = pd.concat([X, pd.get_dummies(X['Room Type'])], axis=1)
    X = X.drop(columns='Room Type')
    
    X = pd.concat([X, pd.get_dummies(X['Property Type'])], axis=1)
    X = X.drop(columns='Property Type')
    X = create_columns(X, 'Neighbourhood Cleansed')
    """
    X = create_columns(X, 'Amenities')
    X = create_columns(X, 'Host Verifications')
    X = create_columns(X, 'Features')
    X = X.drop(columns=['Review Scores Accuracy','Review Scores Cleanliness', 'Review Scores Checkin',
                        'Review Scores Communication', 'Review Scores Location','Review Scores Value', 
                        'Host Verifications : sesame_offline', 'Host Verifications : identity_manual', 
                        'Host Verifications : jumio']) #'Private room','Shared room'
                       
    X = X.fillna('')
    return X

In [83]:
preprocessor_bis = FunctionTransformer(data_preprocessing_bis)

In [84]:
B = preprocessor_bis.transform(df_airbnb)
B

Unnamed: 0,Host Since,Neighbourhood Cleansed,Property Type,Room Type,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Calendar Updated,Availability 60,Review Scores Rating,Cancellation Policy,Reviews per Month,Geolocation,Amenities : Fire extinguisher,Amenities : Baby bath,Amenities : Washer,Amenities : Free parking on premises,Amenities : Cat(s),Amenities : Private living room,Amenities : Pool,Amenities : Cable TV,Amenities : Essentials,Amenities : Children’s dinnerware,Amenities : Washer / Dryer,Amenities : Keypad,Amenities : Crib,Amenities : Changing table,Amenities : First aid kit,Amenities : Internet,Amenities : Babysitter recommendations,Amenities : Gym,Amenities : 24-hour check-in,Amenities : Buzzer/wireless intercom,Amenities : translation missing: en.hosting_amenity_50,Amenities : Hangers,Amenities : Children’s books and toys,Amenities : Wireless Internet,Amenities : Fireplace guards,Amenities : Pets live on this property,Amenities : Air conditioning,Amenities : Kitchen,Amenities : Smoking allowed,Amenities : Paid parking off premises,Amenities : Shampoo,Amenities : Smoke detector,Amenities : Pack ’n Play/travel crib,Amenities : Other pet(s),Amenities : Table corner guards,Amenities : Game console,Amenities : Lock on bedroom door,Amenities : Free parking on street,Amenities : Dog(s),Amenities : Window guards,Amenities : Private entrance,Amenities : Self Check-In,Amenities : Stair gates,Amenities : Safety card,Amenities : High chair,Amenities : Lockbox,Amenities : Elevator in building,Amenities : Laptop friendly workspace,Amenities : Carbon monoxide detector,Amenities : TV,Amenities : Indoor fireplace,Amenities : Hair dryer,Amenities : Room-darkening shades,Amenities : Pets allowed,Amenities : Baby monitor,Amenities : Smartlock,Amenities : translation missing: en.hosting_amenity_49,Amenities : Family/kid friendly,Amenities : Dryer,Amenities : Heating,Amenities : Bathtub,Amenities : Wheelchair accessible,Amenities : Doorman Entry,Amenities : Suitable for events,Amenities : Hot tub,Amenities : Outlet covers,Amenities : Breakfast,Amenities : Doorman,Amenities : Iron,Host Verifications : phone,Host Verifications : government_id,Host Verifications : google,Host Verifications : selfie,Host Verifications : None,Host Verifications : sesame,Host Verifications : linkedin,Host Verifications : sent_id,Host Verifications : offline_government_id,Host Verifications : weibo,Host Verifications : reviews,Host Verifications : email,Host Verifications : manual_offline,Host Verifications : photographer,Host Verifications : facebook,Host Verifications : kba,Host Verifications : manual_online,Host Verifications : amex,Host Verifications : work_email,Features : Host Is Superhost,Features : Instant Bookable,Features : Require Guest Phone Verification,Features : Host Has Profile Pic,Features : Is Location Exact,Features : Require Guest Profile Picture,Features : Host Identity Verified
0,1884.0,Passy,apartment,Entire home/apt,2,1.0,1.0,1.0,1,50.0,500.0,30.0,1,0,4,15,80.0,0,0.16,"48.8445603313,2.26336692547",0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
1,1434.0,Passy,apartment,Entire home/apt,5,2.0,2.0,2.0,1,250.0,750.0,90.0,4,15,0,28,95.0,2,0.56,"48.8580714183,2.28022090415",0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
2,1268.0,Passy,apartment,Entire home/apt,7,1.5,2.0,5.0,1,165.0,200.0,30.0,1,0,0,51,100.0,0,1.00,"48.8502032736,2.27083162889",0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0
3,1587.0,Passy,apartment,Entire home/apt,2,1.0,0.0,1.0,1,35.0,94.0,0.0,1,0,120,0,93.0,1,1.38,"48.8420130083,2.26885375663",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,2269.0,Passy,apartment,Entire home/apt,4,1.0,1.0,2.0,1,90.0,500.0,30.0,2,10,180,39,100.0,1,0.35,"48.8410481577,2.25944805867",0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40023,2072.0,Ménilmontant,apartment,Entire home/apt,2,1.0,1.0,1.0,1,40.0,300.0,19.0,1,5,7,10,100.0,1,0.71,"48.8619682733,2.40198612937",0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0
40024,1983.0,Ménilmontant,apartment,Entire home/apt,2,1.0,0.0,2.0,1,26.0,200.0,25.0,1,0,240,0,80.0,2,0.13,"48.8664429191,2.39643395508",0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1
40025,1863.0,Ménilmontant,apartment,Private room,2,1.0,1.0,1.0,0,50.0,0.0,0.0,1,0,7,10,100.0,1,0.55,"48.8663673103,2.39473441637",0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0
40026,2674.0,Ménilmontant,apartment,Private room,2,1.0,1.0,1.0,1,40.0,200.0,0.0,1,30,0,0,98.0,1,0.47,"48.8680706349,2.39800286126",0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1


# Feature engineering

In [41]:
df_tourisme = pd.read_csv("paris_poi.csv", index_col=0)

In [42]:
df_tourisme

Unnamed: 0,address,lat,lon,name,subCategory
0,23 Quai Anatole France,48.862078,2.322534,Concorde Atlantique,Boat or Ferry
1,17 Place du Trocadéro,48.862301,2.288225,Palais de Chaillot,Monument / Landmark
2,Quai François Mauriac,48.833679,2.375686,Bibliothèque Nationale de France (BNF),Library
3,Place Charles de Gaulle,48.873797,2.294952,Arc de Triomphe,Monument / Landmark
4,4 Boulevard du Palais,48.855920,2.344637,Palais de Justice de Paris,Courthouse
...,...,...,...,...,...
1729,159 Boulevard Malesherbe,48.885632,2.306501,Cabinet Dentaire - Dr Cadet & Dr Bertrant,Dentist's Office
1730,10 Rue Civiale,48.871437,2.374117,cabinet paramedical civiale,Medical Center
1731,Rue Pajol,48.866977,2.353663,Bibliothèque Vaclav Havel,Library
1732,10 Rue Louis-Blanc,48.879165,2.369245,Consulat Chaoui - Paris X,Embassy / Consulate


In [43]:
def nearest_average(lat, lon, ys, n:int):
    """
    return the index in df_t of the "n" neareast and their distances
    in the form (distances, indices)
    """
    x = np.array([lat, lon])
    #distances in km
    distances = np.linalg.norm(x - ys, axis=1) * 77.14017896147689
    indices = distances.argsort()[:n]

    return np.mean(distances[indices])

In [44]:
def create_average_poi(X, number_of_POIs, df_tour):
    """créer une colonne donnant la distance moyennes à i point d'intérêt les plus proches du logement"""
    X = X.copy()
    lat = X.Geolocation.apply(lambda x : float(x.split(',')[0]))
    lon = X.Geolocation.apply(lambda x : float(x.split(',')[1]))
    ys = np.array(list(zip(df_tour['lat'],  df_tour['lon']))) # coordonnées des lieux touristiques
    
    X[str(number_of_POIs) + " nearest POI average"] = [nearest_average(lat[i], lon[i], ys, number_of_POIs) for i in range(len(X))]
    return X

In [45]:
def feature_engineering(data, n=30):
    """renvoie la colonne contenant la distance moyennes au n points d'intérêt les plus proches"""
    X = data.copy()
    df_tour = pd.read_csv("paris_poi.csv", index_col=0)
    X = create_average_poi(X, n, df_tour)
    X = X.drop(columns='Geolocation')
    return X

feature_generator = FunctionTransformer(feature_engineering)

In [18]:
A = feature_generator.transform(df_airbnb)

## Création d'une fonction regroupant le preprocessing et le feature engineering

In [19]:
processor = make_pipeline(preprocessor, feature_generator)

In [20]:
A = processor.transform(df_airbnb)

# Données Finales

In [21]:
A

Unnamed: 0,Host Since,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Calendar Updated,Availability 60,Review Scores Rating,Cancellation Policy,Reviews per Month,Entire home/apt,apartment,bed_breakfast,hoster,house,other,Amenities : Fire extinguisher,Amenities : Baby bath,Amenities : Washer,Amenities : Free parking on premises,Amenities : Cat(s),Amenities : Private living room,Amenities : Pool,Amenities : Cable TV,Amenities : Essentials,Amenities : Children’s dinnerware,Amenities : Washer / Dryer,Amenities : Keypad,Amenities : Crib,Amenities : Changing table,Amenities : First aid kit,Amenities : Internet,Amenities : Babysitter recommendations,Amenities : Gym,Amenities : 24-hour check-in,Amenities : Buzzer/wireless intercom,Amenities : translation missing: en.hosting_amenity_50,Amenities : Hangers,Amenities : Children’s books and toys,Amenities : Wireless Internet,Amenities : Fireplace guards,Amenities : Pets live on this property,Amenities : Air conditioning,Amenities : Kitchen,Amenities : Smoking allowed,Amenities : Paid parking off premises,Amenities : Shampoo,Amenities : Smoke detector,Amenities : Pack ’n Play/travel crib,Amenities : Other pet(s),Amenities : Table corner guards,Amenities : Game console,Amenities : Lock on bedroom door,Amenities : Free parking on street,Amenities : Dog(s),Amenities : Window guards,Amenities : Private entrance,Amenities : Self Check-In,Amenities : Stair gates,Amenities : Safety card,Amenities : High chair,Amenities : Lockbox,Amenities : Elevator in building,Amenities : Laptop friendly workspace,Amenities : Carbon monoxide detector,Amenities : TV,Amenities : Indoor fireplace,Amenities : Hair dryer,Amenities : Room-darkening shades,Amenities : Pets allowed,Amenities : Baby monitor,Amenities : Smartlock,Amenities : translation missing: en.hosting_amenity_49,Amenities : Family/kid friendly,Amenities : Dryer,Amenities : Heating,Amenities : Bathtub,Amenities : Wheelchair accessible,Amenities : Doorman Entry,Amenities : Suitable for events,Amenities : Hot tub,Amenities : Outlet covers,Amenities : Breakfast,Amenities : Doorman,Amenities : Iron,Host Verifications : phone,Host Verifications : government_id,Host Verifications : google,Host Verifications : selfie,Host Verifications : None,Host Verifications : sesame,Host Verifications : linkedin,Host Verifications : sent_id,Host Verifications : offline_government_id,Host Verifications : weibo,Host Verifications : reviews,Host Verifications : email,Host Verifications : manual_offline,Host Verifications : photographer,Host Verifications : facebook,Host Verifications : kba,Host Verifications : manual_online,Host Verifications : amex,Host Verifications : work_email,Features : Host Is Superhost,Features : Instant Bookable,Features : Require Guest Phone Verification,Features : Host Has Profile Pic,Features : Is Location Exact,Features : Require Guest Profile Picture,Features : Host Identity Verified,Neighbourhood Cleansed : Palais-Bourbon,Neighbourhood Cleansed : Observatoire,Neighbourhood Cleansed : Vaugirard,Neighbourhood Cleansed : Entrepôt,Neighbourhood Cleansed : Ménilmontant,Neighbourhood Cleansed : Bourse,Neighbourhood Cleansed : Hôtel-de-Ville,Neighbourhood Cleansed : Opéra,Neighbourhood Cleansed : Luxembourg,Neighbourhood Cleansed : Buttes-Montmartre,Neighbourhood Cleansed : Gobelins,Neighbourhood Cleansed : Popincourt,Neighbourhood Cleansed : Élysée,Neighbourhood Cleansed : Batignolles-Monceau,Neighbourhood Cleansed : Passy,Neighbourhood Cleansed : Temple,Neighbourhood Cleansed : Panthéon,Neighbourhood Cleansed : Louvre,Neighbourhood Cleansed : Buttes-Chaumont,Neighbourhood Cleansed : Reuilly,30 nearest POI average
0,1884.0,2,1.0,1.0,1.0,1,50.0,500.0,30.0,1,0,4,15,80.0,0,0.16,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.257026
1,1434.0,5,2.0,2.0,2.0,1,250.0,750.0,90.0,4,15,0,28,95.0,2,0.56,1,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.332669
2,1268.0,7,1.5,2.0,5.0,1,165.0,200.0,30.0,1,0,0,51,100.0,0,1.00,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.224522
3,1587.0,2,1.0,0.0,1.0,1,35.0,94.0,0.0,1,0,120,0,93.0,1,1.38,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.373449
4,2269.0,4,1.0,1.0,2.0,1,90.0,500.0,30.0,2,10,180,39,100.0,1,0.35,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.343726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40023,2072.0,2,1.0,1.0,1.0,1,40.0,300.0,19.0,1,5,7,10,100.0,1,0.71,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.348375
40024,1983.0,2,1.0,0.0,2.0,1,26.0,200.0,25.0,1,0,240,0,80.0,2,0.13,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.275906
40025,1863.0,2,1.0,1.0,1.0,0,50.0,0.0,0.0,1,0,7,10,100.0,1,0.55,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.354959
40026,2674.0,2,1.0,1.0,1.0,1,40.0,200.0,0.0,1,30,0,0,98.0,1,0.47,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.268313


In [85]:
processor_bis = make_pipeline(preprocessor_bis, feature_generator)

In [86]:
B = processor_bis.transform(df_airbnb)

# Données auxilliaires

In [87]:
B

Unnamed: 0,Host Since,Neighbourhood Cleansed,Property Type,Room Type,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Calendar Updated,Availability 60,Review Scores Rating,Cancellation Policy,Reviews per Month,Amenities : Fire extinguisher,Amenities : Baby bath,Amenities : Washer,Amenities : Free parking on premises,Amenities : Cat(s),Amenities : Private living room,Amenities : Pool,Amenities : Cable TV,Amenities : Essentials,Amenities : Children’s dinnerware,Amenities : Washer / Dryer,Amenities : Keypad,Amenities : Crib,Amenities : Changing table,Amenities : First aid kit,Amenities : Internet,Amenities : Babysitter recommendations,Amenities : Gym,Amenities : 24-hour check-in,Amenities : Buzzer/wireless intercom,Amenities : translation missing: en.hosting_amenity_50,Amenities : Hangers,Amenities : Children’s books and toys,Amenities : Wireless Internet,Amenities : Fireplace guards,Amenities : Pets live on this property,Amenities : Air conditioning,Amenities : Kitchen,Amenities : Smoking allowed,Amenities : Paid parking off premises,Amenities : Shampoo,Amenities : Smoke detector,Amenities : Pack ’n Play/travel crib,Amenities : Other pet(s),Amenities : Table corner guards,Amenities : Game console,Amenities : Lock on bedroom door,Amenities : Free parking on street,Amenities : Dog(s),Amenities : Window guards,Amenities : Private entrance,Amenities : Self Check-In,Amenities : Stair gates,Amenities : Safety card,Amenities : High chair,Amenities : Lockbox,Amenities : Elevator in building,Amenities : Laptop friendly workspace,Amenities : Carbon monoxide detector,Amenities : TV,Amenities : Indoor fireplace,Amenities : Hair dryer,Amenities : Room-darkening shades,Amenities : Pets allowed,Amenities : Baby monitor,Amenities : Smartlock,Amenities : translation missing: en.hosting_amenity_49,Amenities : Family/kid friendly,Amenities : Dryer,Amenities : Heating,Amenities : Bathtub,Amenities : Wheelchair accessible,Amenities : Doorman Entry,Amenities : Suitable for events,Amenities : Hot tub,Amenities : Outlet covers,Amenities : Breakfast,Amenities : Doorman,Amenities : Iron,Host Verifications : phone,Host Verifications : government_id,Host Verifications : google,Host Verifications : selfie,Host Verifications : None,Host Verifications : sesame,Host Verifications : linkedin,Host Verifications : sent_id,Host Verifications : offline_government_id,Host Verifications : weibo,Host Verifications : reviews,Host Verifications : email,Host Verifications : manual_offline,Host Verifications : photographer,Host Verifications : facebook,Host Verifications : kba,Host Verifications : manual_online,Host Verifications : amex,Host Verifications : work_email,Features : Host Is Superhost,Features : Instant Bookable,Features : Require Guest Phone Verification,Features : Host Has Profile Pic,Features : Is Location Exact,Features : Require Guest Profile Picture,Features : Host Identity Verified,30 nearest POI average
0,1884.0,Passy,apartment,Entire home/apt,2,1.0,1.0,1.0,1,50.0,500.0,30.0,1,0,4,15,80.0,0,0.16,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0.257026
1,1434.0,Passy,apartment,Entire home/apt,5,2.0,2.0,2.0,1,250.0,750.0,90.0,4,15,0,28,95.0,2,0.56,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0.332669
2,1268.0,Passy,apartment,Entire home/apt,7,1.5,2.0,5.0,1,165.0,200.0,30.0,1,0,0,51,100.0,0,1.00,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0.224522
3,1587.0,Passy,apartment,Entire home/apt,2,1.0,0.0,1.0,1,35.0,94.0,0.0,1,0,120,0,93.0,1,1.38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0.373449
4,2269.0,Passy,apartment,Entire home/apt,4,1.0,1.0,2.0,1,90.0,500.0,30.0,2,10,180,39,100.0,1,0.35,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0.343726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40023,2072.0,Ménilmontant,apartment,Entire home/apt,2,1.0,1.0,1.0,1,40.0,300.0,19.0,1,5,7,10,100.0,1,0.71,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0.348375
40024,1983.0,Ménilmontant,apartment,Entire home/apt,2,1.0,0.0,2.0,1,26.0,200.0,25.0,1,0,240,0,80.0,2,0.13,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0.275906
40025,1863.0,Ménilmontant,apartment,Private room,2,1.0,1.0,1.0,0,50.0,0.0,0.0,1,0,7,10,100.0,1,0.55,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0.354959
40026,2674.0,Ménilmontant,apartment,Private room,2,1.0,1.0,1.0,1,40.0,200.0,0.0,1,30,0,0,98.0,1,0.47,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0.268313


In [88]:
B['Room Type'].value_counts()

Entire home/apt    34727
Private room        4924
Shared room          377
Name: Room Type, dtype: int64

In [76]:
B.to_csv('unb_dataset.csv')

# Aucune valeur manquante dans le dataset final

In [22]:
for a, b in zip(df_airbnb.isna().sum().index, df_airbnb.isna().sum()):
    if b > 0:
        print(a, " : ", b)

Name  :  1
Summary  :  1818
Space  :  9509
Neighborhood Overview  :  13907
Notes  :  25418
Transit  :  12060
Interaction  :  18534
Host Since  :  2
Amenities  :  135
Features  :  4


In [23]:
for a, b in zip(A.isna().sum().index, A.isna().sum()):
    if b > 0:
        print(a, " : ", b)

In [24]:
A

Unnamed: 0,Host Since,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Calendar Updated,Availability 60,Review Scores Rating,Cancellation Policy,Reviews per Month,Entire home/apt,apartment,bed_breakfast,hoster,house,other,Amenities : Pets live on this property,Amenities : Outlet covers,Amenities : Gym,Amenities : Internet,Amenities : Babysitter recommendations,Amenities : Room-darkening shades,Amenities : Wheelchair accessible,Amenities : Kitchen,Amenities : Smartlock,Amenities : Baby monitor,Amenities : Pack ’n Play/travel crib,Amenities : 24-hour check-in,Amenities : Private entrance,Amenities : Fireplace guards,Amenities : Hangers,Amenities : Doorman Entry,Amenities : Other pet(s),Amenities : Baby bath,Amenities : Family/kid friendly,Amenities : Hot tub,Amenities : Changing table,Amenities : Breakfast,Amenities : Smoking allowed,Amenities : Pets allowed,Amenities : Paid parking off premises,Amenities : Free parking on premises,Amenities : Washer,Amenities : Dog(s),Amenities : Cat(s),Amenities : Safety card,Amenities : Cable TV,Amenities : translation missing: en.hosting_amenity_50,Amenities : Shampoo,Amenities : Carbon monoxide detector,Amenities : Laptop friendly workspace,Amenities : Indoor fireplace,Amenities : Free parking on street,Amenities : Fire extinguisher,Amenities : Crib,Amenities : Stair gates,Amenities : Elevator in building,Amenities : Window guards,Amenities : Private living room,Amenities : Self Check-In,Amenities : Dryer,Amenities : Smoke detector,Amenities : Bathtub,Amenities : Wireless Internet,Amenities : Buzzer/wireless intercom,Amenities : Air conditioning,Amenities : Suitable for events,Amenities : Lock on bedroom door,Amenities : Keypad,Amenities : Essentials,Amenities : Hair dryer,Amenities : Children’s books and toys,Amenities : Heating,Amenities : Game console,Amenities : TV,Amenities : Iron,Amenities : Children’s dinnerware,Amenities : Pool,Amenities : Washer / Dryer,Amenities : translation missing: en.hosting_amenity_49,Amenities : High chair,Amenities : First aid kit,Amenities : Lockbox,Amenities : Table corner guards,Amenities : Doorman,Host Verifications : email,Host Verifications : government_id,Host Verifications : sesame,Host Verifications : photographer,Host Verifications : google,Host Verifications : facebook,Host Verifications : linkedin,Host Verifications : sent_id,Host Verifications : manual_offline,Host Verifications : amex,Host Verifications : None,Host Verifications : weibo,Host Verifications : kba,Host Verifications : phone,Host Verifications : work_email,Host Verifications : selfie,Host Verifications : offline_government_id,Host Verifications : reviews,Host Verifications : manual_online,Features : Instant Bookable,Features : Host Is Superhost,Features : Require Guest Phone Verification,Features : Host Has Profile Pic,Features : Require Guest Profile Picture,Features : Is Location Exact,Features : Host Identity Verified,Neighbourhood Cleansed : Entrepôt,Neighbourhood Cleansed : Batignolles-Monceau,Neighbourhood Cleansed : Hôtel-de-Ville,Neighbourhood Cleansed : Buttes-Montmartre,Neighbourhood Cleansed : Observatoire,Neighbourhood Cleansed : Buttes-Chaumont,Neighbourhood Cleansed : Temple,Neighbourhood Cleansed : Bourse,Neighbourhood Cleansed : Ménilmontant,Neighbourhood Cleansed : Louvre,Neighbourhood Cleansed : Passy,Neighbourhood Cleansed : Popincourt,Neighbourhood Cleansed : Opéra,Neighbourhood Cleansed : Gobelins,Neighbourhood Cleansed : Panthéon,Neighbourhood Cleansed : Élysée,Neighbourhood Cleansed : Vaugirard,Neighbourhood Cleansed : Luxembourg,Neighbourhood Cleansed : Reuilly,Neighbourhood Cleansed : Palais-Bourbon,30 nearest POI average
0,1884.0,2,1.0,1.0,1.0,1,50.0,500.0,30.0,1,0,4,15,80.0,0,0.16,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.257026
1,1434.0,5,2.0,2.0,2.0,1,250.0,750.0,90.0,4,15,0,28,95.0,2,0.56,1,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,1,1,0,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.332669
2,1268.0,7,1.5,2.0,5.0,1,165.0,200.0,30.0,1,0,0,51,100.0,0,1.00,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.224522
3,1587.0,2,1.0,0.0,1.0,1,35.0,94.0,0.0,1,0,120,0,93.0,1,1.38,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.373449
4,2269.0,4,1.0,1.0,2.0,1,90.0,500.0,30.0,2,10,180,39,100.0,1,0.35,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.343726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40023,2072.0,2,1.0,1.0,1.0,1,40.0,300.0,19.0,1,5,7,10,100.0,1,0.71,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,1,1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.348375
40024,1983.0,2,1.0,0.0,2.0,1,26.0,200.0,25.0,1,0,240,0,80.0,2,0.13,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.275906
40025,1863.0,2,1.0,1.0,1.0,0,50.0,0.0,0.0,1,0,7,10,100.0,1,0.55,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.354959
40026,2674.0,2,1.0,1.0,1.0,1,40.0,200.0,0.0,1,30,0,0,98.0,1,0.47,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.268313


In [23]:
#A.to_csv("final_cleaned_dataset.csv")