In [1]:
import math

import numpy as np
import pandas as pd

In [2]:
def string_to_array(s):
    """Convert a pipe-separated string to a list of its parts.

    Parameters
    ----------
    s : str or float
        A string such as ``"a|b|c"``, or a float NaN marking a missing value.

    Returns
    -------
    list
        ``s.split("|")`` for a string (so ``""`` yields ``[""]``), or an
        empty list when ``s`` is NaN.

    Raises
    ------
    ValueError
        If ``s`` is neither a string nor NaN.
    """
    if isinstance(s, str):
        return s.split("|")

    # math.isnan raises TypeError for non-numeric input (None, list, ...);
    # treat that as "not NaN" so the caller gets the documented ValueError.
    try:
        is_missing = math.isnan(s)
    except TypeError:
        is_missing = False

    if is_missing:
        return []
    raise ValueError("Value must be either string or nan")

In [3]:
def explode(df_in, col_expl):
    """Explode a pipe-separated string column into one row per element.

    Each value of ``col_expl`` is split with ``string_to_array``; every other
    column is repeated once per resulting element. Rows whose value is NaN
    split into an empty list and therefore produce no output rows.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame. It is only read, never modified.
    col_expl : str
        Name of the column holding pipe-separated strings (or NaN).

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh ``RangeIndex``. The remaining columns keep
        their original order and ``col_expl`` is appended last, holding one
        ``str`` element per row.
    """
    # Split once up front; df_in is not written to, so no copy is required.
    parts = [string_to_array(value) for value in df_in[col_expl]]
    # Explicit integer dtype keeps np.repeat happy even when there are no rows.
    counts = np.array([len(p) for p in parts], dtype=np.intp)

    df_out = pd.DataFrame(
        {col: np.repeat(df_in[col].values, counts)
         for col in df_in.columns.drop(col_expl)},
        index=pd.RangeIndex(int(counts.sum())),
    )

    # Flatten in pure Python: unlike np.concatenate this works for an empty
    # input and does not coerce dtypes when some rows split to empty lists.
    df_out[col_expl] = [str(item) for row in parts for item in row]

    return df_out

## Get metadata about hotels

In [4]:
# Load the item metadata (one row per item_id, with a pipe-separated
# `properties` string) and preview the first rows.
df = pd.read_csv('../../data/item_metadata.csv')
df.head()

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...


In [5]:
# Check column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 927142 entries, 0 to 927141
Data columns (total 2 columns):
item_id       927142 non-null int64
properties    927142 non-null object
dtypes: int64(1), object(1)
memory usage: 14.1+ MB


In [6]:
# Replace spaces with underscores inside the property names.
# regex=False: ' ' is a plain literal, and the default value of `regex`
# differs between pandas versions, so state it explicitly.
df['properties'] = df['properties'].str.replace(' ', '_', regex=False)

In [7]:
# Preview the frame to confirm spaces in `properties` became underscores.
df.head()

Unnamed: 0,item_id,properties
0,5101,Satellite_TV|Golf_Course|Airport_Shuttle|Cosme...
1,5416,Satellite_TV|Cosmetic_Mirror|Safe_(Hotel)|Tele...
2,5834,Satellite_TV|Cosmetic_Mirror|Safe_(Hotel)|Tele...
3,5910,Satellite_TV|Sailing|Cosmetic_Mirror|Telephone...
4,6066,Satellite_TV|Sailing|Diving|Cosmetic_Mirror|Sa...


In [8]:
# Build a long-format frame: one row per (item_id, single property).
df_out = explode(df, 'properties')

Explode dataframe 

In [9]:
# Preview the exploded frame.
df_out.head()

Unnamed: 0,item_id,properties
0,5101,Satellite_TV
1,5101,Golf_Course
2,5101,Airport_Shuttle
3,5101,Cosmetic_Mirror
4,5101,Safe_(Hotel)


In [10]:
# Check the row count, dtypes and memory usage of the exploded frame.
df_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18260819 entries, 0 to 18260818
Data columns (total 2 columns):
item_id       int64
properties    object
dtypes: int64(1), object(1)
memory usage: 278.6+ MB


Get unique properties 

In [11]:
# Number of distinct property names across all items.
df_out['properties'].nunique(dropna=False)

157

In [12]:
# Distinct property names, in order of first appearance.
df_out['properties'].unique()

array(['Satellite_TV', 'Golf_Course', 'Airport_Shuttle',
       'Cosmetic_Mirror', 'Safe_(Hotel)', 'Telephone', 'Hotel',
       'Sitting_Area_(Rooms)', 'Reception_(24/7)', 'Air_Conditioning',
       'Hypoallergenic_Rooms', 'Cable_TV', 'Hotel_Bar', 'Pool_Table',
       'Bathtub', 'Satisfactory_Rating', 'Room_Service', 'Luxury_Hotel',
       'Terrace_(Hotel)', 'Television', 'Minigolf', 'Business_Hotel',
       'Shower', 'Cot', 'Gym', 'Hairdryer', 'Hypoallergenic_Bedding',
       'Accessible_Parking', 'From_3_Stars', 'Good_Rating', 'Radio',
       '4_Star', 'From_4_Stars', 'Family_Friendly', 'Desk',
       'Tennis_Court_(Indoor)', 'Balcony', 'WiFi_(Public_Areas)',
       'Openable_Windows', 'Express_Check-In_/_Check-Out', 'Restaurant',
       'Laundry_Service', 'Ironing_Board', 'Tennis_Court', 'From_2_Stars',
       'Business_Centre', 'Bowling', 'Conference_Rooms',
       'Electric_Kettle', 'Accessible_Hotel', 'Porter', 'Bike_Rental',
       'Non-Smoking_Rooms', 'Car_Park', 'Safe_(Rooms)'

# Get the top 15 most frequent properties

In [13]:
# The 15 most frequent properties with their occurrence counts.
df_out['properties'].value_counts().head(15)

Satisfactory_Rating     533286
Car_Park                487879
Good_Rating             481910
WiFi_(Rooms)            467027
Shower                  426875
Television              425953
WiFi_(Public_Areas)     399547
Hotel                   379321
Very_Good_Rating        376666
Air_Conditioning        353296
House_/_Apartment       352254
Openable_Windows        348151
Non-Smoking_Rooms       346035
Free_WiFi_(Combined)    318894
Central_Heating         315101
Name: properties, dtype: int64

In [14]:
# Number of distinct items that appear in the exploded frame.
df_out['item_id'].nunique(dropna=False)

927142

Properties selected to be encoded as binary (0/1) feature columns:

 1. 3_Star
 2. Reception_(24/7)
 3. Satisfactory_Rating
 4. Car_Park
 5. Very_Good_Rating

In [15]:
# Re-inspect the per-item frame (not the exploded one) before adding feature columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 927142 entries, 0 to 927141
Data columns (total 2 columns):
item_id       927142 non-null int64
properties    927142 non-null object
dtypes: int64(1), object(1)
memory usage: 14.1+ MB


In [16]:
# Add one 0/1 column per selected property, named after the property.
#
# Matching is done on whole pipe-delimited tokens: the property string is
# wrapped in '|' on both sides so that '|<name>|' also matches when <name> is
# the first or last entry, while still not matching longer names that merely
# contain it (e.g. '3_Star' vs 'From_3_Stars').
_padded_properties = '|' + df['properties'] + '|'
for _prop in ['3_Star',
              'Reception_(24/7)',
              'Satisfactory_Rating',
              'Car_Park',
              'Very_Good_Rating']:
    # regex=False: names contain regex metacharacters such as '(' and '|'.
    # na=False: a missing property string counts as "property absent".
    df[_prop] = np.where(
        _padded_properties.str.contains('|' + _prop + '|', regex=False, na=False),
        1,
        0,
    )

In [17]:
# Preview the frame with the new binary feature columns.
df.head()

Unnamed: 0,item_id,properties,3_Star,Reception_(24/7),Satisfactory_Rating,Car_Park,Very_Good_Rating
0,5101,Satellite_TV|Golf_Course|Airport_Shuttle|Cosme...,0,1,1,1,0
1,5416,Satellite_TV|Cosmetic_Mirror|Safe_(Hotel)|Tele...,0,1,1,1,1
2,5834,Satellite_TV|Cosmetic_Mirror|Safe_(Hotel)|Tele...,1,1,1,1,1
3,5910,Satellite_TV|Sailing|Cosmetic_Mirror|Telephone...,0,0,1,1,0
4,6066,Satellite_TV|Sailing|Diving|Cosmetic_Mirror|Sa...,0,1,1,1,0


In [18]:
# The raw property string is no longer needed once the feature columns exist.
df = df.drop(columns=['properties'])

In [19]:
# Preview the final feature table (item_id plus the binary feature columns).
df.head()

Unnamed: 0,item_id,3_Star,Reception_(24/7),Satisfactory_Rating,Car_Park,Very_Good_Rating
0,5101,0,1,1,1,0
1,5416,0,1,1,1,1
2,5834,1,1,1,1,1
3,5910,0,0,1,1,0
4,6066,0,1,1,1,0


In [20]:
# Write the feature table to CSV; index=False leaves the row index out of the file.
df.to_csv('../../data/5_features.csv', index=False)