In [1]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm

from DataProcessing import RecsData as recs

# Traitements des données
Les données sont des fichiers texte au format **JSON**. Il existe deux fichiers distincts : **offering** et **reviews**.

## Présentation de Offering
Offering contient l'ensemble des hôtels à notre disposition.

In [2]:
# Load the offering file through the project's RecsData wrapper (imported as
# `recs`) and get its records as a list of dicts.
offers = recs("../data/offering.txt")
json_offers = offers.to_json()
# Flatten each record into one row; the nested `address` dict is expanded
# into `address.*` columns. No `meta` argument is passed: pandas only uses
# `meta` together with `record_path`, so the former meta=['address'] had no
# effect on the result and merely disabled json_normalize's fast path.
df_offers = pd.json_normalize(json_offers)
df_offers.head()

Initialize the dataset


4333it [00:00, 90260.34it/s]
100%|███████████████████████████████████████████████████████████████████████████| 4333/4333 [00:00<00:00, 67708.03it/s]


Unnamed: 0,region_id,url,phone,details,type,id,name,address.region,address.street-address,address.postal-code,address.locality,hotel_class
0,42139,http://www.tripadvisor.com/Hotel_Review-g42139...,,,hotel,1901088,A Victory Inn-East Dearborn,MI,9430 Michigan Avenue,48210,Detroit,
1,37209,http://www.tripadvisor.com/Hotel_Review-g37209...,,,hotel,88161,Hampton Inn Indianapolis-East,IN,2311 N. Shadeland Ave,46219,Indianapolis,2.5
2,60956,http://www.tripadvisor.com/Hotel_Review-g60956...,,,hotel,108250,Crowne Plaza San Antonio Riverwalk,TX,111 Pecan Street East,78205,San Antonio,3.5
3,60811,http://www.tripadvisor.com/Hotel_Review-g60811...,,,hotel,124839,Admiral Fell Inn,MD,888 South Broadway,21231,Baltimore,3.0
4,60878,http://www.tripadvisor.com/Hotel_Review-g60878...,,,hotel,1633462,Aurora Nites Inn Motel,WA,11746 Aurora Ave N,98133,Seattle,


In [3]:
# Show the first raw offering record as a plain dict, i.e. before it is
# flattened into a DataFrame row.
print(
    "En json, une ligne se présente comme suit : \n",
    json_offers[0],
)

En json, une ligne se présente comme suit : 
 {'region_id': 42139, 'url': 'http://www.tripadvisor.com/Hotel_Review-g42139-d1901088-Reviews-A_Victory_Inn_East_Dearborn-Detroit_Michigan.html', 'phone': '', 'details': None, 'address': {'region': 'MI', 'street-address': '9430 Michigan Avenue', 'postal-code': '48210', 'locality': 'Detroit'}, 'type': 'hotel', 'id': 1901088, 'name': 'A Victory Inn-East Dearborn'}


## Présentation de reviews
Reviews contient les appréciations laissées par les clients sur les différents hôtels.

In [4]:
# Open the reviews file through the project's RecsData wrapper (imported as `recs`).
# NOTE(review): the recorded run shows ~878k lines iterated in ~18 s at this
# step, so construction appears to scan the whole file — confirm in DataProcessing.
reviews = recs("../data/review.txt")

Initialize the dataset


878561it [00:18, 48101.61it/s]


In [5]:
# `prc` limits how much of the data is read. It is given as a fraction
# (0.3 here): the recorded run parsed 263,568 of the 878,561 reviews.
reviews_json = reviews.to_json(prc=0.3)

100%|███████████████████████████████████████████████████████████████████████| 263568/263568 [00:08<00:00, 29811.41it/s]


In [6]:
# Flatten the nested review records into one row per review; the nested
# `ratings` and `author` dicts are expanded into `ratings.*` / `author.*`
# columns. No `meta` argument is passed: pandas only uses `meta` together
# with `record_path`, so the former meta=['ratings', 'author'] had no effect
# on the result and forced the slower, deep-copying normalization path.
new_reviews = pd.json_normalize(reviews_json)
new_reviews.head()

Unnamed: 0,title,text,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile,ratings.service,ratings.cleanliness,...,ratings.rooms,author.username,author.num_cities,author.num_helpful_votes,author.num_reviews,author.num_type_reviews,author.id,author.location,ratings.check_in_front_desk,ratings.business_service_(e_g_internet_access)
0,“They Aim To Please”,For people going to Tijuana for treatment at t...,July 2012,81797,1,"July 10, 2012",134008424,False,4.0,5.0,...,4.0,travelynx00,15.0,14.0,37.0,12.0,0CC003ACD8029AF2E4A50B5FE51A8A02,"Drayton Valley, AB Canada",,
1,“Ok - men ikke fancy”,"Hvis man har brug for en base, der ligger i et...",May 2012,99443,0,"June 7, 2012",131475455,False,4.0,5.0,...,3.0,samodor,2.0,,3.0,,072A7C0A39F77DE2E7544484E72886C0,"Helsingoer, Denmark",,
2,“perfect location; wonderful hotel”,We booked the Michelangelo for 3 couples for 3...,October 2011,93589,0,"October 22, 2011",119592649,False,5.0,5.0,...,5.0,Sprocketty,2.0,,8.0,,7C3B54FAC5508C9ACB28AB8D19310CCA,,,
3,“Good stay...”,When pulling into this hotel I was a little le...,August 2010,223137,0,"August 10, 2010",74662207,False,4.0,4.0,...,4.0,Kimmie77Hutto,2.0,,2.0,,71C8A40DEBCF21E3F7024D4E7C5FA3CA,"Hutto, Texas",,
4,“Inexcusable problems”,I booked this hotel for a short 2 night stay. ...,March 2012,81444,4,"March 6, 2012",125730556,False,2.0,3.0,...,1.0,Ramatash,51.0,91.0,62.0,59.0,855DA1721EA94DC2F960A9093AF6B9CC,"Redwood City, CA",,


In [7]:
# Display the first raw review record (a nested dict) so it can be compared
# with the flattened DataFrame representation.
print('Reviews en JSON')
reviews_json[0]

Reviews en JSON


{'ratings': {'service': 4.0,
  'cleanliness': 5.0,
  'overall': 4.0,
  'value': 4.0,
  'location': 4.0,
  'sleep_quality': 3.0,
  'rooms': 4.0},
 'title': '“They Aim To Please”',
 'text': "For people going to Tijuana for treatment at the various clinics, this place is a Godsend. They try so hard to make things as easy as possible - they offer shuttle service back and forth across the border for clinic patients IF you are staying with them. They can supply hotplates, slow-cookers, etc., so that you can cook Hoxsey friendly meals in your room. As well, they offer a shuttle to Henry's (an organic market) on Sundays. The rooms are basic, beds comfortable enough, but take your own pillow if you're going to be there for any length of time. I've found the rooms to be very clean every time I've been there. Next time I think I'd ask for a ground floor room, and one away from the pool area - I think the best location might be the building facing West. The breakfast room opens at 6:30 and they of

Voici, par exemple, un extrait des valeurs associées à la clé **author**.

In [8]:
# Keep only the columns whose label contains "author" and preview the
# first rows.
new_reviews.filter(like="author").head()

Unnamed: 0,author.username,author.num_cities,author.num_helpful_votes,author.num_reviews,author.num_type_reviews,author.id,author.location
0,travelynx00,15.0,14.0,37.0,12.0,0CC003ACD8029AF2E4A50B5FE51A8A02,"Drayton Valley, AB Canada"
1,samodor,2.0,,3.0,,072A7C0A39F77DE2E7544484E72886C0,"Helsingoer, Denmark"
2,Sprocketty,2.0,,8.0,,7C3B54FAC5508C9ACB28AB8D19310CCA,
3,Kimmie77Hutto,2.0,,2.0,,71C8A40DEBCF21E3F7024D4E7C5FA3CA,"Hutto, Texas"
4,Ramatash,51.0,91.0,62.0,59.0,855DA1721EA94DC2F960A9093AF6B9CC,"Redwood City, CA"
