# Idealista API JSON Files
Convert JSON files into one DataFrame to work with

In [1]:
# Load Google Drive
from google.colab import drive

# Mount Google Drive through authorization
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load all necessary libraries
import json, os
from pandas import json_normalize
import pandas as pd
import numpy as np

All JSON files downloaded from the API of Idealista.

In [3]:
### CHANGE THIS PATHNAME TO THE ONE THAT CONTAINS ALL JSON FILES ###
path = '/content/drive/My Drive'
# Set Working Directory where all JSON files are located
os.chdir(path)
# View Working Directory
os.getcwd()

'/content/drive/My Drive'

In [4]:
# Empty list to store data
appended_data = []

# For loop to go through all files in the directory
for filename in os.listdir(path):
    # Condition to only open JSON files
    if filename.endswith('.json'):
        # Load all JSON files
        with open(os.path.join(path, filename)) as f:
            data = json.load(f)
    # Create DataFrame from normalized JSON dictionary
    df = pd.DataFrame.from_dict(json_normalize(data['elementList']))

    # Append the empty list to store the DataFrame data
    appended_data.append(df)

# Concatinate DataFrames
appended_data = pd.concat(appended_data)
df = appended_data
# Save DataFrame as a CSV
#df.to_csv('ALL-JSON-FILES.csv')

# Count number of variables (columns)
print('Number of variables:', len(df.columns))
# Count number of records (rows)
print('Number of records:', len(df.index))

# See IF a file has a wrong format
#print(f)

Number of variables: 40
Number of records: 24850


## Overview of the Data

In [5]:
# Preview "dirty" data
df.head()

Unnamed: 0,propertyCode,thumbnail,externalReference,numPhotos,price,propertyType,operation,size,exterior,rooms,bathrooms,address,province,municipality,district,country,latitude,longitude,showAddress,url,distance,hasVideo,status,newDevelopment,newDevelopmentFinished,priceByArea,hasPlan,has3DTour,has360,parkingSpace.hasParkingSpace,parkingSpace.isParkingSpaceIncludedInPrice,detailedType.typology,detailedType.subTypology,suggestedTexts.subtitle,suggestedTexts.title,neighborhood,floor,hasLift,parkingSpace.parkingSpacePrice,topNewDevelopment
0,[86350566],[https://img3.idealista.com/blur/WEB_LISTING/0...,[97123],[34],[612900],[chalet],[sale],[334],[False],[4],[3],"[Calle Gonzalo Jiménez de Quesada, 1]",[Madrid],[Boadilla del Monte],[Sector B],[es],[40.4136],[-3.9162],[True],[https://www.idealista.com/obra-nueva/86350566/],[18029],[False],[newdevelopment],[True],[False],[1835],[True],[False],[False],[True],[True],[chalet],[semidetachedHouse],"[Sector B, Boadilla del Monte]",[Chalet pareado en Calle Gonzalo Jiménez de Qu...,,,,,
1,[88820218],[https://img3.idealista.com/blur/WEB_LISTING/0...,[POZ23040],[38],[1250000],[chalet],[sale],[396],[False],[5],[4],[barrio Monteclaro],[Madrid],[Pozuelo de Alarcón],[Urbanizaciones],[es],[40.4415],[-3.8361],[False],[https://www.idealista.com/inmueble/88820218/],[11572],[False],[good],[False],,[3157],[False],[False],[False],,,[chalet],,"[Monteclaro, Pozuelo de Alarcón]",[Chalet],[Monteclaro],,,,
2,[88536347],[https://img3.idealista.com/blur/WEB_LISTING/0...,[AS146617],[26],[289000],[flat],[sale],[44],[False],[1],[1],"[Calle de Pelayo, 26]",[Madrid],[Madrid],[Centro],[es],[40.4228],[-3.6984],[True],[https://www.idealista.com/inmueble/88536347/],[789],[True],[good],[False],,[6568],[True],[True],[False],,,[flat],,"[Chueca-Justicia, Madrid]","[Piso en Calle de Pelayo, 26]",[Chueca-Justicia],[3],[False],,
3,[88744592],[https://img3.idealista.com/blur/WEB_LISTING/0...,[CL147007],[57],[2900000],[chalet],[sale],[800],[False],[8],[7],[La Moraleja],[Madrid],[La Moraleja],[La Moraleja],[es],[40.5121],[-3.6235],[False],[https://www.idealista.com/inmueble/88744592/],[12572],[True],[good],[False],,[3625],[True],[True],[True],[True],[True],[chalet],[independantHouse],"[La Moraleja, La Moraleja]",[Casa independiente],,,,,
4,[87535127],[https://img3.idealista.com/blur/WEB_LISTING/0...,[AS143181],[26],[430000],[flat],[sale],[93],[True],[1],[2],"[Paseo del Prado, 14]",[Madrid],[Madrid],[Centro],[es],[40.4142],[-3.6944],[True],[https://www.idealista.com/inmueble/87535127/],[795],[True],[good],[False],,[4624],[True],[True],[False],,,[flat],,"[Huertas-Cortes, Madrid]","[Piso en Paseo del Prado, 14]",[Huertas-Cortes],[4],[True],,


In [6]:
# View all DataFrame column names
df.columns

Index(['propertyCode', 'thumbnail', 'externalReference', 'numPhotos', 'price',
       'propertyType', 'operation', 'size', 'exterior', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'district', 'country',
       'latitude', 'longitude', 'showAddress', 'url', 'distance', 'hasVideo',
       'status', 'newDevelopment', 'newDevelopmentFinished', 'priceByArea',
       'hasPlan', 'has3DTour', 'has360', 'parkingSpace.hasParkingSpace',
       'parkingSpace.isParkingSpaceIncludedInPrice', 'detailedType.typology',
       'detailedType.subTypology', 'suggestedTexts.subtitle',
       'suggestedTexts.title', 'neighborhood', 'floor', 'hasLift',
       'parkingSpace.parkingSpacePrice', 'topNewDevelopment'],
      dtype='object')

In [7]:
# See Data Types
df.dtypes

propertyCode                                  object
thumbnail                                     object
externalReference                             object
numPhotos                                     object
price                                         object
propertyType                                  object
operation                                     object
size                                          object
exterior                                      object
rooms                                         object
bathrooms                                     object
address                                       object
province                                      object
municipality                                  object
district                                      object
country                                       object
latitude                                      object
longitude                                     object
showAddress                                   

## Cleaning and transformations

In [8]:
# STRINGS
for column in df:
    # Convert to string
    df[column] = df[column].astype(str)
    # Remove unnecessary characters
    df[column] = df[column].str.lstrip("['")
    df[column] = df[column].str.rstrip("]'")

# INTEGERS
for column in df[['propertyCode', 'numPhotos', 'price', 'priceByArea', 'rooms', 'bathrooms', 'distance', 'size', 'parkingSpace.parkingSpacePrice']]:
    # Convert to integers
    df[column] = pd.to_numeric(df[column], errors='coerce')
    
# TRUE/FALSE
# Convert to Logical
d = {'True': True, 'False': False}
for column in df[['hasLift', 'exterior', 'parkingSpace.hasParkingSpace', 'newDevelopment', 'newDevelopmentFinished', 'topNewDevelopment', 'parkingSpace.isParkingSpaceIncludedInPrice']]:
  df[column] = df[column].map(d)
  # Replace NaNs with False (0)
  df[column] = df[column].fillna(0)

# Convert True/False to 1/0
for column in df[['hasLift', 'exterior', 'parkingSpace.hasParkingSpace', 'newDevelopment', 'newDevelopmentFinished', 'topNewDevelopment', 'parkingSpace.isParkingSpaceIncludedInPrice']]:
    # Convert TRUE (1) and FALSE (0) to integers
    #df[column] = (df[column] == True).astype(int)
    df[column] = np.where(df[column] == True, 1, 0)

# Categorical variables (Label Encoding) ?
#for column in df[['status']]:
    #df['column'] = df['column'].astype('category')

# Drop unnecessary columns
df.drop(['thumbnail', 'showAddress', 'numPhotos', 'url', 'hasPlan', 'has3DTour', 'has360', 'hasVideo', 'externalReference', 'province', 'operation', 'country', 'detailedType.typology', 'suggestedTexts.subtitle', 'suggestedTexts.title', 'detailedType.subTypology'], axis=1, inplace=True)

In [9]:
# See Data Types
df.dtypes

propertyCode                                    int64
price                                           int64
propertyType                                   object
size                                          float64
exterior                                        int64
rooms                                           int64
bathrooms                                       int64
address                                        object
municipality                                   object
district                                       object
latitude                                       object
longitude                                      object
distance                                        int64
status                                         object
newDevelopment                                  int64
newDevelopmentFinished                          int64
priceByArea                                     int64
parkingSpace.hasParkingSpace                    int64
parkingSpace.isParkingSpaceI

## Clean DataFrame

In [10]:
# Preview data
df.head()

Unnamed: 0,propertyCode,price,propertyType,size,exterior,rooms,bathrooms,address,municipality,district,latitude,longitude,distance,status,newDevelopment,newDevelopmentFinished,priceByArea,parkingSpace.hasParkingSpace,parkingSpace.isParkingSpaceIncludedInPrice,neighborhood,floor,hasLift,parkingSpace.parkingSpacePrice,topNewDevelopment
0,86350566,612900,chalet,334.0,0,4,3,"Calle Gonzalo Jiménez de Quesada, 1",Boadilla del Monte,Sector B,40.4136,-3.9162,18029,newdevelopment,1,0,1835,1,1,,,0,,0
1,88820218,1250000,chalet,396.0,0,5,4,barrio Monteclaro,Pozuelo de Alarcón,Urbanizaciones,40.4415,-3.8361,11572,good,0,0,3157,0,0,Monteclaro,,0,,0
2,88536347,289000,flat,44.0,0,1,1,"Calle de Pelayo, 26",Madrid,Centro,40.4228,-3.6984,789,good,0,0,6568,0,0,Chueca-Justicia,3.0,0,,0
3,88744592,2900000,chalet,800.0,0,8,7,La Moraleja,La Moraleja,La Moraleja,40.5121,-3.6235,12572,good,0,0,3625,1,1,,,0,,0
4,87535127,430000,flat,93.0,1,1,2,"Paseo del Prado, 14",Madrid,Centro,40.4142,-3.6944,795,good,0,0,4624,0,0,Huertas-Cortes,4.0,1,,0


In [11]:
# Total amount of unique Property Codes
total = len(df.index)
uniques = len(df['propertyCode'].unique())

print('There are {} unique properties out of {}.'.format(uniques, total))

There are 10820 unique properties out of 24850.


In [12]:
# Total number of duplicates
df.duplicated(subset = 'propertyCode', keep = 'first').sum()

14030

What is the difference between UNIQUE and DUPLICATES?

In [13]:
# Dropping ALL duplicte values
df.drop_duplicates(subset ='propertyCode', 
                     keep = 'first', inplace = True)

In [14]:
# Preview unique data
df.head()

Unnamed: 0,propertyCode,price,propertyType,size,exterior,rooms,bathrooms,address,municipality,district,latitude,longitude,distance,status,newDevelopment,newDevelopmentFinished,priceByArea,parkingSpace.hasParkingSpace,parkingSpace.isParkingSpaceIncludedInPrice,neighborhood,floor,hasLift,parkingSpace.parkingSpacePrice,topNewDevelopment
0,86350566,612900,chalet,334.0,0,4,3,"Calle Gonzalo Jiménez de Quesada, 1",Boadilla del Monte,Sector B,40.4136,-3.9162,18029,newdevelopment,1,0,1835,1,1,,,0,,0
1,88820218,1250000,chalet,396.0,0,5,4,barrio Monteclaro,Pozuelo de Alarcón,Urbanizaciones,40.4415,-3.8361,11572,good,0,0,3157,0,0,Monteclaro,,0,,0
2,88536347,289000,flat,44.0,0,1,1,"Calle de Pelayo, 26",Madrid,Centro,40.4228,-3.6984,789,good,0,0,6568,0,0,Chueca-Justicia,3.0,0,,0
3,88744592,2900000,chalet,800.0,0,8,7,La Moraleja,La Moraleja,La Moraleja,40.5121,-3.6235,12572,good,0,0,3625,1,1,,,0,,0
4,87535127,430000,flat,93.0,1,1,2,"Paseo del Prado, 14",Madrid,Centro,40.4142,-3.6944,795,good,0,0,4624,0,0,Huertas-Cortes,4.0,1,,0


In [15]:
# Total amount of unique Property Codes
total = len(df.index)

print('There are {} total unique records.'.format(total))

There are 10820 total unique records.


In [16]:
# Reset DataFrame Index
df.reset_index(drop=True, inplace=True)

In [17]:
######################################################
# Save cleaned DataFrame without Duplicates as a CSV #
#df.to_csv('ALL-JSON-FILES.csv')                     #
######################################################

## Analysis

In [18]:
# Value counts of column
df['floor'].value_counts()

1      2094
2      1775
3      1510
bj     1498
nan    1340
4      1107
5       556
6       285
7       179
en      146
8        85
9        61
ss       55
10       33
st       27
11       19
12       19
14       12
-1       11
13        3
15        2
17        1
16        1
-2        1
Name: floor, dtype: int64

In [19]:
df['status'].value_counts()

good              7972
renew             1740
nan                826
newdevelopment     282
Name: status, dtype: int64

In [20]:
dictionary = df['municipality'].value_counts().to_dict()
dictionary

{'Ajalvir': 1,
 'Alcalá de Henares': 137,
 'Alcobendas': 32,
 'Alcorcón': 92,
 'Algete': 22,
 'Arganda': 39,
 'Arroyomolinos': 33,
 'Batres': 2,
 'Boadilla del Monte': 57,
 'Brunete': 13,
 'Camarma de Esteruelas': 1,
 'Campo Real': 5,
 'Casarrubuelos': 5,
 'Chinchón': 1,
 'Ciempozuelos': 29,
 'Ciudalcampo': 15,
 'Cobeña': 3,
 'Colmenar Viejo': 36,
 'Colmenarejo': 3,
 'Coslada': 29,
 'Cubas de la Sagra': 14,
 'Daganzo de Arriba': 9,
 'Fuenlabrada': 145,
 'Fuente del Fresno': 2,
 'Fuente el Saz de Jarama': 8,
 'Galapagar': 10,
 'Getafe': 122,
 'Griñón': 12,
 'Hoyo de Manzanares': 7,
 'Humanes de Madrid': 44,
 'La Moraleja': 108,
 'Las Rozas de Madrid': 111,
 'Leganés': 191,
 'Loeches': 17,
 'Madrid': 8355,
 'Majadahonda': 56,
 'Mejorada del Campo': 32,
 'Moraleja de Enmedio': 6,
 'Morata de Tajuña': 1,
 'Móstoles': 116,
 'Navalcarnero': 29,
 'Paracuellos de Jarama': 9,
 'Parla': 112,
 'Pinto': 28,
 'Pozuelo de Alarcón': 175,
 'Quijorna': 9,
 'Rivas-Vaciamadrid': 39,
 'San Agustin de Guad

In [21]:
dictionary = df['neighborhood'].value_counts().to_dict()
dictionary

{'12 de Octubre-Orcasur': 16,
 'Abrantes': 93,
 'Acacias': 101,
 'Adelfas': 33,
 'Alameda de Osuna': 8,
 'Almagro': 73,
 'Almendrales': 66,
 'Alto de la Jabonería': 1,
 'Aluche': 85,
 'Ambroz': 32,
 'Amposta': 22,
 'Apóstol Santiago': 18,
 'Arapiles': 73,
 'Aravaca': 41,
 'Arcos': 56,
 'Argüelles': 84,
 'Arroyo del Fresno': 3,
 'Atalaya': 6,
 'Batallas': 16,
 'Bellas Vistas': 123,
 'Bernabéu-Hispanoamérica': 51,
 'Berruguete': 124,
 'Buena Vista': 69,
 'Butarque': 33,
 'Campamento': 20,
 'Campo de Tiro': 1,
 'Campo de las Naciones-Corralejos': 4,
 'Campodón - Ventorro del Cano': 3,
 'Canillas': 58,
 'Canillejas': 28,
 'Casa de Campo': 28,
 'Casco Antiguo': 60,
 'Casco Histórico de Barajas': 12,
 'Casco Histórico de Vallecas': 42,
 'Casco Histórico de Vicálvaro': 21,
 'Castellana': 59,
 'Castilla': 32,
 'Chopera': 121,
 'Chueca-Justicia': 137,
 'Ciudad Jardín': 35,
 'Ciudad Universitaria': 46,
 'Colina': 6,
 'Comillas': 128,
 'Concepción': 47,
 'Conde Orgaz-Piovera': 31,
 'Costillares':

In [22]:
df['parkingSpace.hasParkingSpace'].value_counts()

0    8307
1    2513
Name: parkingSpace.hasParkingSpace, dtype: int64

In [23]:
df['hasLift'].value_counts()

1    6114
0    4706
Name: hasLift, dtype: int64

In [24]:
df['newDevelopment'].value_counts()

0    10538
1      282
Name: newDevelopment, dtype: int64

In [25]:
# Replace specific values with something else
#df['floor'] = df['floor'].replace(['-1', 'ss', 'bj'], 'bajo')
#df['floor'] = df['floor'].replace(['1'], 'normal')
#...