# Data Prep

## Importing

In [2]:
import gzip
import pandas as pd 

# Unzip zipped file
with gzip.open('Raw_Data/listings.csv.gz', 'rb') as listings:
 # Pass the unziped file to pandas
   dfRaw = pd.read_csv(listings)

Used [MyGeodata](https://mygeodata.cloud/) to convert the kml downloaded from [Google Maps](https://www.google.com/maps/d/u/0/viewer?ie=UTF8&oe=UTF8&dg=feature&msa=0&mid=1Uq7DL2Qt8S3jMWCtzhLhv34YZ84&ll=38.01603792916025%2C23.79033106347657&z=10) to the resulted file `Transportation.csv`.

In [3]:
dfTransport = pd.read_csv('Data_Prep\Transportation_Metro_Tram.csv')
dfTransport = dfTransport[dfTransport['geometry/type'] != 'LineString']
dfTransport.drop(columns=['type','properties/description', 'geometry/type', 'properties/tessellate'], inplace = True)
dfTransport.rename(columns={'properties/Name': 'Name', 'geometry/coordinates/0' : 'longitude', 'geometry/coordinates/1' : 'latitude' }, inplace=True)
dfTransport.dropna(how='all', axis=1, inplace=True)
dfTransport.reset_index(inplace=True)
dfTransport.drop(columns='index', inplace=True)
dfTransport.head()

Unnamed: 0,Name,longitude,latitude
0,Neos Kosmos (Νέος Κόσμος‎),23.727947,37.957471
1,Faliro (Φάληρο) / S.E.F. (Σ.Ε.Φ.),23.664551,37.944198
2,Piraeus (Πειραιάς),23.639188,37.947616
3,"International Airport ""Eleftherios Venizelos"" ...",23.952599,37.940542
4,Larissa Station,23.720652,37.992341


Used [MyGeodata](https://mygeodata.cloud/) to convert the kml downloaded from [Google Maps](https://www.google.com/maps/d/u/0/viewer?ie=UTF8&t=h&oe=UTF8&msa=0&mid=1oEiURG0UyGJBnMErK3DTtwzsvJo&ll=38.02091428513228%2C23.7600215&z=13) to the resulted file `Attractions.csv`.dfRaw.columns

In [4]:
dfAttractions = pd.read_csv('Data_Prep\Attractions.csv')
#dfAttractions.drop(columns=['description', 'gid', 'tessellate'], inplace = True)

dfAttractions.head()

Unnamed: 0,longitude,latitude,Location
0,23.73567,37.975989,Sytagma_Square
1,23.733963,37.97567,Ermou_Street
2,23.729435,37.982571,Stadiou_Avenue
3,23.73053,37.98318,Panepistimiou_Eleftheriou_Venizelou_Avenue
4,23.743306,37.981885,Lycabetttus_Hill


We will use these dataframes to create new features based on the distance of these locations from the apartments

In [5]:
appartmentLocations = dfRaw[['id', 'longitude', 'latitude']].copy()
appartmentLocations

Unnamed: 0,id,longitude,latitude
0,10595,23.765270,37.988630
1,10990,23.764480,37.989030
2,10993,23.764730,37.988880
3,10995,23.764480,37.989030
4,27262,23.765000,37.989240
...,...,...,...
9577,52959003,23.728438,37.976986
9578,52959885,23.731117,37.955988
9579,52959925,23.723520,37.985283
9580,52960132,23.730460,37.987990


Used [MyGeodata](https://mygeodata.cloud/) to convert the kml downloaded from [Google Maps](https://www.google.com/maps/d/u/0/viewer?ie=UTF8&t=h&oe=UTF8&msa=0&mid=1oEiURG0UyGJBnMErK3DTtwzsvJo&ll=38.02091428513228%2C23.7600215&z=13) to the resulted file `Attractions.csv`.

In [6]:
# Distance between two points of erath function
from math import radians, cos, sin, asin, sqrt 
def map_distance(lat1, lat2, lon1, lon2):
  
    # The math module contains a function named
    # radians which converts from degrees to radians.
    lon1 = radians(lon1)
    lon2 = radians(lon2)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
      
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
 
    c = 2 * asin(sqrt(a))
    
    # Radius of earth in kilometers
    r = 6371
      
    # calculate the result
    return(c * r)

# Min distance from station function
def nearest_transport_station_distance(row, stationsDf):
   global distances
   distances = []
   for j in range(len(stationsDf)):
      rowLat = row['latitude']
      rowLon = row['longitude']
      station = stationsDf.loc[j]
      stationLat = station['latitude']
      stationLon = station['longitude']
      distance = map_distance(rowLat, stationLat, rowLon, stationLon)
      distances.append(distance)
      min_dist = min(distances)
      distances.clear()
   return min_dist

In [7]:
# Get the distance from the nearest transport station in the column 'nearest_station_distance'
appartmentLocations['nearest_station_distance'] = 100000

for i in appartmentLocations.index:

   appartmentLocations.at[i, 'nearest_station_distance'] = nearest_transport_station_distance(appartmentLocations.loc[i], dfTransport)

In [8]:
# Create one column per Unique location and find it's distance
for i in dfAttractions.Location.index:
   name =  dfAttractions.loc[i].Location
   for row in appartmentLocations.index:
      appartmentLocations.at[row, 'distance_from_' + name] = map_distance(appartmentLocations.loc[row]['latitude'], dfAttractions.loc[i]['latitude'], appartmentLocations.loc[row]['longitude'], dfAttractions.loc[i]['longitude'])

In [9]:
appartmentLocations

Unnamed: 0,id,longitude,latitude,nearest_station_distance,distance_from_Sytagma_Square,distance_from_Ermou_Street,distance_from_Stadiou_Avenue,distance_from_Panepistimiou_Eleftheriou_Venizelou_Avenue,distance_from_Lycabetttus_Hill,distance_from_The_Parliament,...,distance_from_Panathenaic_Stadium,distance_from_Dromeas,distance_from_Mitropoleos_Square,distance_from_Vasilissis_Sofias_Avenue,distance_from_Benaki_Museum,distance_from_Calatrava's_pedestrian_bridge,distance_from_Eleftheria_Park,distance_from_Church_of_Kapnikarea,distance_from_Olympic_Athletic_Center_of_Athens,distance_from_Kifisia
0,10595,23.765270,37.988630,8,2.950583,3.099286,3.212032,3.104327,2.065886,2.886630,...,3.080872,1.923042,3.509716,2.924393,2.590226,1.095094,1.434913,3.494269,5.623390,10.159793
1,10990,23.764480,37.989030,8,2.911781,3.059500,3.154191,3.045635,2.018615,2.851311,...,3.063183,1.905525,3.466668,2.885518,2.557047,1.136339,1.413766,3.448477,5.609524,10.144744
2,10993,23.764730,37.988880,8,2.922588,3.070662,3.171802,3.063551,2.032335,2.860895,...,3.066708,1.908848,3.478999,2.896348,2.565860,1.123964,1.418203,3.461777,5.616179,10.151847
3,10995,23.764480,37.989030,8,2.911781,3.059500,3.154191,3.045635,2.018615,2.851311,...,3.063183,1.905525,3.466668,2.885518,2.557047,1.136339,1.413766,3.448477,5.609524,10.144744
4,27262,23.765000,37.989240,8,2.962929,3.110677,3.203904,3.095166,2.069701,2.902287,...,3.111787,1.953992,3.517868,2.936668,2.607854,1.085159,1.462844,3.499608,5.570072,10.105839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9577,52959003,23.728438,37.976986,11,0.643504,0.505889,0.627147,0.712735,1.412413,0.782680,...,1.325538,1.840482,0.204908,0.662281,1.052970,4.566572,2.171518,0.066632,8.432761,12.815280
9578,52959885,23.731117,37.955988,12,2.259593,2.202765,2.959619,3.024100,3.071509,2.200178,...,1.678061,2.803968,2.144485,2.278565,2.374849,5.738352,3.299330,2.280130,10.216862,14.713951
9579,52959925,23.723520,37.985283,10,1.483933,1.407255,0.599749,0.657384,1.774798,1.633147,...,2.289012,2.465157,1.220429,1.488882,1.800048,4.703864,2.634277,1.082309,8.051185,12.310227
9580,52960132,23.730460,37.987990,10,1.410413,1.403904,0.609225,0.534883,1.314660,1.535094,...,2.237397,2.086085,1.427370,1.403074,1.587266,4.053012,2.147154,1.300370,7.406368,11.706299


In [15]:
dfRaw.dtypes

id                                                int64
listing_url                                      object
scrape_id                                         int64
last_scraped                                     object
name                                             object
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 74, dtype: object