In [166]:
"""
My database number
"""

name = "Mateusz"
surname = "Moskala"
database_number = (len(name) + len(surname)) % 6
print("My database number: {:d}".format(database_number))

My database number: 2


In [167]:
import pandas as pd
df = pd.read_csv("data_set/2/AB_NYC_2019.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [168]:
"""
Data fast description
"""

print("Columns:")
print(df.columns)
print("\nData description:")
print(df.describe())
print("\nHost description:")
print(df.host_name.describe())

print("\n Info")
print(df.info())

Columns:
Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

Data description:
                 id       host_id      latitude     longitude         price  \
count  4.889500e+04  4.889500e+04  48895.000000  48895.000000  48895.000000   
mean   1.901714e+07  6.762001e+07     40.728949    -73.952170    152.720687   
std    1.098311e+07  7.861097e+07      0.054530      0.046157    240.154170   
min    2.539000e+03  2.438000e+03     40.499790    -74.244420      0.000000   
25%    9.471945e+06  7.822033e+06     40.690100    -73.983070     69.000000   
50%    1.967728e+07  3.079382e+07     40.723070    -73.955680    106.000000   
75%    2.915218e+07  1.074344e+08     40.763115    -73.936275    175.000000   
max    3.648724e+07  2.74321

In [163]:
"""
Connection SQLAlchemy with database
"""

from sqlalchemy import create_engine

database_type = "postgres"
user = "postgres"
password = "database2020"
database_url = "localhost"
port = 5432
database_name = "lab4_5"

db_login = "{:s}://{:s}:{:s}@{:s}:{:d}/{:s}".format(database_type,
                                                    user,
                                                    password,
                                                    database_url,
                                                    port,
                                                    database_name)
engine = create_engine(db_login)

In [165]:
"""
Tables and database structure
"""

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Float, String, Date
from sqlalchemy import ForeignKey
from sqlalchemy import Sequence, CheckConstraint, UniqueConstraint

Base = declarative_base()

class AirBNB(Base):
    __tablename__ = "airBNB"
    __table_args__ = (UniqueConstraint('name'),
                      CheckConstraint('price >= 0'),
                      CheckConstraint('minimum_nights >= 0'),
                      CheckConstraint('availability_365 >= 0'))
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    host_id = Column(Integer, ForeignKey("host.id"))
    latitude = Column(Float)
    longitude = Column(Float)
    district_id = Column(Integer, ForeignKey("district.id"))
    room_type_id = Column(Integer, ForeignKey("room.id"))
    price = Column(Integer)
    minimum_nights = Column(Integer)
    number_of_reviews = Column(Integer)
    last_review = Column(Date)
    availability_365 = Column(Integer)

class Room(Base):
    __tablename__ = 'room'
    __table_args__ = (CheckConstraint('length(name) > 0'),
                      UniqueConstraint('name'))
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    
class Host(Base):
    __tablename__ = "host"
    __table_args__ = (CheckConstraint('length(name) > 0'),
                      UniqueConstraint('name'),
                      CheckConstraint('number_of_properties > 0'))
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    number_of_properties = Column(Integer)
    
class Province(Base):
    __tablename__ = "province"
    __table_args__ = (CheckConstraint('length(name) > 0'),
                      UniqueConstraint('name'))
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    
class District(Base):
    __tablename__ = "district"
    __table_args__ = (CheckConstraint('length(name) > 0'),
                      UniqueConstraint('name'))
    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    province_id = Column(Integer, ForeignKey("province.id"))

Base.metadata.create_all(engine)
engine.table_names()

['province', 'district', 'host', 'airBNB', 'room']

In [169]:
"""
Unique values
"""
room_type_unique = df.room_type.unique()
host_id_unique = df.host_id.unique()
province_unique = df.neighbourhood_group.unique()
district_unique = df.neighbourhood.unique()

province_unique

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [None]:
"""
Mapping value
"""
# dicionary_corect = {'US':'USA', 'USA':'USA', ' United States of America':'USA', 'America':'USA', 'Poland':'POL', 'PL':'POL', 'Polska':'POL' }
# mapping_country = data['country'].map(dicionary_corect)
# data['country'] = mapping_country

In [98]:
"""
Supplementing data
"""
for district in district_unique:
    province = df[(df['neighbourhood']==district) & (~df['neighbourhood_group'].isna())]['neighbourhood_group'].unique()
    if len(province) == 1:
        df.loc[(df['neighbourhood']==district) & (df['neighbourhood_group'].isna()), 'neighbourhood_group'] = province
    else:
        print('province data mismatch on the context of {0}'.format(district))
    

In [139]:
for host_id in host_id_unique:
    host_name = df[(df['host_id']==host_id) & (~df['host_name'].isna())]['host_name'].unique()
    if len(host_name) == 1:
        df.loc[(df['host_id'] == host_id) & (df['host_name'].isna()), 'host_name'] = host_name
    else:
        print('host_id {0} data mismatch on the context of {1}'.format(host_id, host_name))


In [170]:
"""
Split data to tables
"""
province_list = pd.DataFrame(province_unique, columns=['name'])
province_list.index.name = 'id'
province_list

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
0,Brooklyn
1,Manhattan
2,Queens
3,Staten Island
4,Bronx


In [171]:
district_list = df[['neighbourhood', 'neighbourhood_group']].drop_duplicates()
district_list = district_list.reset_index().drop(columns = ['index'])
district_list.index.name = 'id'
district_list = district_list.rename(columns = {'neighbourhood_group':'province_id',
                                                'neighbourhood':      'name'})

district_list['province_id'] = neighbourhood_list['province_id'].map(lambda x:  province_list[province_list['name'] == x].index.values.astype(int)[0])
district_list

Unnamed: 0_level_0,name,province_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Kensington,0
1,Midtown,1
2,Harlem,1
3,Clinton Hill,0
4,East Harlem,1
...,...,...
216,Bull's Head,3
217,New Dorp,3
218,Rossville,3
219,Breezy Point,2


In [172]:
room_list = pd.DataFrame(room_type_unique, columns=['name'])
room_list.index.name = 'id'
room_list

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
0,Private room
1,Entire home/apt
2,Shared room


In [173]:
host_list = df[['host_id', 'host_name', 'calculated_host_listings_count']].drop_duplicates()
host_list = host_list.reset_index().drop(columns = ['index'])
host_list.index.name = 'id'
host_list = host_list.rename(columns = {'host_name':'name',
                                        'calculated_host_listings_count':'number_of_properties'})
host_list


Unnamed: 0_level_0,host_id,name,number_of_properties
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2787,John,6
1,2845,Jennifer,2
2,4632,Elisabeth,1
3,4869,LisaRoxanne,1
4,7192,Laura,1
...,...,...,...
37452,274307600,Jonathan,1
37453,274311461,Scott,1
37454,274321313,Kat,1
37455,23492952,Ilgar & Aysel,1


In [174]:
airbnb_list = df[['name', 'host_id', 'latitude', 'longitude', 'neighbourhood', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'availability_365']].drop_duplicates() 

# Host id
airbnb_list['host_id'] = airbnb_list['host_id'].map(lambda x:  host_list[host_list['host_id'] == x].index.values.astype(int)[0])
host_list.drop(columns=['host_id'])

# District id
airbnb_list['neighbourhood'] = airbnb_list['neighbourhood'].map(lambda x: district_list[district_list['name']==x].index.values.astype(int)[0])
airbnb_list = airbnb_list.rename(columns = {'neighbourhood':'district_id'})

# Room type id
airbnb_list['room_type'] = airbnb_list['room_type'].map(lambda x: room_list[room_list['name']==x].index.values.astype(int)[0])
airbnb_list = airbnb_list.rename(columns = {'room_type':'room_type_id'})


In [176]:
# Load data to postgresql database
airbnb_list.to_sql(  'airBNB',   engine, if_exists='replace')
host_list.to_sql(    'host',     engine, if_exists='replace')
room_list.to_sql(    'room',     engine, if_exists='replace')
district_list.to_sql('district', engine, if_exists='replace')
province_list.to_sql('province', engine, if_exists='replace')

In [177]:
# Check data in database
engine.table_names()

['host', 'province', 'airBNB', 'room', 'district']

In [181]:
from sqlalchemy import MetaData, Table

metadata = MetaData()
for column_name in engine.table_names():
    table = Table(column_name, metadata, autoload=True, autoload_with=engine)
    print("\nColumn name: {:s}".format(column_name))
    print(table.columns.keys())


Column name: host
['id', 'host_id', 'name', 'number_of_properties']

Column name: province
['id', 'name']

Column name: airBNB
['index', 'name', 'host_id', 'latitude', 'longitude', 'district_id', 'room_type_id', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'availability_365']

Column name: room
['id', 'name']

Column name: district
['id', 'name', 'province_id']


In [185]:
result = engine.execute("select * from province").fetchall()
for item in result:
    print(item)

(0, 'Brooklyn')
(1, 'Manhattan')
(2, 'Queens')
(3, 'Staten Island')
(4, 'Bronx')
