In [27]:
import pandas as pd
from time import time
pd.__version__

'2.0.3'

In [2]:
df = pd.read_csv('https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz', nrows=100)

In [5]:
# Преобразуйте столбцы с датами и временем в формат datetime
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

In [8]:
from sqlalchemy import create_engine

In [9]:
engine = create_engine('postgresql://root:root@localhost:5432/my_taxi')

In [19]:
print(pd.io.sql.get_schema(df, name='green_taxi_data', con=engine))


CREATE TABLE green_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)


In [16]:
df_iter = pd.read_csv('https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz', iterator=True, chunksize=100000)
df = next(df_iter)
len(df)

100000

In [17]:
# Преобразуйте столбцы с датами и временем в формат datetime
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   VendorID               100000 non-null  int64         
 1   lpep_pickup_datetime   100000 non-null  datetime64[ns]
 2   lpep_dropoff_datetime  100000 non-null  datetime64[ns]
 3   store_and_fwd_flag     100000 non-null  object        
 4   RatecodeID             100000 non-null  int64         
 5   PULocationID           100000 non-null  int64         
 6   DOLocationID           100000 non-null  int64         
 7   passenger_count        100000 non-null  int64         
 8   trip_distance          100000 non-null  float64       
 9   fare_amount            100000 non-null  float64       
 10  extra                  100000 non-null  float64       
 11  mta_tax                100000 non-null  float64       
 12  tip_amount             100000 non-null  float

In [23]:
df.head(n=0).to_sql(name='green_taxi_data', con=engine, if_exists='replace')

0

In [24]:
%time df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

CPU times: total: 5.34 s
Wall time: 29.2 s


1000

In [35]:
while True:
    t_start = time()

    df = next(df_iter)

    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk, took %.3f second' % (t_end - t_start))

inserted another chunk, took 24.534 second


  df = next(df_iter)


inserted another chunk, took 24.723 second
inserted another chunk, took 11.639 second


StopIteration: 

In [11]:
df_zones = pd.read_csv('https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv')

In [12]:
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [13]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [0]:
# Сколько всего поездок на такси было совершено 18 сентября 2019 года?

In [40]:
query = """
SELECT COUNT(*) FROM green_taxi_data
WHERE DATE(lpep_pickup_datetime) = '2019-09-18'
AND DATE(lpep_dropoff_datetime) = '2019-09-18';
"""

pd.read_sql(query, con=engine)

Unnamed: 0,count
0,15612


In [ ]:
# В какой день была наибольшая дальность поездки. Используйте время посадки для своих расчетов.

In [43]:
query = """
SELECT
DATE(lpep_pickup_datetime) AS pickup_day,
MAX(trip_distance) AS total_distance
FROM
green_taxi_data
GROUP BY
DATE(lpep_pickup_datetime)
ORDER BY
total_distance DESC
LIMIT 1;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,pickup_day,total_distance
0,2019-09-26,341.64


In [ ]:
# Вопрос 5. Количество пассажиров 
# Учитывайте lpep_pickup_datetime в «2019-09-18», и игнорирование «Город» 'Unknown'.
# В каких трех округах сумма total_amount превышала 50 000?

In [44]:
query = """
SELECT 
	zpu."Borough",
	SUM(t.total_amount)
FROM 
	green_taxi_data t JOIN zones zpu 
		ON t."PULocationID"= zpu."LocationID"
WHERE 
	CAST(lpep_pickup_datetime AS DATE)='2019-09-18' AND
	zpu."Borough"!='Unknown'
GROUP BY
	zpu."Borough"
HAVING
	SUM(t.total_amount)>50000;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,Borough,sum
0,Brooklyn,96333.24
1,Manhattan,92271.3
2,Queens,78671.71


In [ ]:
# Вопрос 6. Самые крупные чаевые
# Для пассажиров, забранных в сентябре 2019 года в зоне под названием Астория,
# в какой зоне высадки были самые большие чаевые? Нам нужно имя зоны, а не идентификатор.
# Примечание: это не опечатка, это чаевые, а не трип.

In [48]:
query = """
SELECT 
	zdo."Zone",
	MAX(t.tip_amount)
FROM 
	green_taxi_data t JOIN zones zpu 
		ON t."PULocationID"= zpu."LocationID"
	JOIN zones zdo ON t."DOLocationID"= zdo."LocationID"
WHERE 
	TO_CHAR(lpep_pickup_datetime, 'YYYY')='2019' AND
	TO_CHAR(lpep_pickup_datetime, 'MM')='09' AND
	zpu."Zone"='Astoria'
GROUP BY
	zdo."Zone"
ORDER BY 
	MAX(t.tip_amount) DESC
LIMIT 3;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,Zone,max
0,JFK Airport,62.31
1,Kips Bay,28.0
2,NV,25.0


In [26]:

# Выберите поездки, которые начались и закончились 18 сентября 2019 года
trips_on_sept_18 = df[(df['lpep_pickup_datetime'].dt.date == pd.to_datetime('2019-09-18').date()) & (df['lpep_dropoff_datetime'].dt.date == pd.to_datetime('2019-09-18').date())]

# Подсчитайте количество таких поездок
num_trips = len(trips_on_sept_18)

num_trips


15612

В какой день была наибольшая дальность поездки. Используйте время посадки для своих расчетов.

In [39]:
max_trip_day = df['trip_distance'].idxmax()
df.loc[max_trip_day, 'lpep_pickup_datetime'].date()

datetime.date(2019, 9, 26)