### **ETL** - *extract tranform load*

In [19]:
import os
import pandas as pd

In [20]:
DATA_FOLDER = ''
BIKE_RENTALS_FILE = 'tripdata_connected.csv'
WEATHER_FILE = 'weather_data.csv'

bike_rentals_df = pd.read_csv(os.path.join('..', DATA_FOLDER, BIKE_RENTALS_FILE))
weather_df = pd.read_csv(os.path.join('..', DATA_FOLDER, WEATHER_FILE),
                         encoding="Windows-1250", sep=";")

In [21]:
bike_rentals_df.head()

Unnamed: 0,Ride_Id,Rideable_Type,Started_At,Ended_At,Start_Station_Name,Start_Station_Id,End_Station_Name,End_Station_Id,Start_Lat,Start_Lng,End_Lat,End_Lng,Member_Casual,Ride_Length,Day_Of_The_Week,Day
0,0BE9C131A5705D92,classic_bike,10/1/2021 0:00,10/1/2021 0:05,Damen Ave & Cortland St,13133,Winchester Ave & Elston Ave,KA1504000140,41.915983,-87.677335,41.924091,-87.67646,casual,0:05:00,6,Friday
1,ABA2BDC3595FC3E9,classic_bike,10/1/2021 0:00,10/1/2021 0:10,Morgan St & Lake St,TA1306000015,Noble St & Milwaukee Ave,13290,41.885483,-87.652305,41.90068,-87.6626,casual,0:10:00,6,Friday
2,74483AC18C8C6B90,classic_bike,10/1/2021 0:00,10/1/2021 0:08,Halsted St & Roscoe St,TA1309000025,Greenview Ave & Diversey Pkwy,13294,41.94367,-87.64895,41.93259,-87.665936,casual,0:08:00,6,Friday
3,A7711BCB74523614,electric_bike,10/1/2021 0:02,10/1/2021 0:03,,,,,41.93,-87.7,41.93,-87.7,casual,0:01:00,6,Friday
4,B0B9EB7622461EF4,classic_bike,10/1/2021 0:02,10/1/2021 0:17,MLK Jr Dr & 29th St,TA1307000139,Clinton St & Roosevelt Rd,WL-008,41.842052,-87.617,41.867118,-87.641088,member,0:15:00,6,Friday


### **czas**

In [22]:
df_time = bike_rentals_df[['Ride_Id', 'Ride_Length', 'Started_At', 'Ended_At', 'Day']].copy()

df_time['Started_At'] = pd.to_datetime(df_time['Started_At'], format='%m/%d/%Y %H:%M')
df_time['Ended_At'] = pd.to_datetime(df_time['Ended_At'], format='%m/%d/%Y %H:%M')
df_time['Date'] = df_time['Started_At'].dt.date
df_time['Duration'] = (df_time['Ended_At'] - df_time['Started_At']).dt.total_seconds() // 60
df_time['Duration'].astype(int)

df_time = df_time[['Ride_Id', 'Date', 'Duration', 'Day']].copy()


df_time.head()

Unnamed: 0,Ride_Id,Date,Duration,Day
0,0BE9C131A5705D92,2021-10-01,5.0,Friday
1,ABA2BDC3595FC3E9,2021-10-01,10.0,Friday
2,74483AC18C8C6B90,2021-10-01,8.0,Friday
3,A7711BCB74523614,2021-10-01,1.0,Friday
4,B0B9EB7622461EF4,2021-10-01,15.0,Friday


### **lokalizacja**

In [23]:
df_location = bike_rentals_df[['Ride_Id', 'Start_Station_Id', 'End_Station_Id']].copy()
df_stations = bike_rentals_df[['Start_Station_Id', 'Start_Station_Name', 'Start_Lat', 'Start_Lng']].copy()
len(df_stations['Start_Station_Id'].unique()), len(df_stations)

(1279, 5138713)

In [24]:
df_stations = df_stations.drop_duplicates(subset=['Start_Station_Id'])
df_stations.head()

Unnamed: 0,Start_Station_Id,Start_Station_Name,Start_Lat,Start_Lng
0,13133,Damen Ave & Cortland St,41.915983,-87.677335
1,TA1306000015,Morgan St & Lake St,41.885483,-87.652305
2,TA1309000025,Halsted St & Roscoe St,41.94367,-87.64895
3,,,41.93,-87.7
4,TA1307000139,MLK Jr Dr & 29th St,41.842052,-87.617


### **typy**

In [25]:
df_types = bike_rentals_df[['Ride_Id', 'Rideable_Type', 'Member_Casual']].copy()
df_types.head()

Unnamed: 0,Ride_Id,Rideable_Type,Member_Casual
0,0BE9C131A5705D92,classic_bike,casual
1,ABA2BDC3595FC3E9,classic_bike,casual
2,74483AC18C8C6B90,classic_bike,casual
3,A7711BCB74523614,electric_bike,casual
4,B0B9EB7622461EF4,classic_bike,member


### **pogoda**

In [26]:
weather_df.head()

Unnamed: 0,time,temperature_2m (°C),precipitation (mm),cloudcover (%),windspeed_10m (km/h),latitude,longitude,elevation,utc_offset_seconds,timezone,timezone_abbreviation
0,2021-10-01T00:00,193,0,23.0,94,41800003,-876,179.0,-18000.0,America/Chicago,CDT
1,2021-10-01T01:00,188,0,14.0,99,41800003,-876,179.0,-18000.0,America/Chicago,CDT
2,2021-10-01T02:00,185,0,4.0,96,41800003,-876,179.0,-18000.0,America/Chicago,CDT
3,2021-10-01T03:00,181,0,0.0,97,41800003,-876,179.0,-18000.0,America/Chicago,CDT
4,2021-10-01T04:00,178,0,0.0,10,41800003,-876,179.0,-18000.0,America/Chicago,CDT


In [31]:
df_weather = pd.DataFrame()
weather_cols = ['Time', 'Temperature', 'Precipitation', 'Wind_Speed', 'Latitude', 'Longitude']

df_weather[weather_cols] = weather_df[[
    'time', 'temperature_2m (°C)', 'precipitation (mm)', 'windspeed_10m (km/h)', 'latitude', 'longitude'
    ]].copy()

df_weather['Time'] = pd.to_datetime(df_weather['Time'], format='%Y-%m-%d %H:%M:%S')

float_cols = ['Temperature', 'Precipitation', 'Wind_Speed', 'Latitude', 'Longitude']
df_weather[float_cols] = df_weather[float_cols].replace(',', '.', regex=True).astype(float)

df_weather['Date'] = df_weather['Time'].dt.date
df_weather = df_weather[['Date', 'Temperature', 'Precipitation', 'Wind_Speed', 'Latitude', 'Longitude']].copy()
df_weather.head(8)

ValueError: time data "2021-10-01T00:00" doesn't match format "%Y-%m-%d %H:%M:%S", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [28]:
def remap_perciption(perciption):
    if perciption == 0:
        return 'No'
    elif perciption < 2.5:
        return 'Light'
    elif perciption < 7.6:
        return 'Moderate'
    elif perciption < 50.8:
        return 'Heavy'
    else:
        return 'Violent'
    
df_weather['Precipitation'] = df_weather['Precipitation'].apply(remap_perciption)
df_weather.head()

TypeError: '<' not supported between instances of 'str' and 'float'

In [None]:
def remap_wind(wind):
    if wind < 1.6:
        return 'Calm'
    elif wind < 5.5:
        return 'Light'
    elif wind < 11.1:
        return 'Moderate'
    elif wind < 19.8:
        return 'Fresh'
    elif wind < 28.5:
        return 'Strong'
    elif wind < 38.9:
        return 'Near Gale'
    elif wind < 49.6:
        return 'Gale'
    elif wind < 61.2:
        return 'Strong Gale'
    elif wind < 74.2:
        return 'Storm'
    else:
        return 'Violent Storm'

df_weather['Wind_Speed'] = df_weather['Wind_Speed'].apply(remap_wind)
df_weather.head()

In [None]:
df_weather['Wind_Speed'].value_counts()

In [None]:
def remap_temperature(temperature):
    if temperature < -20:
        return 'Extreme Cold'
    elif temperature < -10:
        return 'Very Cold'
    elif temperature < 0:
        return 'Cold'
    elif temperature < 10:
        return 'Cool'
    elif temperature < 20:
        return 'Mild'
    elif temperature < 30:
        return 'Warm'
    elif temperature < 40:
        return 'Hot'
    else:
        return 'Very Hot'

df_weather['Temperature'] = df_weather['Temperature'].apply(remap_temperature)
df_weather.head()

In [30]:
df_weather['Temperature'].value_counts()

Temperature
21,1     47
22       46
21,6     46
21,4     46
20,9     45
         ..
-14,7     1
30,4      1
-14,9     1
-15,7     1
30,5      1
Name: count, Length: 471, dtype: int64

In [None]:
def remap_cloudcover(cloud_cover):
    if cloud_cover == 0:
        return 'No'
    elif cloud_cover > 0 and cloud_cover < 10:
        return 'Light'
    elif cloud_cover >=10 and cloud_cover < 30:
        return 'Moderate'
    elif cloud_cover >=30 and cloud_cover < 70:
        return 'Heavy'
    else:
        return 'Very Heavy'


df_weather['cloudcover (%)'] = df_weather['cloudcover (%)'].apply(remap_cloudcover)
df_weather.head()