In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Load the raw external dataset from the project's data folder.
df = pd.read_csv(filepath_or_buffer="data/external_data.csv")

In [3]:
# Inspect column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 59 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   numer_sta  3322 non-null   int64  
 1   date       3322 non-null   object 
 2   pmer       3322 non-null   int64  
 3   tend       3322 non-null   int64  
 4   cod_tend   3322 non-null   int64  
 5   dd         3322 non-null   int64  
 6   ff         3322 non-null   float64
 7   t          3322 non-null   float64
 8   td         3322 non-null   float64
 9   u          3322 non-null   int64  
 10  vv         3322 non-null   int64  
 11  ww         3322 non-null   int64  
 12  w1         3315 non-null   float64
 13  w2         3312 non-null   float64
 14  n          3166 non-null   float64
 15  nbas       3317 non-null   float64
 16  hbas       2869 non-null   float64
 17  cl         2909 non-null   float64
 18  cm         1941 non-null   float64
 19  ch         1678 non-null   float64
 20  pres    

Relevant Features for Bike Count Prediction:

1. Temperature (t): People are more likely to bike in comfortable temperatures.
2. Precipitation (rr1, rr3, rr6 etc.): Rain or snow can deter biking.
3. Wind Speed (ff, raf10, rafper): Strong winds might affect biking.
4. Humidity (u): High humidity can make conditions uncomfortable for biking.
5. Visibility (vv): Poor visibility could reduce biking for safety reasons.
6. Cloud Cover (n, cl, cm, ch): May indirectly affect biking decisions.
7. Date and Time: To align with hourly bike count data.

In [4]:
# Parse the timestamp strings into datetimes so the `.dt` accessor can be used.
df["date"] = pd.to_datetime(df["date"])

# Columns kept for bike-count prediction, grouped by theme.
# The resulting order is the column order of `df_relevant`.
relevant_columns = (
    ["date"]                        # timestamp
    + ["t"]                         # temperature
    + ["rr1", "rr3", "rr6"]         # precipitation
    + ["ff", "raf10", "rafper"]     # wind speed
    + ["u", "vv"]                   # humidity, visibility
    + ["n", "cl", "cm", "ch"]       # cloud cover
)

# Work on an independent copy so later edits never touch the raw frame.
df_relevant = df.loc[:, relevant_columns].copy()

In [5]:
# Calendar date of each observation (time of day dropped); used as the
# grouping key for the per-day imputation.
df_relevant = df_relevant.assign(day=df_relevant["date"].dt.date)

Fill NaN values in the selected weather columns with that column's mean over the same calendar day.

In [6]:
# Impute missing values with the mean of the same calendar day.
# `date` is skipped: it is the timestamp that `day` is derived from, not a
# measurement to average.
value_columns = [col for col in relevant_columns if col != "date"]

# One grouped pass yields the per-day mean of every column, aligned row by row.
daily_means = df_relevant.groupby("day")[value_columns].transform("mean")

for col in value_columns:
    # Assign the result back instead of calling `fillna(..., inplace=True)` on
    # `df_relevant[col]`: the in-place form operates on an intermediate object
    # (chained assignment), which pandas deprecates and which has no effect on
    # `df_relevant` under Copy-on-Write.
    df_relevant[col] = df_relevant[col].fillna(daily_means[col])

In [7]:
# Preview the first rows after the per-day imputation.
df_relevant.head()

Unnamed: 0,date,t,rr1,rr3,rr6,ff,raf10,rafper,u,vv,n,cl,cm,ch,day
0,2021-01-01 00:00:00,272.75,0.0,0.0,0.0,1.8,2.5,2.5,96,990,10.0,35.0,20.0,10.0,2021-01-01
1,2021-01-01 03:00:00,271.25,0.0,0.0,0.0,1.7,2.2,2.2,98,210,25.0,35.0,23.0,10.0,2021-01-01
2,2021-01-01 06:00:00,271.95,0.0,0.0,0.0,2.6,3.2,3.2,98,3660,90.0,35.0,27.0,10.0,2021-01-01
3,2021-01-01 09:00:00,272.45,0.0,0.2,0.2,1.7,2.3,2.3,97,3500,50.0,35.0,23.0,10.0,2021-01-01
4,2021-01-01 12:00:00,276.95,0.0,0.0,0.2,1.0,2.5,4.4,82,8000,90.0,38.0,23.2,10.0,2021-01-01


In [8]:
# Check which columns still contain NaN after the per-day fill.
df_relevant.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3322 non-null   datetime64[ns]
 1   t       3322 non-null   float64       
 2   rr1     3322 non-null   float64       
 3   rr3     3322 non-null   float64       
 4   rr6     3322 non-null   float64       
 5   ff      3322 non-null   float64       
 6   raf10   3322 non-null   float64       
 7   rafper  3322 non-null   float64       
 8   u       3322 non-null   int64         
 9   vv      3322 non-null   int64         
 10  n       3322 non-null   float64       
 11  cl      3322 non-null   float64       
 12  cm      3083 non-null   float64       
 13  ch      2947 non-null   float64       
 14  day     3322 non-null   object        
dtypes: datetime64[ns](1), float64(11), int64(2), object(1)
memory usage: 389.4+ KB


In [9]:
# ISO week number of each observation; grouping key for the weekly fallback
# imputation. The value carries no year, so the same week number in two
# different years maps to the same key.
df_relevant["week"] = df_relevant["date"].dt.isocalendar()["week"]

In [10]:
# Fill the gaps left by the per-day pass with the mean of the same ISO week.
# Only the mean is used (no mode), so imputed values may be fractional.
# A column with no observation at all in a given week stays NaN there.
# NOTE(review): `week` has no year component, so equal week numbers from
# different years are pooled — confirm this is intended.
value_columns = [col for col in relevant_columns if col != "date"]

# One grouped pass yields the per-week mean of every column, aligned row by row.
weekly_means = df_relevant.groupby("week")[value_columns].transform("mean")

for col in value_columns:
    # Assign back rather than `df_relevant[col].fillna(..., inplace=True)`:
    # the in-place form is chained assignment, which pandas deprecates and
    # which leaves `df_relevant` unchanged under Copy-on-Write.
    df_relevant[col] = df_relevant[col].fillna(weekly_means[col])

In [11]:
# Binary flag: 1 when some precipitation was recorded (`rr1` > 0), else 0.
df_relevant["precipitation"] = (df_relevant["rr1"] > 0).astype(int)

# Binary flag: 1 when cloud cover `n` exceeds 50, else 0.
# The flag previously tested `cl > 50`, but the `cl` values shown in this
# notebook sit in the 30s (35.0, 38.0), so that test could not fire and the
# column was constant 0. `n` is the cloud column on a 0-100 scale in the
# preview (10.0 ... 90.0).
# NOTE(review): assumes `n` is total cloud cover in percent and `cl` is a
# cloud-type code — confirm against the dataset's data dictionary.
df_relevant["cloudy_day"] = (df_relevant["n"] > 50).astype(int)

In [12]:
# Replace negative precipitation readings with 0; non-negative and missing
# values are left as they are.
# NOTE(review): this runs after the mean imputation, so any negative readings
# already contributed to the imputed means — confirm that is intended.
precip_cols = ["rr1", "rr3", "rr6"]
precip = df_relevant[precip_cols]
df_relevant[precip_cols] = precip.mask(precip < 0, 0)

In [13]:
# Keep only the first occurrence of each fully identical row.
# The original index labels are retained (no reset).
is_repeat = df_relevant.duplicated()
df_relevant = df_relevant.loc[~is_repeat]

In [14]:
# Persist the cleaned frame to disk; the row index is not written.
# The helper columns `day` and `week` are still part of the frame at this
# point and are exported with it.
df_relevant.to_csv(path_or_buf="data/external_data_cleaned.csv", index=False)