In [1]:
import pandas as pd

In [2]:
# I will use a dataframe of wind turbine output. It is simple and contains 2 columns:
# dt = timestamps at 15-minute intervals and MW = megawatts recorded for that interval.
# The idea is to explore it and combine it with weather data.

In [3]:
# Load the turbine readings from CSV and display the resulting frame.
data = pd.read_csv('turbines.csv')
data

Unnamed: 0,dt,MW
0,2011-01-01 00:00:00,3416.00
1,2011-01-01 00:15:00,4755.00
2,2011-01-01 00:30:00,4939.00
3,2011-01-01 00:45:00,4939.00
4,2011-01-01 01:00:00,4998.00
...,...,...
385561,2021-12-30 06:45:00,8140.31
385562,2021-12-30 07:00:00,8162.33
385563,2021-12-30 07:15:00,8405.11
385564,2021-12-30 07:30:00,8503.48


In [4]:
# List the column labels of the loaded frame.
data.columns

Index(['dt', 'MW'], dtype='object')

In [5]:
# Next, check the amount of MW produced on the first day and see how the data could be separated by day.

In [6]:
# Parse the timestamp strings so the column supports datetime operations.
data['dt'] = pd.to_datetime(data['dt'])

# Filter rows where date is '2011-01-01'.
# Truncating each timestamp to midnight keeps the comparison vectorized on
# datetime64 values instead of building a Python `date` object for every row.
first_day = data[data['dt'].dt.normalize() == pd.Timestamp('2011-01-01')]

In [7]:
# Display the rows selected for 2011-01-01.
first_day

Unnamed: 0,dt,MW
0,2011-01-01 00:00:00,3416.0
1,2011-01-01 00:15:00,4755.0
2,2011-01-01 00:30:00,4939.0
3,2011-01-01 00:45:00,4939.0
4,2011-01-01 01:00:00,4998.0
...,...,...
91,2011-01-01 22:45:00,3614.0
92,2011-01-01 23:00:00,3639.0
93,2011-01-01 23:15:00,3484.0
94,2011-01-01 23:30:00,3576.0


In [8]:
# We have 96 rows, each 15 minutes apart: 96 rows * 15 minutes, divided by 60 (minutes in an hour), gives 24 hours.
96*15/60

24.0

In [9]:
# Total the MW readings for this date; this is the reference value for collapsing
# every day in the dataframe into a single row holding that day's summed MW.
mw_on_first_day = first_day['MW']
sum_of_MW = mw_on_first_day.sum()
print("Sum of 'MW' on 2011-01-01:", sum_of_MW)

Sum of 'MW' on 2011-01-01: 518229.0


In [10]:
# Aggregate the 15-minute readings into one row per calendar day.
# The steps are chained (no inplace mutation) and the single 'MW' column is
# selected before summing, so the result never has MultiIndex columns that
# need flattening by hand.
# min_count=1 leaves a day with no readings as NaN instead of a misleading 0.0,
# so an isna() check on the result can actually reveal gaps in the data.
# NOTE(review): the loaded data appears to end part-way through 2021-12-30, so
# the last daily sum may cover an incomplete day; confirm before comparing days.
data = (
    data
    .set_index('dt')          # timestamps become the index used for resampling
    .resample('D')['MW']      # group the readings by calendar day
    .sum(min_count=1)
    .to_frame('Sum Value')    # descriptive column name for the daily total
)
data

Unnamed: 0_level_0,Sum Value
dt,Unnamed: 1_level_1
2011-01-01,518229.00
2011-01-02,83077.00
2011-01-03,73095.00
2011-01-04,376606.00
2011-01-05,216599.00
...,...
2021-12-26,294921.35
2021-12-27,603998.08
2021-12-28,659450.45
2021-12-29,241640.15


In [11]:
# Confirm the column label after the daily aggregation.
data.columns

Index(['Sum Value'], dtype='object')

In [12]:
# Move the date index back into a regular column so it can be renamed afterwards.
data = data.reset_index()
data.columns

Index(['dt', 'Sum Value'], dtype='object')

In [13]:
# Give the columns their final names.
new_column_names = {'dt': 'date', 'Sum Value': 'windturbines_mw'}
data = data.rename(columns=new_column_names)

In [14]:
# Count the missing values in each column.
nan_values = data.isnull().sum()
nan_values

date               0
windturbines_mw    0
dtype: int64

In [15]:
# Check the dtype of each column.
data.dtypes

date               datetime64[ns]
windturbines_mw           float64
dtype: object

In [16]:
# Store the dates as strings (object dtype) rather than datetime64.
data = data.assign(date=data['date'].astype(str))

In [17]:
# Verify the dtypes after converting the date column.
data.dtypes

date                object
windturbines_mw    float64
dtype: object

In [18]:
# Display the daily frame with its final column names.
data

Unnamed: 0,date,windturbines_mw
0,2011-01-01,518229.00
1,2011-01-02,83077.00
2,2011-01-03,73095.00
3,2011-01-04,376606.00
4,2011-01-05,216599.00
...,...,...
4012,2021-12-26,294921.35
4013,2021-12-27,603998.08
4014,2021-12-28,659450.45
4015,2021-12-29,241640.15


In [19]:
#data.to_csv('turbines_days_cleaned.csv', index=False) # saved after converting the file to a daily format

In [20]:
# Now let's load the file obtained from scraping so the two datasets can be connected.
df = pd.read_csv('hamburg.csv')
df

Unnamed: 0,date,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
0,2011-01-01,2.7,1.0,4.0,0.0,110.0,277.0,22.7,51.8,1013.4,252
1,2011-01-02,0.9,-0.8,2.8,0.0,80.0,284.0,12.2,26.3,1017.2,162
2,2011-01-03,-0.8,-2.8,0.9,0.0,70.0,257.0,9.0,20.9,1020.1,84
3,2011-01-04,0.8,-1.1,1.5,0.0,60.0,220.0,18.7,40.0,1016.0,0
4,2011-01-05,-2.3,-4.5,-0.6,0.6,50.0,175.0,19.1,37.4,1010.1,252
...,...,...,...,...,...,...,...,...,...,...,...
4012,2021-12-26,-6.0,-9.8,-1.8,0.1,0.0,104.0,10.8,31.3,1013.2,395
4013,2021-12-27,-2.6,-5.9,0.9,0.0,0.0,119.0,17.3,32.0,1004.7,0
4014,2021-12-28,1.8,0.8,2.7,2.2,0.0,124.0,15.8,32.8,995.9,0
4015,2021-12-29,3.1,1.4,6.2,0.6,0.0,180.0,5.8,19.1,1003.3,0


In [21]:
# Count the missing values in each column of the weather frame.
nan_values = df.isnull().sum()
nan_values

date    0
tavg    0
tmin    0
tmax    0
prcp    0
snow    0
wdir    0
wspd    0
wpgt    5
pres    0
tsun    0
dtype: int64

In [22]:
# Check the dtype of each weather column.
df.dtypes

date     object
tavg    float64
tmin    float64
tmax    float64
prcp    float64
snow    float64
wdir    float64
wspd    float64
wpgt    float64
pres    float64
tsun      int64
dtype: object

In [23]:
# Data sources
# Wind turbine output (aggregated per day in this notebook):
# https://www.kaggle.com/datasets/l3llff/wind-power/data
# Wikipedia, wind power in Germany by state:
# https://en.wikipedia.org/wiki/Wind_power_in_Germany#By_State
# Weather data (Meteostat, Hamburg):
# https://meteostat.net/de/place/de/hamburg?s=10147&t=2011-01-01/2024-12-30