In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
dummy_file_path = '../data/test_data.csv'
dummy_data = pd.read_csv(dummy_file_path) 

print(dummy_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  float64
 8   Visibility (km)           96453 non-null  float64
 9   Loud Cover                96453 non-null  float64
 10  Pressure (millibars)      96453 non-null  float64
 11  Daily Summary             96453 non-null  object 
dtypes: float64(8), object(4)
memory usage: 8.8+ MB
None


In [7]:
print(dummy_data.head(20).to_markdown())

print('\nAll the columns:\n')
print(dummy_data.columns)

|    | Formatted Date                | Summary       | Precip Type   |   Temperature (C) |   Apparent Temperature (C) |   Humidity |   Wind Speed (km/h) |   Wind Bearing (degrees) |   Visibility (km) |   Loud Cover |   Pressure (millibars) | Daily Summary                     |
|---:|:------------------------------|:--------------|:--------------|------------------:|---------------------------:|-----------:|--------------------:|-------------------------:|------------------:|-------------:|-----------------------:|:----------------------------------|
|  0 | 2006-04-01 00:00:00.000 +0200 | Partly Cloudy | rain          |           9.47222 |                    7.38889 |       0.89 |             14.1197 |                      251 |           15.8263 |            0 |                1015.13 | Partly cloudy throughout the day. |
|  1 | 2006-04-01 01:00:00.000 +0200 | Partly Cloudy | rain          |           9.35556 |                    7.22778 |       0.86 |             14.2646 |            

It looks like the y value of the dataset is 'Temperature (C)' column. All other columns can be used as features.

In [8]:
dummy_data['Formatted Date'] = pd.to_datetime(dummy_data['Formatted Date'], format='%Y-%m-%d %H:%M:%S.%f %z', errors='raise')
# print(dummy_data['Formatted Date'].unique())

print(dummy_data.head().to_markdown())

print(dummy_data.info())

|    | Formatted Date            | Summary       | Precip Type   |   Temperature (C) |   Apparent Temperature (C) |   Humidity |   Wind Speed (km/h) |   Wind Bearing (degrees) |   Visibility (km) |   Loud Cover |   Pressure (millibars) | Daily Summary                     |
|---:|:--------------------------|:--------------|:--------------|------------------:|---------------------------:|-----------:|--------------------:|-------------------------:|------------------:|-------------:|-----------------------:|:----------------------------------|
|  0 | 2006-04-01 00:00:00+02:00 | Partly Cloudy | rain          |           9.47222 |                    7.38889 |       0.89 |             14.1197 |                      251 |           15.8263 |            0 |                1015.13 | Partly cloudy throughout the day. |
|  1 | 2006-04-01 01:00:00+02:00 | Partly Cloudy | rain          |           9.35556 |                    7.22778 |       0.86 |             14.2646 |                      259 | 

  dummy_data['Formatted Date'] = pd.to_datetime(dummy_data['Formatted Date'], format='%Y-%m-%d %H:%M:%S.%f %z', errors='raise')


In [9]:
print(type(dummy_data['Formatted Date'].iloc[90000]))
print((dummy_data['Formatted Date'].iloc[1465]))
print((dummy_data['Formatted Date'].iloc[1463]))

dummy_data['Formatted Date'] = pd.to_datetime(dummy_data['Formatted Date'], format='%Y-%m-%d %H:%M:%S.%f%z', errors='raise', utc=True)

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2006-12-01 01:00:00+01:00
2006-08-09 23:00:00+02:00


Create three new columns for 'day', 'month' and 'year' as int

In [10]:
# Create new columns
dummy_data['day'] = dummy_data['Formatted Date'].dt.day
dummy_data['month'] = dummy_data['Formatted Date'].dt.month
dummy_data['year'] = dummy_data['Formatted Date'].dt.year
dummy_data['hour'] = dummy_data['Formatted Date'].dt.hour

print(dummy_data.head().to_markdown())

|    | Formatted Date            | Summary       | Precip Type   |   Temperature (C) |   Apparent Temperature (C) |   Humidity |   Wind Speed (km/h) |   Wind Bearing (degrees) |   Visibility (km) |   Loud Cover |   Pressure (millibars) | Daily Summary                     |   day |   month |   year |   hour |
|---:|:--------------------------|:--------------|:--------------|------------------:|---------------------------:|-----------:|--------------------:|-------------------------:|------------------:|-------------:|-----------------------:|:----------------------------------|------:|--------:|-------:|-------:|
|  0 | 2006-03-31 22:00:00+00:00 | Partly Cloudy | rain          |           9.47222 |                    7.38889 |       0.89 |             14.1197 |                      251 |           15.8263 |            0 |                1015.13 | Partly cloudy throughout the day. |    31 |       3 |   2006 |     22 |
|  1 | 2006-03-31 23:00:00+00:00 | Partly Cloudy | rain          |    

Copy clean data to csv

In [11]:
print(dummy_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   Formatted Date            96453 non-null  datetime64[ns, UTC]
 1   Summary                   96453 non-null  object             
 2   Precip Type               95936 non-null  object             
 3   Temperature (C)           96453 non-null  float64            
 4   Apparent Temperature (C)  96453 non-null  float64            
 5   Humidity                  96453 non-null  float64            
 6   Wind Speed (km/h)         96453 non-null  float64            
 7   Wind Bearing (degrees)    96453 non-null  float64            
 8   Visibility (km)           96453 non-null  float64            
 9   Loud Cover                96453 non-null  float64            
 10  Pressure (millibars)      96453 non-null  float64            
 11  Daily Summary  

In [12]:
dummy_data.to_csv('../data/clean_dummy_data.csv')