In [12]:
import numpy as np
import pandas as pd

# Load the raw weather observations from CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks environment-specific
# (presumably a hosted-notebook upload directory) -- adjust it when running elsewhere.
df = pd.read_csv('/content/weather_prediction_dataset.csv')

In [13]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [3]:
# Count missing values per column to see which columns need cleaning.
df.isna().sum()

Unnamed: 0,0
Formatted Date,0
Summary,0
Precip Type,517
Temperature (C),0
Apparent Temperature (C),0
Humidity,0
Wind Speed (km/h),0
Wind Bearing (degrees),0
Visibility (km),0
Loud Cover,0


In [4]:
# Frequency of each precipitation category; rows where 'Precip Type' is
# missing are not counted here.
df['Precip Type'].value_counts()

Unnamed: 0_level_0,count
Precip Type,Unnamed: 1_level_1
rain,85200
snow,10712


In [14]:
# Remove every row that has a missing value in any column. The isna() summary
# shows NaNs in 'Precip Type' (517 rows), so those are the rows dropped here.
# Reassigning the result (rather than inplace=True) avoids mutating the frame
# in place and keeps the step chainable.
# NOTE(review): a missing 'Precip Type' may simply mean "no precipitation";
# confirm that discarding these observations is intended.
df = df.dropna()

In [7]:
# Re-check the per-column missing-value counts after dropping incomplete rows;
# every column should now report 0.
df.isna().sum()

Unnamed: 0,0
Formatted Date,0
Summary,0
Precip Type,0
Temperature (C),0
Apparent Temperature (C),0
Humidity,0
Wind Speed (km/h),0
Wind Bearing (degrees),0
Visibility (km),0
Loud Cover,0


# Feature Engineering

In [17]:
# Step 1: Convert 'Formatted Date' to datetime and extract components
# The raw strings carry an explicit UTC offset (e.g. "+0200"); utc=True converts
# every value to UTC so the column ends up with a timezone-aware datetime dtype.
# NOTE(review): anything derived from this column afterwards (Hour, Day,
# DayOfWeek, ...) is therefore expressed in UTC, not local wall-clock time --
# e.g. "2006-04-01 00:00:00.000 +0200" becomes 2006-03-31 22:00:00+00:00.
# Confirm that is the intended reference for hour-of-day features.

df['Formatted Date'] = pd.to_datetime(df['Formatted Date'], utc=True)

In [18]:
# Check column dtypes and non-null counts, in particular that
# 'Formatted Date' is now a datetime column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95912 entries, 0 to 96428
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   Formatted Date            95912 non-null  datetime64[ns, UTC]
 1   Summary                   95912 non-null  object             
 2   Precip Type               95912 non-null  object             
 3   Temperature (C)           95912 non-null  float64            
 4   Apparent Temperature (C)  95912 non-null  float64            
 5   Humidity                  95912 non-null  float64            
 6   Wind Speed (km/h)         95912 non-null  float64            
 7   Wind Bearing (degrees)    95912 non-null  float64            
 8   Visibility (km)           95912 non-null  float64            
 9   Loud Cover                95912 non-null  float64            
 10  Pressure (millibars)      95912 non-null  float64            
 11  Daily Summary       

In [19]:
# Derive calendar features from the parsed 'Formatted Date' column.
# All components are read through one shared .dt accessor and appended in a
# single assign() call, in this order: Year, Month, Day, Hour, DayOfWeek,
# Is_Weekend.
stamp = df['Formatted Date'].dt
df = df.assign(
    Year=stamp.year,
    Month=stamp.month,
    Day=stamp.day,
    Hour=stamp.hour,
    DayOfWeek=stamp.dayofweek,  # Monday=0, Sunday=6
    # 1 when the day is Saturday (5) or Sunday (6), otherwise 0
    Is_Weekend=stamp.dayofweek.isin([5, 6]).astype(int),
)

In [20]:
# Preview the frame to verify the newly added calendar columns.
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,Year,Month,Day,Hour,DayOfWeek,Is_Weekend
0,2006-03-31 22:00:00+00:00,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,2006,3,31,22,4,0
1,2006-03-31 23:00:00+00:00,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,2006,3,31,23,4,0
2,2006-04-01 00:00:00+00:00,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,2006,4,1,0,5,1
3,2006-04-01 01:00:00+00:00,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,2006,4,1,1,5,1
4,2006-04-01 02:00:00+00:00,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,2006,4,1,2,5,1


In [21]:
# Optional: Extract part of day (morning, afternoon, evening, night)
def get_part_of_day(hour):
    """Return the part-of-day label for an hour value.

    Bands are half-open intervals on the hour:
        [5, 12)  -> 'morning'
        [12, 17) -> 'afternoon'
        [17, 21) -> 'evening'
    Any value that falls in none of these bands yields 'night'.

    Args:
        hour: Numeric hour to classify.

    Returns:
        One of 'morning', 'afternoon', 'evening' or 'night'.
    """
    daytime_bands = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, stop, label in daytime_bands:
        if start <= hour < stop:
            return label
    # Nothing matched: treat the hour as night.
    return 'night'

# Label each row with its part of day by mapping get_part_of_day over 'Hour'.
df['PartOfDay'] = df['Hour'].map(get_part_of_day)

In [23]:
# Step 2: Encode categorical features
# 'Summary' is label-encoded: factorize() gives each distinct summary an
# integer code, numbered in order of first appearance. One-hot encoding is the
# alternative -- choose one based on your model.
df['Summary_Label'] = df['Summary'].factorize()[0]


In [24]:
# Precip Type: one-hot encoding. The original column is replaced by one
# indicator column per category, each named with the 'Precip' prefix
# (e.g. Precip_rain, Precip_snow).
df = pd.get_dummies(
    data=df,
    columns=['Precip Type'],
    prefix='Precip',
)

### Step 3: Drop unused or redundant columns if necessary
(for example, the original 'Formatted Date' column and free-text columns)

In [25]:

# Remove the raw timestamp and the free-text daily summary from the feature
# table. Drop or keep as per your modeling need.
df = df.drop(columns=['Formatted Date', 'Daily Summary'])

In [26]:
# Final preview of the engineered feature table before it is written out.
df.head()

Unnamed: 0,Summary,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Year,Month,Day,Hour,DayOfWeek,Is_Weekend,PartOfDay,Summary_Label,Precip_rain,Precip_snow
0,Partly Cloudy,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,2006,3,31,22,4,0,night,0,True,False
1,Partly Cloudy,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,2006,3,31,23,4,0,night,0,True,False
2,Mostly Cloudy,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,2006,4,1,0,5,1,night,1,True,False
3,Partly Cloudy,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,2006,4,1,1,5,1,night,0,True,False
4,Mostly Cloudy,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,2006,4,1,2,5,1,night,1,True,False


In [27]:
# Persist the engineered feature table as CSV in the working directory;
# index=False leaves the row index out of the file.
df.to_csv("dataset_feature_engineering.csv", index=False)