In [95]:
# Let's import few necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [96]:
# Time to load the dataset
df = pd.read_csv("weatherHistory.csv")
df.sample(5)

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
68856,2013-10-16 01:00:00.000 +0200,Overcast,rain,9.95,9.383333,0.93,6.1502,151.0,9.7083,0.0,1009.98,Mostly cloudy throughout the day and breezy st...
53774,2012-08-26 14:00:00.000 +0200,Clear,rain,32.394444,31.616667,0.33,20.2055,296.0,10.3523,0.0,1007.38,Breezy starting in the afternoon continuing un...
15802,2007-11-26 11:00:00.000 +0100,Mostly Cloudy,rain,3.866667,-0.566667,0.89,21.7994,242.0,11.27,0.0,1017.38,Mostly cloudy until night.
46268,2011-02-17 20:00:00.000 +0100,Partly Cloudy,rain,2.288889,-1.005556,0.92,11.9945,139.0,6.9069,0.0,1010.96,Mostly cloudy throughout the day.
94703,2016-11-26 03:00:00.000 +0100,Partly Cloudy,rain,1.25,-1.677778,0.95,9.5151,174.0,7.5187,0.0,1019.2,Mostly cloudy throughout the day.


In [97]:
# let's check the dataset shape
df.shape

(96453, 12)

In [98]:
# we want to get the month from all the date as depending upon the month we want to predict weather condition
df['Formatted Date'].unique()

array(['2006-04-01 00:00:00.000 +0200', '2006-04-01 01:00:00.000 +0200',
       '2006-04-01 02:00:00.000 +0200', ...,
       '2016-09-09 21:00:00.000 +0200', '2016-09-09 22:00:00.000 +0200',
       '2016-09-09 23:00:00.000 +0200'], dtype=object)

We need to get the months from the Formatted Date.  
So that later we can predict the weather summary depending upon the month of the year 

In [99]:
# we will select only the month
df[['year','month','day']] = df['Formatted Date'].str.split('-', expand=True)

In [100]:
df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,year,month,day
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.,2006,4,01 00:00:00.000 +0200
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.,2006,4,01 01:00:00.000 +0200
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.,2006,4,01 02:00:00.000 +0200
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.,2006,4,01 03:00:00.000 +0200
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.,2006,4,01 04:00:00.000 +0200


In [101]:
# let's first drop few unnecessary columns
df.drop(columns=['Formatted Date', 'year', 'day', 'Daily Summary'], axis=1, inplace=True)

In [103]:
# covert the month type to integer so that we need not to do label encoding on this any more
df['month'] = df['month'].astype('int')

In [106]:
# we will only select few dominant weather conditions in terms of their frequency throughout the year
df['Summary'].value_counts(normalize=True)

Summary
Partly Cloudy                          0.329000
Mostly Cloudy                          0.291271
Overcast                               0.172073
Clear                                  0.112905
Foggy                                  0.074109
Breezy and Overcast                    0.005474
Breezy and Mostly Cloudy               0.005350
Breezy and Partly Cloudy               0.004002
Dry and Partly Cloudy                  0.000892
Windy and Partly Cloudy                0.000695
Light Rain                             0.000653
Breezy                                 0.000560
Windy and Overcast                     0.000467
Humid and Mostly Cloudy                0.000415
Drizzle                                0.000404
Breezy and Foggy                       0.000363
Windy and Mostly Cloudy                0.000363
Dry                                    0.000353
Humid and Partly Cloudy                0.000176
Dry and Mostly Cloudy                  0.000145
Rain                            

In [108]:
# these weather conditions are only important 
df = df[(df["Summary"] == "Partly Cloudy") 
        | (df["Summary"] == "Mostly Cloudy") 
        | (df["Summary"] == "Foggy") 
        | (df["Summary"] == "Overcast") 
        | (df["Summary"] == "Clear")]
df['Summary'].value_counts()

Summary
Partly Cloudy    31733
Mostly Cloudy    28094
Overcast         16597
Clear            10890
Foggy             7148
Name: count, dtype: int64

In [110]:
df.sample(5)

Unnamed: 0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),month
18593,Partly Cloudy,rain,31.988889,30.65,0.29,11.3022,353.0,10.3523,0.0,1016.97,8
938,Partly Cloudy,rain,19.505556,19.505556,0.92,2.898,211.0,14.9569,0.0,1015.42,8
45148,Partly Cloudy,rain,18.105556,18.105556,0.61,10.9158,329.0,15.8263,0.0,1011.86,8
71243,Mostly Cloudy,rain,23.972222,23.972222,0.5,7.3738,123.0,15.3111,0.0,1015.52,8
71124,Mostly Cloudy,rain,25.066667,25.066667,0.5,9.821,191.0,16.1,0.0,1013.97,8


In [111]:
# seems like Loud Cover feature is not that much important we can drop it
df['Loud Cover'].describe()

count    94462.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: Loud Cover, dtype: float64

In [112]:
# check precip type
df['Precip Type'].value_counts()

Precip Type
rain    83365
snow    10580
Name: count, dtype: int64

In [113]:
# since we are going to predict the weather type we will not use the rain/snow feature, we will drop this also
df.drop(columns=['Precip Type','Loud Cover'], axis=1, inplace=True)

In [114]:
df.sample(5)

Unnamed: 0,Summary,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),month
16965,Clear,14.772222,14.772222,0.73,4.83,170.0,9.982,1019.12,9
89151,Mostly Cloudy,3.883333,1.25,0.83,10.4006,162.0,8.7906,1018.81,12
82705,Mostly Cloudy,17.15,17.15,0.87,3.22,70.0,16.1,1019.12,7
94106,Partly Cloudy,15.888889,15.888889,0.81,4.9749,211.0,16.1,1011.5,5
83534,Mostly Cloudy,20.233333,20.233333,0.62,19.8513,311.0,10.3523,1010.47,6


In [115]:
# now our dataframe is ready, its time to do some cleaning
df.isnull().sum()

Summary                     0
Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Pressure (millibars)        0
month                       0
dtype: int64

In [117]:
df.duplicated().sum()
df.drop_duplicates(inplace=True)

(94402, 9)