In [59]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.dpi'] = 100
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.set(style="whitegrid")
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import mage_ai
import lightgbm as lgb


In [60]:
# Load the labelled training set and show it.
train_csv_path = "train_weather.csv"
train_df = pd.read_csv(train_csv_path)
train_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Brisbane,13.7,23.6,0.0,5.0,9.6,SSE,24.0,SSW,ESE,...,65.0,53.0,1028.4,1026.7,1.0,1.0,18.3,22.3,0,0
1,Sydney,15.4,21.2,61.2,5.8,2.7,S,43.0,WSW,SSW,...,91.0,65.0,1018.8,1016.5,8.0,7.0,15.6,20.7,1,1
2,Richmond,16.8,30.9,1.0,,,NE,35.0,NNW,NE,...,80.0,38.0,1020.4,1013.2,1.0,,19.6,30.0,0,0
3,NorfolkIsland,13.9,17.1,0.2,3.8,8.9,SE,37.0,SE,SSE,...,56.0,63.0,1027.7,1026.1,3.0,1.0,15.3,15.8,0,0
4,Mildura,16.0,36.1,0.0,6.8,12.9,ENE,33.0,ENE,E,...,58.0,32.0,1018.5,1016.0,0.0,3.0,26.0,34.9,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121868,GoldCoast,17.6,25.1,0.0,,,SSE,56.0,SE,E,...,58.0,58.0,1012.8,1010.2,,,23.5,24.2,0,0
121869,Adelaide,15.1,25.2,0.0,10.0,11.3,SW,48.0,S,WSW,...,55.0,41.0,1018.8,1018.3,,,19.0,24.6,0,0
121870,MountGinini,10.5,18.4,1.2,,,SSE,50.0,E,E,...,97.0,93.0,,,,,12.5,15.9,1,1
121871,Adelaide,13.6,19.8,2.6,,7.5,WSW,54.0,WSW,WSW,...,76.0,46.0,1008.5,1007.4,,,14.4,18.9,1,1


In [None]:
# Register the training frame with mage_ai under the name 'train_weather' and
# launch the tool.
# NOTE(review): presumably this opens the interactive mage_ai app in which the
# cleaning pipeline is built — confirm against the mage_ai documentation.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# was not run in the recorded session.
mage_ai.connect_data(train_df, name='train_weather')
mage_ai.launch()

In [61]:
# Cleaning the data
# Runs the mage_ai pipeline with uuid 1 on the training frame, then displays it.
# NOTE(review): the return value of mage_ai.clean is discarded. The frame shown
# below has lower-cased Location values, so in the recorded session train_df
# appears to have been changed in place — confirm that mage_ai.clean mutates
# its argument; if it returns a new frame instead, the result must be assigned.
# NOTE(review): pipeline 1 is defined outside this notebook, so a fresh kernel
# can only reproduce this step if that pipeline still exists — verify.
mage_ai.clean(train_df, pipeline_uuid=1)
train_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,brisbane,13.7,23.6,0.0,5.0,9.6,SSE,24.0,SSW,ESE,...,65.0,53.0,1028.4,1026.7,1.0,1.0,18.3,22.3,0,0
1,sydney,15.4,21.2,61.2,5.8,2.7,S,43.0,WSW,SSW,...,91.0,65.0,1018.8,1016.5,8.0,7.0,15.6,20.7,1,1
2,richmond,16.8,30.9,1.0,,,NE,35.0,NNW,NE,...,80.0,38.0,1020.4,1013.2,1.0,,19.6,30.0,0,0
3,norfolkisland,13.9,17.1,0.2,3.8,8.9,SE,37.0,SE,SSE,...,56.0,63.0,1027.7,1026.1,3.0,1.0,15.3,15.8,0,0
4,mildura,16.0,36.1,0.0,6.8,12.9,ENE,33.0,ENE,E,...,58.0,32.0,1018.5,1016.0,0.0,3.0,26.0,34.9,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121868,goldcoast,17.6,25.1,0.0,,,SSE,56.0,SE,E,...,58.0,58.0,1012.8,1010.2,,,23.5,24.2,0,0
121869,adelaide,15.1,25.2,0.0,10.0,11.3,SW,48.0,S,WSW,...,55.0,41.0,1018.8,1018.3,,,19.0,24.6,0,0
121870,mountginini,10.5,18.4,1.2,,,SSE,50.0,E,E,...,97.0,93.0,,,,,12.5,15.9,1,1
121871,adelaide,13.6,19.8,2.6,,7.5,WSW,54.0,WSW,WSW,...,76.0,46.0,1008.5,1007.4,,,14.4,18.9,1,1


In [62]:
# Class balance of the target column (the counts below are skewed towards 0).
train_df["RainTomorrow"].value_counts()

0    95015
1    26858
Name: RainTomorrow, dtype: int64

In [63]:
# Names of the columns held as object dtype, i.e. the categorical candidates.
train_df.select_dtypes(include="object").columns

Index(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], dtype='object')

In [64]:
# Remove missing values
# Drops, in place, every row that has a NaN in any column, then prints the
# resulting schema.
# NOTE(review): per the recorded outputs this keeps 49,101 of the 121,873
# labelled rows, so well over half of the training data is discarded.
# NOTE(review): the test frame is handled differently later in the notebook
# (its gaps are filled rather than dropped), so the model never sees imputed
# rows during training — consider imputing here in the same way instead of
# dropping.
train_df.dropna(inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49101 entries, 0 to 121867
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       49101 non-null  object 
 1   MinTemp        49101 non-null  float64
 2   MaxTemp        49101 non-null  float64
 3   Rainfall       49101 non-null  float64
 4   Evaporation    49101 non-null  float64
 5   Sunshine       49101 non-null  float64
 6   WindGustDir    49101 non-null  object 
 7   WindGustSpeed  49101 non-null  float64
 8   WindDir9am     49101 non-null  object 
 9   WindDir3pm     49101 non-null  object 
 10  WindSpeed9am   49101 non-null  float64
 11  WindSpeed3pm   49101 non-null  float64
 12  Humidity9am    49101 non-null  float64
 13  Humidity3pm    49101 non-null  float64
 14  Pressure9am    49101 non-null  float64
 15  Pressure3pm    49101 non-null  float64
 16  Cloud9am       49101 non-null  float64
 17  Cloud3pm       49101 non-null  float64
 18  Temp9

In [65]:
# Label encoding
# Map the wind-direction columns to integer codes using a fixed compass
# vocabulary rather than fitting a LabelEncoder on whatever values this frame
# happens to contain. A fitted encoder's codes depend on the data it saw, and a
# single encoder pushed through .apply() is refit for every column, so it only
# remembers the last one and cannot be reused on another frame.
# The alphabetical order below reproduces the codes LabelEncoder assigns when
# all 16 directions are present (E=0 ... WSW=15).
categorical_features = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
WIND_DIRECTIONS = ['E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NW',
                   'S', 'SE', 'SSE', 'SSW', 'SW', 'W', 'WNW', 'WSW']
WIND_DIRECTION_CODES = {direction: code for code, direction in enumerate(WIND_DIRECTIONS)}
# Code given to missing or unrecognised directions (one past the last real code).
UNKNOWN_WIND_DIRECTION = len(WIND_DIRECTIONS)

for feature in categorical_features:
    # A numeric column has already been encoded (e.g. the cell was re-run);
    # mapping it again would turn every value into the "unknown" code.
    if pd.api.types.is_numeric_dtype(train_df[feature]):
        continue
    train_df[feature] = (
        train_df[feature]
        .map(WIND_DIRECTION_CODES)
        .fillna(UNKNOWN_WIND_DIRECTION)
        .astype('int32')
    )

train_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,brisbane,13.7,23.6,0.0,5.0,9.6,10,24.0,11,2,...,65.0,53.0,1028.4,1026.7,1.0,1.0,18.3,22.3,0,0
1,sydney,15.4,21.2,61.2,5.8,2.7,8,43.0,15,11,...,91.0,65.0,1018.8,1016.5,8.0,7.0,15.6,20.7,1,1
3,norfolkisland,13.9,17.1,0.2,3.8,8.9,9,37.0,9,10,...,56.0,63.0,1027.7,1026.1,3.0,1.0,15.3,15.8,0,0
4,mildura,16.0,36.1,0.0,6.8,12.9,1,33.0,1,0,...,58.0,32.0,1018.5,1016.0,0.0,3.0,26.0,34.9,0,0
7,woomera,18.4,37.9,0.0,13.6,13.0,9,39.0,9,4,...,38.0,12.0,1015.1,1013.4,1.0,1.0,23.8,35.8,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121862,norfolkisland,19.6,23.8,0.6,4.0,11.4,9,43.0,9,9,...,64.0,59.0,1014.8,1013.0,2.0,2.0,22.1,22.9,0,0
121863,cairns,15.8,27.4,0.0,6.4,10.9,4,30.0,9,4,...,68.0,58.0,1017.2,1013.6,1.0,0.0,22.3,26.1,0,0
121864,watsonia,18.0,26.1,0.2,4.2,0.0,2,20.0,13,15,...,91.0,71.0,1016.1,1012.4,8.0,8.0,19.8,25.4,0,1
121866,moree,22.1,32.6,0.2,9.4,7.5,1,61.0,0,0,...,41.0,26.0,1009.4,1006.3,6.0,4.0,24.4,31.2,0,0


In [66]:
# Split data
# Target is RainTomorrow; features are every other column except Location.
# A quarter of the rows is held out for evaluation, with a fixed seed.
y = train_df["RainTomorrow"].to_numpy(copy=True)
X = train_df.drop(columns=["Location", "RainTomorrow"])
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [67]:
# LightGBM Model
# Fit a LightGBM classifier with default hyper-parameters on the training split
# and predict the held-out split.
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

# Predict on the DataFrame itself, not on `.values`: the model was fitted on a
# DataFrame, and passing a bare ndarray drops the column names, so features
# could only be matched by position.
y_pred = clf.predict(X_eval)

In [68]:
# Per-class precision / recall / F1 on the held-out split.
eval_report = classification_report(y_eval, y_pred)
print(eval_report)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      9627
           1       0.72      0.55      0.62      2649

    accuracy                           0.86     12276
   macro avg       0.80      0.75      0.77     12276
weighted avg       0.85      0.86      0.85     12276



In [69]:
# Load the test set to predict on (it has no RainTomorrow column) and show it.
test_csv_path = "test_weather.csv"
test_df = pd.read_csv(test_csv_path)
test_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,PearceRAAF,11.0,22.1,0.0,,8.2,ESE,57.0,E,E,...,22.0,52.0,34.0,1028.5,1024.6,3.0,,14.7,20.7,0
1,Albury,5.6,20.7,0.0,,,ESE,44.0,SE,NNE,...,24.0,65.0,42.0,1023.0,1018.1,,,13.9,20.3,0
2,Cobar,9.5,18.4,0.0,1.6,,SW,26.0,WNW,SSW,...,15.0,67.0,43.0,1024.7,1021.5,7.0,7.0,11.5,17.9,0
3,Albany,18.4,22.0,0.0,3.8,0.2,,,E,ESE,...,19.0,80.0,79.0,1019.5,1017.0,8.0,8.0,20.7,21.4,0
4,MountGambier,10.4,20.7,0.4,3.8,9.0,SSW,43.0,WSW,SW,...,26.0,55.0,38.0,1019.5,1022.4,5.0,4.0,15.7,18.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13536,BadgerysCreek,13.1,26.7,0.2,,,W,52.0,N,WNW,...,28.0,58.0,35.0,1005.4,1003.9,,,19.4,26.0,0
13537,Richmond,16.4,30.1,0.0,5.6,,WSW,43.0,ENE,WSW,...,20.0,79.0,38.0,1003.2,1000.8,,,21.6,29.7,0
13538,Uluru,22.9,32.9,0.0,,,E,52.0,E,ESE,...,20.0,23.0,17.0,1018.4,1015.0,,1.0,25.2,31.4,0
13539,Nhil,12.5,34.2,0.0,,,SSE,31.0,E,ESE,...,13.0,37.0,13.0,1021.8,1018.7,,,21.1,31.3,0


In [50]:
# Print dtypes and non-null counts of the raw test frame to see which columns
# have gaps.
# NOTE(review): this cell's execution count (50) is out of sequence with its
# neighbours, so it was run in a different order than it appears — re-run the
# notebook top to bottom before relying on this output.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13541 entries, 0 to 13540
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       13541 non-null  object 
 1   MinTemp        13409 non-null  float64
 2   MaxTemp        13418 non-null  float64
 3   Rainfall       13256 non-null  float64
 4   Evaporation    7933 non-null   float64
 5   Sunshine       7330 non-null   float64
 6   WindGustDir    12565 non-null  object 
 7   WindGustSpeed  12573 non-null  float64
 8   WindDir9am     12547 non-null  object 
 9   WindDir3pm     13146 non-null  object 
 10  WindSpeed9am   13365 non-null  float64
 11  WindSpeed3pm   13258 non-null  float64
 12  Humidity9am    13304 non-null  float64
 13  Humidity3pm    13140 non-null  float64
 14  Pressure9am    12159 non-null  float64
 15  Pressure3pm    12160 non-null  float64
 16  Cloud9am       8407 non-null   float64
 17  Cloud3pm       8150 non-null   float64
 18  Temp9a

In [70]:
# Label encoding
# Encode the wind-direction columns of the test frame with a fixed compass
# vocabulary. Fitting a fresh LabelEncoder on the test data makes the codes
# depend on which directions happen to occur in the test set, so they agree
# with the training codes only by coincidence.
# The alphabetical order below is the mapping LabelEncoder produces when all 16
# directions are present (E=0 ... WSW=15), and missing directions keep code 16,
# so the result matches the previously recorded output.
categorical_features = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
WIND_DIRECTIONS = ['E', 'ENE', 'ESE', 'N', 'NE', 'NNE', 'NNW', 'NW',
                   'S', 'SE', 'SSE', 'SSW', 'SW', 'W', 'WNW', 'WSW']
WIND_DIRECTION_CODES = {direction: code for code, direction in enumerate(WIND_DIRECTIONS)}
# Code given to missing or unrecognised directions (one past the last real code).
UNKNOWN_WIND_DIRECTION = len(WIND_DIRECTIONS)

for feature in categorical_features:
    # A numeric column has already been encoded (e.g. the cell was re-run);
    # mapping it again would turn every value into the "unknown" code.
    if pd.api.types.is_numeric_dtype(test_df[feature]):
        continue
    test_df[feature] = (
        test_df[feature]
        .map(WIND_DIRECTION_CODES)
        .fillna(UNKNOWN_WIND_DIRECTION)
        .astype('int32')
    )

test_df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,PearceRAAF,11.0,22.1,0.0,,8.2,2,57.0,0,0,...,22.0,52.0,34.0,1028.5,1024.6,3.0,,14.7,20.7,0
1,Albury,5.6,20.7,0.0,,,2,44.0,9,5,...,24.0,65.0,42.0,1023.0,1018.1,,,13.9,20.3,0
2,Cobar,9.5,18.4,0.0,1.6,,12,26.0,14,11,...,15.0,67.0,43.0,1024.7,1021.5,7.0,7.0,11.5,17.9,0
3,Albany,18.4,22.0,0.0,3.8,0.2,16,,0,2,...,19.0,80.0,79.0,1019.5,1017.0,8.0,8.0,20.7,21.4,0
4,MountGambier,10.4,20.7,0.4,3.8,9.0,11,43.0,15,12,...,26.0,55.0,38.0,1019.5,1022.4,5.0,4.0,15.7,18.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13536,BadgerysCreek,13.1,26.7,0.2,,,13,52.0,3,14,...,28.0,58.0,35.0,1005.4,1003.9,,,19.4,26.0,0
13537,Richmond,16.4,30.1,0.0,5.6,,15,43.0,1,15,...,20.0,79.0,38.0,1003.2,1000.8,,,21.6,29.7,0
13538,Uluru,22.9,32.9,0.0,,,0,52.0,0,2,...,20.0,23.0,17.0,1018.4,1015.0,,1.0,25.2,31.4,0
13539,Nhil,12.5,34.2,0.0,,,10,31.0,0,2,...,13.0,37.0,13.0,1021.8,1018.7,,,21.1,31.3,0


In [71]:
# Impute missing values
# Fill each gap with the median of the same column in the training frame.
# A constant 0 is far outside the observed range for several features (pressure
# is around 1000 in the rows shown) and reads as a real measurement for others
# (0 hours of sunshine, 0 cloud cover), which skews the predictions for every
# row that had a gap.
# Medians come from train_df, not test_df, so the fill values do not depend on
# the data being predicted. Entries for columns the test frame does not have
# (e.g. RainTomorrow) are ignored by fillna.
feature_medians = train_df.median(numeric_only=True)
test_df = test_df.fillna(feature_medians)

# info() prints its report itself and returns None, so it is not wrapped in print().
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13541 entries, 0 to 13540
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       13541 non-null  object 
 1   MinTemp        13541 non-null  float64
 2   MaxTemp        13541 non-null  float64
 3   Rainfall       13541 non-null  float64
 4   Evaporation    13541 non-null  float64
 5   Sunshine       13541 non-null  float64
 6   WindGustDir    13541 non-null  int32  
 7   WindGustSpeed  13541 non-null  float64
 8   WindDir9am     13541 non-null  int32  
 9   WindDir3pm     13541 non-null  int32  
 10  WindSpeed9am   13541 non-null  float64
 11  WindSpeed3pm   13541 non-null  float64
 12  Humidity9am    13541 non-null  float64
 13  Humidity3pm    13541 non-null  float64
 14  Pressure9am    13541 non-null  float64
 15  Pressure3pm    13541 non-null  float64
 16  Cloud9am       13541 non-null  float64
 17  Cloud3pm       13541 non-null  float64
 18  Temp9a

In [72]:
# LightGBM Model
x_test = test_df.drop(["Location"], axis=1)
y_pred=clf.predict(x_test.values)

In [75]:
# Build the submission table (test row index + predicted label) and write it
# to disk without the DataFrame's own index.
submission_df = pd.DataFrame({"index": test_df.index}).assign(RainTomorrow=y_pred)
submission_df.to_csv("submission.csv", index=False)