In [23]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split

In [161]:
# Load the daily weather observations from the project-relative Data folder.
daily_csv = './Data/daily_data.csv'
w_data = pd.read_csv(daily_csv)

In [162]:
# Show the loaded frame; pandas elides the middle rows of long frames in the rendered output.
w_data

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,06:04 AM,07:19 PM
1,D0002,C001,22.0,,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,06:05 AM,07:18 PM
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,06:06 AM,07:16 PM
4,D0005,C001,18.0,,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,06:07 AM,07:15 PM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2888,D2889,C112,17.1,Clear and Sunny,13.3,61,1017.0,0.0,57,3,17.1,10.0,1.0,22.2,1,05:49 AM,05:51 PM
2889,D2890,C112,17.4,,13.0,51,1017.0,0.0,49,0,17.4,10.0,1.0,22.2,1,05:49 AM,05:52 PM
2890,D2891,C112,19.2,,11.5,46,1016.0,0.0,34,0,19.2,10.0,1.0,21.3,2,05:48 AM,05:52 PM
2891,D2892,C112,19.2,,14.4,76,1017.0,0.0,45,2,19.2,10.0,1.0,24.9,2,05:47 AM,05:52 PM


In [163]:
# Rows that already carry a condition_text are used for training; rows where
# it is missing are the ones the model has to label.
is_labelled = w_data['condition_text'].notnull()
train_dt = w_data.loc[is_labelled].copy()
test_dt = w_data.loc[~is_labelled].copy()
print(train_dt.shape, test_dt.shape)

(479, 17) (2414, 17)


In [164]:
# Encode every distinct condition label as an integer class id.
# unique() yields labels in order of first appearance, so the ids follow the
# row order of train_dt.
conds = train_dt['condition_text'].unique()

# enumerate() replaces the hand-maintained counter and keeps the loop
# variables out of the notebook's global namespace.
cond_ids = {cond: idx for idx, cond in enumerate(conds)}

cond_ids


{'Light Rain with Thunder': 0,
 'Clear and Sunny': 1,
 'Partly Cloudy': 2,
 'Light Precipitation': 3,
 'Cloudy and Overcast': 4,
 'Mist or Fog': 5,
 'Rain Showers': 6,
 'Moderate to Heavy Rain': 7,
 'Thunderstorms': 8}

In [165]:
# Reverse lookup (class id -> label text) for decoding predictions later on.
cond_strngs = dict(zip(cond_ids.values(), cond_ids.keys()))
cond_strngs

{0: 'Light Rain with Thunder',
 1: 'Clear and Sunny',
 2: 'Partly Cloudy',
 3: 'Light Precipitation',
 4: 'Cloudy and Overcast',
 5: 'Mist or Fog',
 6: 'Rain Showers',
 7: 'Moderate to Heavy Rain',
 8: 'Thunderstorms'}

In [166]:
# Attach the integer class id to every labelled row. The lookup goes through
# cond_ids[...] so an unexpected label raises KeyError instead of passing silently.
train_dt['condition_id'] = train_dt['condition_text'].map(lambda label: cond_ids[label])
train_dt

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset,condition_id
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.50,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM,0
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.00,88,0,17.0,10.0,1.0,11.2,1,06:06 AM,07:16 PM,1
6,D0007,C001,21.0,Partly Cloudy,4.0,310,1015.0,0.00,100,50,21.0,10.0,1.0,15.1,2,06:08 AM,07:11 PM,2
18,D0019,C001,19.0,Clear and Sunny,3.6,64,1017.0,0.00,88,0,19.0,10.0,1.0,8.3,3,06:20 AM,06:51 PM,1
27,D0028,C002,19.0,Partly Cloudy,3.6,83,1010.0,0.00,73,25,19.0,10.0,1.0,8.3,1,06:17 AM,07:20 PM,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2838,D2839,C110,27.0,Partly Cloudy,19.1,90,1014.0,0.01,70,25,29.8,10.0,6.0,45.6,1,05:39 AM,05:43 PM,2
2863,D2864,C111,21.4,Light Precipitation,3.6,178,1012.0,0.00,89,75,21.4,10.0,6.0,6.6,1,06:17 AM,06:25 PM,3
2873,D2874,C112,16.9,Clear and Sunny,16.6,68,1018.0,0.00,44,2,16.9,10.0,1.0,27.0,1,06:02 AM,05:49 PM,1
2886,D2887,C112,19.5,Clear and Sunny,5.4,27,1014.0,0.00,34,0,19.4,10.0,1.0,11.2,2,05:51 AM,05:51 PM,1


In [167]:
# Training table: drop the columns that are not fed to the model and use
# day_id as the row index.
train_dt_fil = (
    train_dt
    .drop(columns=['city_id', 'condition_text', 'sunrise', 'sunset'])
    .set_index('day_id')
)
train_dt_fil

Unnamed: 0_level_0,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,condition_id
day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
D0003,20.0,3.6,10,1011.0,4.50,100,75,20.0,10.0,1.0,12.6,1,0
D0004,17.0,6.1,150,1018.0,0.00,88,0,17.0,10.0,1.0,11.2,1,1
D0007,21.0,4.0,310,1015.0,0.00,100,50,21.0,10.0,1.0,15.1,2,2
D0019,19.0,3.6,64,1017.0,0.00,88,0,19.0,10.0,1.0,8.3,3,1
D0028,19.0,3.6,83,1010.0,0.00,73,25,19.0,10.0,1.0,8.3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
D2839,27.0,19.1,90,1014.0,0.01,70,25,29.8,10.0,6.0,45.6,1,2
D2864,21.4,3.6,178,1012.0,0.00,89,75,21.4,10.0,6.0,6.6,1,3
D2874,16.9,16.6,68,1018.0,0.00,44,2,16.9,10.0,1.0,27.0,1,1
D2887,19.5,5.4,27,1014.0,0.00,34,0,19.4,10.0,1.0,11.2,2,1


In [171]:
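# NOTE: hold-out validation, currently disabled. Uncomment this cell and the
# next two to estimate accuracy on 20% of the labelled rows before the final
# fit (add random_state to train_test_split for a reproducible split).
# X = train_dt_fil.drop('condition_id', axis=1)
# y = train_dt_fil['condition_id']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)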

In [172]:
# rf = RandomForestClassifier(n_estimators=80, random_state=42)
# rf.fit(X_train, y_train)

# y_pred = rf.predict(X_test)

In [173]:
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [174]:
# Unlabelled rows, prepared the same way as the training table: drop the
# columns that are not fed to the model and index by day_id.
test_dt_fil = (
    test_dt
    .drop(columns=['city_id', 'condition_text', 'sunrise', 'sunset'])
    .set_index('day_id')
)

test_dt_fil

Unnamed: 0_level_0,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index
day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
D0001,27.0,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2
D0002,22.0,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1
D0005,18.0,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1
D0006,20.0,3.6,96,1019.0,0.0,88,0,20.0,10.0,1.0,11.2,1
D0008,21.0,20.2,330,1011.0,0.0,53,75,21.0,10.0,1.0,17.3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
D2888,19.5,16.6,113,1015.0,0.0,38,0,19.5,10.0,1.0,26.7,1
D2890,17.4,13.0,51,1017.0,0.0,49,0,17.4,10.0,1.0,22.2,1
D2891,19.2,11.5,46,1016.0,0.0,34,0,19.2,10.0,1.0,21.3,2
D2892,19.2,14.4,76,1017.0,0.0,45,2,19.2,10.0,1.0,24.9,2


In [175]:
# Features and target for the final fit on every labelled row.
X_train = train_dt_fil.drop('condition_id', axis=1)
y_train = train_dt_fil['condition_id']

# Select exactly the training feature columns rather than aliasing
# test_dt_fil. A later cell adds a `condition_id` column to test_dt_fil; with
# a plain alias that column would leak into X_test, and re-running the
# prediction cell would hand the model a frame whose features no longer match
# the ones it was fitted on.
X_test = test_dt_fil[X_train.columns]

In [176]:
# Fit the forest on all labelled rows (fixed seed for repeatable results),
# then predict a class id for each unlabelled row.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [177]:
# Attach the predicted class ids. assign() builds a new frame instead of
# adding the column in place, so the feature frame that was passed to
# rf.predict() is left untouched and the prediction cell can be re-run.
test_dt_fil = test_dt_fil.assign(condition_id=y_pred)
test_dt_fil

Unnamed: 0_level_0,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,condition_id
day_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
D0001,27.0,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,2
D0002,22.0,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,2
D0005,18.0,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,1
D0006,20.0,3.6,96,1019.0,0.0,88,0,20.0,10.0,1.0,11.2,1,1
D0008,21.0,20.2,330,1011.0,0.0,53,75,21.0,10.0,1.0,17.3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
D2888,19.5,16.6,113,1015.0,0.0,38,0,19.5,10.0,1.0,26.7,1,1
D2890,17.4,13.0,51,1017.0,0.0,49,0,17.4,10.0,1.0,22.2,1,1
D2891,19.2,11.5,46,1016.0,0.0,34,0,19.2,10.0,1.0,21.3,2,1
D2892,19.2,14.4,76,1017.0,0.0,45,2,19.2,10.0,1.0,24.9,2,1


In [178]:
# Turn day_id back into a regular column on a fresh copy of the predictions.
test_dt_res = test_dt_fil.reset_index(drop=False)
test_dt_res

Unnamed: 0,day_id,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,condition_id
0,D0001,27.0,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,2
1,D0002,22.0,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,2
2,D0005,18.0,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,1
3,D0006,20.0,3.6,96,1019.0,0.0,88,0,20.0,10.0,1.0,11.2,1,1
4,D0008,21.0,20.2,330,1011.0,0.0,53,75,21.0,10.0,1.0,17.3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2409,D2888,19.5,16.6,113,1015.0,0.0,38,0,19.5,10.0,1.0,26.7,1,1
2410,D2890,17.4,13.0,51,1017.0,0.0,49,0,17.4,10.0,1.0,22.2,1,1
2411,D2891,19.2,11.5,46,1016.0,0.0,34,0,19.2,10.0,1.0,21.3,2,1
2412,D2892,19.2,14.4,76,1017.0,0.0,45,2,19.2,10.0,1.0,24.9,2,1


In [179]:
# Decode predicted class ids back into their condition labels; an id missing
# from cond_strngs raises KeyError.
test_dt_res['condition_text'] = test_dt_res['condition_id'].map(lambda class_id: cond_strngs[class_id])
test_dt_res

Unnamed: 0,day_id,temperature_celsius,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,condition_id,condition_text
0,D0001,27.0,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,2,Partly Cloudy
1,D0002,22.0,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,2,Partly Cloudy
2,D0005,18.0,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,1,Clear and Sunny
3,D0006,20.0,3.6,96,1019.0,0.0,88,0,20.0,10.0,1.0,11.2,1,1,Clear and Sunny
4,D0008,21.0,20.2,330,1011.0,0.0,53,75,21.0,10.0,1.0,17.3,1,2,Partly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2409,D2888,19.5,16.6,113,1015.0,0.0,38,0,19.5,10.0,1.0,26.7,1,1,Clear and Sunny
2410,D2890,17.4,13.0,51,1017.0,0.0,49,0,17.4,10.0,1.0,22.2,1,1,Clear and Sunny
2411,D2891,19.2,11.5,46,1016.0,0.0,34,0,19.2,10.0,1.0,21.3,2,1,Clear and Sunny
2412,D2892,19.2,14.4,76,1017.0,0.0,45,2,19.2,10.0,1.0,24.9,2,1,Clear and Sunny


In [180]:
# Labelled rows in the same layout as test_dt_res. reset_index keeps the old
# row labels as an `index` column.
train_dt_res = (
    train_dt
    .drop(columns=['city_id', 'sunrise', 'sunset'])
    .reset_index(drop=False)
)
train_dt_res

Unnamed: 0,index,day_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,condition_id
0,2,D0003,20.0,Light Rain with Thunder,3.6,10,1011.0,4.50,100,75,20.0,10.0,1.0,12.6,1,0
1,3,D0004,17.0,Clear and Sunny,6.1,150,1018.0,0.00,88,0,17.0,10.0,1.0,11.2,1,1
2,6,D0007,21.0,Partly Cloudy,4.0,310,1015.0,0.00,100,50,21.0,10.0,1.0,15.1,2,2
3,18,D0019,19.0,Clear and Sunny,3.6,64,1017.0,0.00,88,0,19.0,10.0,1.0,8.3,3,1
4,27,D0028,19.0,Partly Cloudy,3.6,83,1010.0,0.00,73,25,19.0,10.0,1.0,8.3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,2838,D2839,27.0,Partly Cloudy,19.1,90,1014.0,0.01,70,25,29.8,10.0,6.0,45.6,1,2
475,2863,D2864,21.4,Light Precipitation,3.6,178,1012.0,0.00,89,75,21.4,10.0,6.0,6.6,1,3
476,2873,D2874,16.9,Clear and Sunny,16.6,68,1018.0,0.00,44,2,16.9,10.0,1.0,27.0,1,1
477,2886,D2887,19.5,Clear and Sunny,5.4,27,1014.0,0.00,34,0,19.4,10.0,1.0,11.2,2,1


In [181]:
# Stack the known labels on top of the predicted ones, keeping only the two
# columns that go into the result file, and renumber the rows from 0.
out_cols = ['day_id', 'condition_text']
fin_data_1 = train_dt_res.loc[:, out_cols].copy()
fin_data_2 = test_dt_res.loc[:, out_cols].copy()

fin_data = pd.concat([fin_data_1, fin_data_2], ignore_index=True)
fin_data

Unnamed: 0,day_id,condition_text
0,D0003,Light Rain with Thunder
1,D0004,Clear and Sunny
2,D0007,Partly Cloudy
3,D0019,Clear and Sunny
4,D0028,Partly Cloudy
...,...,...
2888,D2888,Clear and Sunny
2889,D2890,Clear and Sunny
2890,D2891,Clear and Sunny
2891,D2892,Clear and Sunny


In [182]:
# Write the combined labels to disk. to_csv does not create missing folders,
# so make sure ./results exists first; otherwise a fresh checkout fails here
# after the whole notebook has already run.
results_path = Path('./results/3rd_res.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
fin_data.to_csv(results_path, index=False)