In [12]:
import pandas as pd
import numpy as np

In [13]:
df_train = pd.read_csv("train.csv")
df_train

In [None]:
df_train.info()

Data visualisation

In [None]:
import seaborn as sb

cols= ["#C2C4E2","#EED4E5"]
sb.countplot(x= df_train["RainTomorrow"], palette= cols)

In [None]:
#correlation
import matplotlib.pyplot as plt

df = df_train.select_dtypes(include=['float64'])

corrmat = df.corr()
cmap = sb.diverging_palette(260,-10,s=50, l=75, n=6, as_cmap=True)
plt.subplots(figsize=(18,18))
sb.heatmap(corrmat,cmap= cmap,annot=True, square=True)

Data preprocessing

Data cleaning

In [None]:
df_train.isnull().sum()

Handling missing values

In [None]:
#replacing the missing values :
#numerical values with the median and non numerical ones with the mode

for col in df_test.columns:

    if df_train[col].isnull().sum() > 0:
        
        if df_train[col].dtype == 'float64':

            val = df_train[col].median()
        else:

            val = df_train[col].mode()[0]  #the most frequent value
        

        df_train[col] = df_train[col].fillna(val)


df_train.isnull().sum().sum()

Encoding

In [None]:
#turning non numerical values into numerical

df = df_train.select_dtypes(include=['object'])
df

Unnamed: 0,Date,City,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
0,2010-01-01,Algiers,NNE,NW,WNW,Yes,Yes
1,2010-01-02,Setif,W,S,W,Yes,No
2,2010-01-03,Sidi Bel Abbes,W,SSW,SW,No,No
3,2010-01-04,Skikda,SW,ENE,NNW,No,No
4,2010-01-05,Skikda,W,E,NW,No,No
...,...,...,...,...,...,...,...
76667,2014-12-27,Annaba,NNW,S,NNW,No,No
76668,2014-12-28,Algiers,NW,ESE,WNW,No,No
76669,2014-12-29,Sidi Bel Abbes,W,E,W,No,Yes
76670,2014-12-30,Skikda,N,E,WNW,Yes,Yes


In [None]:
df_train = pd.read_csv("train.csv")

#encoding
df_train.replace({'Yes':1, 'No':0}, inplace=True)

df_train['WindGustDir'] = pd.factorize(df_train['WindGustDir'])[0] + 1
df_train['WindDir9am'] = pd.factorize(df_train['WindDir9am'])[0] + 1
df_train['WindDir3pm'] = pd.factorize(df_train['WindDir3pm'])[0] + 1

#City:
df_train['City'] = pd.factorize(df_train['City'])[0] + 1

#Date : extracting each component
df_train['Date'] = pd.to_datetime(df_train['Date'])

#year, month, day as separate features
df_train['Year'] = df_train['Date'].dt.year
df_train['Month'] = df_train['Date'].dt.month
df_train['Day'] = df_train['Date'].dt.day
df_train = df_train.drop('Date', axis=1)

Splitting data

In [16]:
from sklearn.model_selection import train_test_split

x = df_train.drop(["RainTomorrow"], axis=1)
y = df_train["RainTomorrow"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle=False)


Balancing data

In [17]:
from imblearn.over_sampling import RandomOverSampler
# since the data is imbalanced
# balancing the data by adding repetitive rows of minority class
ros = RandomOverSampler(sampling_strategy='minority',random_state=22)
X, Y = ros.fit_resample(x_train, y_train)

(94276,)

Training the model

In [18]:
import xgboost as xgb
import optuna

from sklearn.metrics import f1_score

In [26]:
def objective(trial):
    param = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

   
    model = xgb.XGBClassifier(**param, use_label_encoder=False)
    model.fit(X, Y)

    
    y_pred = model.predict(x_test)

    f1 = f1_score(y_test, y_pred)
    return f1


In [28]:
study = optuna.create_study(direction='maximize')  # maximum f1
study.optimize(objective, n_trials=30,timeout=1800)  # sets of hyperparameters


[I 2024-10-12 19:31:22,342] A new study created in memory with name: no-name-4746d3a3-e13a-4bea-b74b-51b64873bdce

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


Parameters: { "use_label_encoder" } are not used.


[I 2024-10-12 19:31:37,260] Trial 0 finished with value: 0.6422752184861502 and parameters: {'learning_rate': 0.1491420350074782, 'max_depth': 8, 'n_estimators': 305, 'min_child_weight': 8, 'gamma': 1.551124786348805e-07, 'subsample': 0.8810855008893894, 'colsample_bytree': 0.9941482656268432}. Best is trial 0 with value: 0.6422752184861502.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://gi

In [29]:
print("Best F1 Score: ",study.best_value)
best_params = study.best_params


Best F1 Score: 0.6554919596620332
Best Hyperparameters: {'learning_rate': 0.021355413756704068, 'max_depth': 9, 'n_estimators': 216, 'min_child_weight': 4, 'gamma': 0.0016635979346675872, 'subsample': 0.6610203644235196, 'colsample_bytree': 0.8367211869715874}


In [31]:
best_xgb_model = xgb.XGBClassifier(**best_params, use_label_encoder=False)

#retraining the model that has best params
best_xgb_model.fit(X, Y)


Parameters: { "use_label_encoder" } are not used.




Evaluating the model

In [32]:
y_test_pred = best_xgb_model.predict(x_test)
test_f1 = f1_score(y_test, y_test_pred)
print("Test F1 Score: ",test_f1)

Test F1 Score: 0.6554919596620332


Predicting the test data

In [35]:
#reading the df test
df_test = pd.read_csv("test.csv")

Handling missing values

In [36]:
#replacing the missing values :
#numerical values with the median and non numerical ones with the mode

for col in df_test.columns:

    if df_test[col].isnull().sum() > 0:
        
        if df_test[col].dtype == 'float64':

            val = df_test[col].median()
        else:

            val = df_test[col].mode()[0]  #the most frequent value
        

        df_test[col] = df_test[col].fillna(val)


df_test.isnull().sum().sum()

0

Encoding

In [37]:
df_test.replace({'Yes':1, 'No':0}, inplace=True)

#print("WindGustDir: ", df['WindGustDir'].unique())

#WindGustDir   WindDir9am	WindDir3pm
df_test['WindGustDir'] = pd.factorize(df_test['WindGustDir'])[0] + 1
df_test['WindDir9am'] = pd.factorize(df_test['WindDir9am'])[0] + 1
df_test['WindDir3pm'] = pd.factorize(df_test['WindDir3pm'])[0] + 1

#City:
df_test['City'] = pd.factorize(df_test['City'])[0] + 1

#Date : extracting each component
df_test['Date'] = pd.to_datetime(df_test['Date'])

#year, month, day as separate features
df_test['Year'] = df_test['Date'].dt.year
df_test['Month'] = df_test['Date'].dt.month
df_test['Day'] = df_test['Date'].dt.day

#df_test['Season'] = df_test['Month'].apply(get_season)

df_test = df_test.drop('Date', axis=1)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



Predicting

In [38]:
test_preds = best_xgb_model.predict(df_test)

prediction_distribution = pd.Series(test_preds).value_counts()

print(prediction_distribution)

0    23767
1     8550
Name: count, dtype: int64


Saving the results

In [39]:
df_test['RainTomorrow'] = test_preds
new_df = pd.read_csv("test.csv")

df_test = df_test[['Day', 'RainTomorrow']]
df_test['Day'] = new_df['Day']
df_test['RainTomorrow'] = df_test['RainTomorrow'].replace({0: 'No', 1: 'Yes'})
df_test.to_csv('rain_pred.csv', index=False)
df_test

Unnamed: 0,Day,RainTomorrow
0,76672,No
1,76673,No
2,76674,No
3,76675,No
4,76676,No
...,...,...
32312,108984,Yes
32313,108985,Yes
32314,108986,Yes
32315,108987,Yes
