# Binary Prediction with a Rainfall Dataset

https://www.kaggle.com/competitions/playground-series-s5e3/overview

### Goal
Predict rainfall for each day of the year

In [147]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load Dataset

In [148]:
train_path = "/content/train.csv"
test_path = "/content/test.csv"

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

(729, 12)

In [149]:
df_train.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [150]:
df_test.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


# EDA

In [151]:
def outliers(df: pd.DataFrame)->None:
  for col in df.select_dtypes(include=['number']).columns:
    if col == 'id':
      continue

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3-Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] <lower_bound) | (df[col] >upper_bound)][col]

    if len(outliers) > 0:
      print(f"'{col}': {outliers.value_counts()}\n")


def data_eda(df: pd.DataFrame)->None:
  print(f'Shape: {df.shape}\n\n')
  print(f'Columns: {df.columns}\n\n')
  print(f'Null : {df.isnull().sum()}\n\n')
  print(f'Duplicate : {df.duplicated().sum()}\n\n')
  print(f'Outliers : {outliers(df)}\n\n')
  print(f'Info : {df.info()}\n\n')
  print(f'Describe : {df.describe()}\n\n')

print(f'Train Data:')
data_eda(df_train)
print(f'Test Data:')
data_eda(df_test)

Train Data:
Shape: (2190, 13)


Columns: Index(['id', 'day', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection',
       'windspeed', 'rainfall'],
      dtype='object')


Null : id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64


Duplicate : 0


'pressure': pressure
1034.6    3
1032.3    1
Name: count, dtype: int64

'mintemp': mintemp
4.0    1
Name: count, dtype: int64

'dewpoint': dewpoint
 2.0    2
 2.7    2
 3.4    2
 4.0    2
 4.3    2
 2.3    1
 4.4    1
 0.2    1
 3.8    1
 1.0    1
 2.5    1
 2.2    1
 3.1    1
 3.2    1
 1.1    1
 3.7    1
 1.7    1
 2.4    1
 3.9    1
-0.3    1
 3.5    1
Name: count, dtype: int64

'humidity': humidity
59.0    9
58.0    5
52.0    4
46.0    2
60.0    2
56.0    1
47.0    1
54.

In [152]:
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

### Feature Engineering

In [228]:
def drop_unrequire_features(df:pd.DataFrame)->pd.DataFrame:
  columns_to_drop=["id", "maxtemp", "mintemp", "temparature", "humidity", "dewpoint","winddirection"]
  return df.drop(columns_to_drop, axis=1)

def create_new_feature(df:pd.DataFrame)->pd.DataFrame:
  df = df.copy()
  df["temp_range"] = df["maxtemp"] - df["mintemp"]
  df["dewpoint_depression"] = df["maxtemp"] - df["dewpoint"]
  df["humidity_dewpoint"] = df["humidity"]* df["dewpoint"]
  df["wind_x"] = df["windspeed"]* np.cos(np.radians(df['winddirection']))
  df["wind_y"] = df["windspeed"]* np.sin(np.radians(df['winddirection']))

  return drop_unrequire_features(df)

dffe_train = df_train.drop(["id", "day"], axis=1)#create_new_feature(df_train)
dffe_test = df_test.drop(["id", "day"], axis=1) #create_new_feature(df_test)

In [229]:
dffe_train.head()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [230]:
dffe_test.head()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [231]:
fetures = [col for col in dffe_train.columns if col !="rainfall"]
label = "rainfall"

X_train=dffe_train[fetures]
Y_train = dffe_train[label]
X_test = dffe_test

### Data scalling

In [232]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Split train, val data

In [233]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(X_train_scaled, Y_train, shuffle=True, test_size=0.25, random_state=9)

### Train model

In [234]:
from xgboost import XGBRFRegressor
from sklearn.metrics import roc_auc_score

xgb_model = XGBRFRegressor(n_estimators=100,tree_method="hist",subsample=0.8,colsample_bytree=0.6,learning_rate=0.001, random_state=42)
xgb_model.fit(x_train, y_train)

In [235]:
y_val_pred_xgb = xgb_model.predict(x_val)
auc_score_xgb = roc_auc_score(y_val, y_val_pred_xgb)
print(f"Validation AUC-ROC: {auc_score_xgb:.4f}")

Validation AUC-ROC: 0.8901


In [236]:
y_test_pred_xgb = xgb_model.predict(X_test)

In [237]:
y_test_pred_xgb = np.where(y_test_pred_xgb > 0.5, 1, 0)

In [238]:
y_test_pred_xgb

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [240]:
test_ids = df_test["id"]

# Create submission DataFrame
submission = pd.DataFrame({"id": test_ids, "rainfall": y_test_pred_xgb})
submission.to_csv("sample_submission2.csv", index=False)

In [245]:
submission["rainfall"].unique()

array([1])