In [None]:
!pip install pandas scikit-learn




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


In [None]:
# Load the competition files from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Quick look at the first rows of the training data.
print(train.head())

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
train.info()


   Day        Date            City  MinTemp  MaxTemp  Rainfall  Evaporation  \
0    0  2010-01-01         Algiers     19.4     31.9       5.0          NaN   
1    1  2010-01-02           Setif     18.6     29.1      12.4          NaN   
2    2  2010-01-03  Sidi Bel Abbes     12.2     29.7       0.0          NaN   
3    3  2010-01-04          Skikda     14.8     32.8       0.0          NaN   
4    4  2010-01-05          Skikda     15.0     35.8       0.0          NaN   

   Sunshine WindGustDir  WindGustSpeed  ... Humidity9am Humidity3pm  \
0       NaN         NNE           39.0  ...        70.0        40.0   
1       NaN           W           56.0  ...        88.0        48.0   
2       NaN           W           30.0  ...        57.0        32.0   
3       NaN          SW           30.0  ...        55.0        24.0   
4       NaN           W           46.0  ...        46.0        13.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1012.2    

In [None]:
# Split the columns by dtype; the target is excluded from the categorical set
# so that it is never imputed.
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train.select_dtypes(include=['object']).columns.drop('RainTomorrow')

# Numeric gaps: learn the per-column median on train only, apply to both frames.
imputer_num = SimpleImputer(strategy='median')
imputer_num.fit(train[numerical_cols])
train[numerical_cols] = imputer_num.transform(train[numerical_cols])
test[numerical_cols] = imputer_num.transform(test[numerical_cols])

# Categorical gaps: learn the per-column mode on train only, apply to both frames.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(train[categorical_cols])
train[categorical_cols] = imputer_cat.transform(train[categorical_cols])
test[categorical_cols] = imputer_cat.transform(test[categorical_cols])

# Remaining missing values per column, train first and then test.
for frame in (train, test):
    print(frame.isnull().sum())


Day              0
Date             0
City             0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64
Day              0
Date             0
City             0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
dtype: int64


In [None]:
# Encode the target. `le` is reserved for RainTomorrow so that it can later
# map model predictions back to the original labels with inverse_transform.
le = LabelEncoder()
train['RainTomorrow'] = le.fit_transform(train['RainTomorrow'])

# RainToday gets its own encoder: refitting `le` here would replace the
# RainTomorrow classes and make the later inverse_transform depend on both
# columns happening to share the same label set.
rain_today_encoder = LabelEncoder()
if 'RainToday' in train.columns:
    train['RainToday'] = rain_today_encoder.fit_transform(train['RainToday'])
    if 'RainToday' in test.columns:
        test['RainToday'] = rain_today_encoder.transform(test['RainToday'])

one_hot_cols = ['City', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

# Train defines the feature space: one dummy per level, minus the first level.
train = pd.get_dummies(train, columns=one_hot_cols, drop_first=True)

# Test keeps ALL of its levels here. With drop_first=True pandas would drop the
# first level present in *test*, which is not necessarily the level dropped in
# train; after alignment such rows would be silently encoded as the baseline.
test = pd.get_dummies(test, columns=one_hot_cols)

# Align test to train's feature columns (the target is excluded). Dummies
# missing from test are filled with 0; columns unknown to train are discarded.
test = test.reindex(columns=train.columns.drop('RainTomorrow'), fill_value=0)

In [None]:
# Target vector and feature matrix (the raw date string is not used as a feature).
y = train['RainTomorrow']
X = train.drop(columns=['RainTomorrow', 'Date'])

print("Features after dropping 'RainTomorrow' and 'Date':")
print(X.columns)

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Validation data shape: {X_val.shape}, Validation labels shape: {y_val.shape}")


Features after dropping 'RainTomorrow' and 'Date':
Index(['Day', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainToday', 'City_Annaba', 'City_Batna',
       'City_Bejaia', 'City_Blida', 'City_Constantine', 'City_Oran',
       'City_Setif', 'City_Sidi Bel Abbes', 'City_Skikda', 'WindGustDir_ENE',
       'WindGustDir_ESE', 'WindGustDir_N', 'WindGustDir_NE', 'WindGustDir_NNE',
       'WindGustDir_NNW', 'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE',
       'WindGustDir_SSE', 'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W',
       'WindGustDir_WNW', 'WindGustDir_WSW', 'WindDir9am_ENE',
       'WindDir9am_ESE', 'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE',
       'WindDir9am_NNW', 'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE',
       'WindDir9am_SSE', 'WindDir9am_SSW', 'WindDir9am_S

In [None]:
# Baseline: random forest with default hyper-parameters and a fixed seed.
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the hold-out split with F1.
y_pred = model.predict(X_val)
f1 = f1_score(y_val, y_pred)
print(f'F1 Score on the validation set: {f1:.4f}')


F1 Score on the validation set: 0.6136


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# Oversample the training split only; the validation split stays untouched
# so the score below is measured on the original rows.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

# Gradient-boosted trees trained on the resampled data.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)

# Evaluate on the validation split.
y_val_pred = xgb.predict(X_val)
f1 = f1_score(y_val, y_val_pred)
print("F1 Score on the validation set: {:.4f}".format(f1))


F1 Score on the validation set: 0.6355


In [None]:
# Give the test frame exactly the columns (and column order) the model was
# trained on. X contains no 'Date' column, so this also removes it from test;
# a separate drop is unnecessary.
test = test.reindex(columns=X.columns, fill_value=0)

# NOTE(review): `model` is the random forest; the XGBoost model `xgb` reported
# the higher validation F1 in this notebook. Confirm which one should be submitted.
predictions = model.predict(test)

# NOTE(review): `le` must hold the RainTomorrow classes at this point. Confirm
# it has not been refitted on another column before this cell runs.
predictions_labels = le.inverse_transform(predictions)

submission_df = pd.DataFrame({
    # The median imputer returned the integer Day id as float (e.g. 76672.0);
    # cast it back so the submission file contains integer ids.
    'Day': test['Day'].astype('int64').values,
    'RainTomorrow': predictions_labels
})

submission_df.to_csv('submission.csv', index=False)

print(submission_df.head(20))


        Day RainTomorrow
0   76672.0           No
1   76673.0           No
2   76674.0           No
3   76675.0           No
4   76676.0           No
5   76677.0           No
6   76678.0           No
7   76679.0           No
8   76680.0           No
9   76681.0          Yes
10  76682.0           No
11  76683.0           No
12  76684.0          Yes
13  76685.0           No
14  76686.0           No
15  76687.0           No
16  76688.0           No
17  76689.0           No
18  76690.0           No
19  76691.0           No


In [None]:
# Sanity check: (number of rows, number of columns) of the submission frame.
submission_shape = submission_df.shape
print(submission_shape)


(32317, 2)
