In [2]:
!pip install pandas
!pip install openpyxl
!pip install scikit-learn
!pip install tqdm

















In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [2]:
# Load data set
filename = 'IDCJAC0010_066062_1800_Data.csv'
weather_data = pd.read_csv('C:/Users/Joshua/Desktop/python_ai/sklearn_models/archive/' + filename)

# Display first row
print(weather_data.head())

# Edit columns
weather_data['Date'] = pd.to_datetime(weather_data[['Year', 'Month', 'Day']])
weather_data = weather_data.drop(columns=['Product code', 'Bureau of Meteorology station number', 'Date'])

# Create a LabelEncoder instance
le = LabelEncoder()

# Apply label encoding to the 'Quality' column
weather_data['Quality'] = le.fit_transform(weather_data['Quality'])

# Access features
X = weather_data.drop('Maximum temperature (Degree C)', axis=1)
y = weather_data['Maximum temperature (Degree C)']
y = pd.to_numeric(y, errors='coerce')
print("X values:\n", X)
print("Y values:\n", y)
print(f"Number of NaN values in y before: {y.isna().sum()}")

# Replace NaN with mean
y.fillna(y.mean(), inplace=True)

print(f"Number of NaN values in y after: {y.isna().sum()}")

  Product code  Bureau of Meteorology station number  Year  Month  Day  \
0   IDCJAC0010                                 66062  1859      1    1   
1   IDCJAC0010                                 66062  1859      1    2   
2   IDCJAC0010                                 66062  1859      1    3   
3   IDCJAC0010                                 66062  1859      1    4   
4   IDCJAC0010                                 66062  1859      1    5   

   Maximum temperature (Degree C)  \
0                            24.4   
1                            24.4   
2                            24.2   
3                            24.7   
4                            24.6   

   Days of accumulation of maximum temperature Quality  
0                                          NaN       Y  
1                                          1.0       Y  
2                                          1.0       Y  
3                                          1.0       Y  
4                                          1.0 

In [5]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create the model
rf = RandomForestRegressor(random_state=0)

# Hyperparameter ranges
param_grid = {
    'n_estimators': [100, 500, 1000],      # Number of trees in the forest
    'max_depth': [10, 20, 30, None],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],       # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum samples required at each leaf node
    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider for splits
}

# Select best hyper parameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Find best set hyperparameters
grid_search.fit(X_train, y_train)

# Select best hyperparameter
print("Best hyperparameters: ", grid_search.best_params_)
best_rf = grid_search.best_estimator_

# Predict best model
y_pred = best_rf.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best hyperparameters:  {'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 1000}
Mean Squared Error: 8.9076151203949
R² Score: 0.589621691146486


In [8]:
# Make prediction from input
input_data = {
    'Year': 2024,
    'Month': 9,
    'Day': 30,
    'Days of accumulation of maximum temperature': 1.0,
    'Quality': 1
}

# Convert to a dataframe
input_df = pd.DataFrame([input_data])

# Make prediction with model
predicted_temperature = best_rf.predict(input_df)
print(f"Predicted maximum temperature for that year: {predicted_temperature[0]:.2f} °C")

Predicted maximum temperature for that year: 22.97 °C
