**Regression Model**

In [54]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load the dataset
data = pd.read_csv('model_training/weatherHistory.csv')

In [55]:
# Display the first few rows of the dataset
print(data.head())

                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   
3  2006-04-01 03:00:00.000 +0200  Partly Cloudy        rain         8.288889   
4  2006-04-01 04:00:00.000 +0200  Mostly Cloudy        rain         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                   251.0          15.8263         0.0               1015.13  

In [56]:
# Display summary information about the dataset
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  float64
 8   Visibility (km)           96453 non-null  float64
 9   Loud Cover                96453 non-null  float64
 10  Pressure (millibars)      96453 non-null  float64
 11  Daily Summary             96453 non-null  object 
dtypes: float64(8), object(4)
memory usage: 8.8+ MB
None
       Temperature (C)  Apparent Temperature (C)      Humidity  

In [57]:
# Selecting features and target variable
features = data.drop(columns=['Temperature (C)'])  # Use the correct column name
target = data['Temperature (C)']  # Use the correct column name

In [58]:
# Drop non-numeric columns from features
features = features.drop(columns=['Formatted Date', 'Summary', 'Daily Summary'])

In [59]:
# Handle 'Snow' as a new category
if 'Snow' in features['Precip Type'].unique():
    # 'Snow' is already present, so no need to do anything
    print("Snow is already present as a category.")
else:
    # 'Snow' is not present, so add it as a new category
    features['Precip Type'].replace(np.nan, 'Snow', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features['Precip Type'].replace(np.nan, 'Snow', inplace=True)


In [60]:
# Encode categorical variables
label_encoder = LabelEncoder()
features['Precip Type'] = label_encoder.fit_transform(features['Precip Type'])

In [61]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [62]:
# Standardize the feature variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [63]:
# Initialize the model
model = LinearRegression()

In [64]:
# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions on the training set
y_train_pred = model.predict(X_train_scaled)

# Make predictions on the testing set
y_test_pred = model.predict(X_test_scaled)

In [65]:
# Evaluate the model
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = train_mse ** 0.5
train_r2 = r2_score(y_train, y_train_pred)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = test_mse ** 0.5
test_r2 = r2_score(y_test, y_test_pred)

In [66]:
print(f"Training Mean Squared Error: {train_mse}")
print(f"Training Root Mean Squared Error: {train_rmse}")
print(f"Training R² Score: {train_r2}")

print("")

print(f"Testing Mean Squared Error: {test_mse}")
print(f"Testing Root Mean Squared Error: {test_rmse}")
print(f"Testing R² Score: {test_r2}")

Training Mean Squared Error: 0.900626502893674
Training Root Mean Squared Error: 0.9490134366244105
Training R² Score: 0.9901028256979006

Testing Mean Squared Error: 0.8980960562126761
Testing Root Mean Squared Error: 0.9476793002976672
Testing R² Score: 0.9902549574687834


In [67]:
# Save the model and scaler
joblib.dump(model, 'linear_regression_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']