# Weather prediction system

First we have to get the dataset

In [1]:
import pandas as pd

# Location of the raw CSV, relative to the notebook's working directory.
file = 'data/weatherHistory.csv'

# Read the whole file into memory and keep a handle on its column index
# so the available fields can be inspected before choosing features.
df = pd.read_csv(filepath_or_buffer=file)
columns = df.columns
print(columns)

Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')


As you can see there are 12 columns in the dataset. We need to simplify this data for our prediction model. We will use only the following columns: **Precip Type, Temperature (C), Apparent Temperature (C), Humidity, Wind Speed (km/h), Wind Bearing (degrees), Visibility (km), Pressure (millibars)**

In [2]:
# The target column ('Precip Type') followed by the seven numeric measurements
# kept for modelling.
selected_columns = [
    'Precip Type',
    'Temperature (C)',
    'Apparent Temperature (C)',
    'Humidity',
    'Wind Speed (km/h)',
    'Wind Bearing (degrees)',
    'Visibility (km)',
    'Pressure (millibars)',
]

# Selecting with a list of labels yields a new frame holding just those columns.
df_selected = df[selected_columns]
df_selected

Unnamed: 0,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,1015.13
1,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,1015.63
2,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,1015.94
3,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,1016.41
4,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,1016.51
...,...,...,...,...,...,...,...,...
96448,rain,26.016667,26.016667,0.43,10.9963,31.0,16.1000,1014.36
96449,rain,24.583333,24.583333,0.48,10.0947,20.0,15.5526,1015.16
96450,rain,22.038889,22.038889,0.56,8.9838,30.0,16.1000,1015.66
96451,rain,21.522222,21.522222,0.60,10.5294,20.0,16.1000,1015.95


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Load and clean -------------------------------------------------------
# Read the CSV, discard the columns that are not used as features, then drop
# every row that still contains a missing value.
file_path = 'data/weatherHistory.csv'
data = (
    pd.read_csv(file_path)
    .drop(columns=['Formatted Date', 'Summary', 'Daily Summary', 'Loud Cover'])
    .dropna()
)

# --- Encode the target ----------------------------------------------------
# Replace the string labels in 'Precip Type' with integer codes; `le` is kept
# so the codes can be mapped back to their original labels later.
le = LabelEncoder()
data['Precip Type'] = le.fit_transform(data['Precip Type'])

# --- Features / target ----------------------------------------------------
y = data['Precip Type']
X = data.drop(columns=['Precip Type'])

# --- Train / test split (80 % / 20 %, fixed seed) -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Standardise features -------------------------------------------------
# The scaler is fitted on the training portion only and then applied to both
# portions, so the held-out rows do not influence its statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Fit the random forest and predict on the held-out rows ---------------
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Evaluate -------------------------------------------------------------
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

# Function to make predictions on new data
def predict_precip_type(new_data):
    """Predict the precipitation type for new observations.

    Uses the module-level ``scaler``, ``model`` and ``le`` objects, so the
    training cell must have been executed first.

    Parameters
    ----------
    new_data : pandas.DataFrame or array-like
        Feature values for the rows to classify. A DataFrame may list its
        columns in any order and may carry extra columns: when the scaler
        recorded feature names at fit time, the frame is re-ordered and
        trimmed to exactly those columns. Input without a ``columns``
        attribute is passed to the scaler unchanged.

    Returns
    -------
    numpy.ndarray
        The predicted labels, decoded back to their original values with
        ``le.inverse_transform``.

    Raises
    ------
    ValueError
        If ``new_data`` is a DataFrame lacking one or more of the feature
        columns recorded by the scaler.
    """
    # `feature_names_in_` is only present when the scaler was fitted on input
    # with string column names; fall back to the untouched input otherwise.
    expected = getattr(scaler, 'feature_names_in_', None)
    if expected is not None and hasattr(new_data, 'columns'):
        missing = [name for name in expected if name not in new_data.columns]
        if missing:
            raise ValueError(f'new_data is missing feature columns: {missing}')
        # Align to the training column order so the scaler's feature-name
        # check does not reject a frame that is merely ordered differently.
        new_data = new_data.loc[:, list(expected)]

    # Apply the same standardisation that was fitted on the training data.
    new_data_scaled = scaler.transform(new_data)
    predictions = model.predict(new_data_scaled)
    return le.inverse_transform(predictions)

# Demonstration: classify two hand-written observations.
# Each record is one observation; the keys are the feature columns, in the
# same order as the training features.
new_data = pd.DataFrame([
    {
        'Temperature (C)': 50.0,
        'Apparent Temperature (C)': 8.0,
        'Humidity': 0.9,
        'Wind Speed (km/h)': 10.0,
        'Wind Bearing (degrees)': 200,
        'Visibility (km)': 15.0,
        'Pressure (millibars)': 1015.0,
    },
    {
        'Temperature (C)': 25.0,
        'Apparent Temperature (C)': -7.0,
        'Humidity': 0.8,
        'Wind Speed (km/h)': 5.0,
        'Wind Bearing (degrees)': 100,
        'Visibility (km)': 5.0,
        'Pressure (millibars)': 1020.0,
    },
])

predictions = predict_precip_type(new_data)
print(predictions)


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

        rain       1.00      1.00      1.00     17090
        snow       1.00      1.00      1.00      2098

    accuracy                           1.00     19188
   macro avg       1.00      1.00      1.00     19188
weighted avg       1.00      1.00      1.00     19188

['rain' 'rain']
