In [17]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data.csv')

# Display the first few rows of the dataset
print(df.head())


   YEAR  MO  DY  HR   T2M  PRECTOTCORR   RH2M  WS10M      PS     REF
0  2021   3  31  18  2.87          0.0  59.62   7.72  100.30  202103
1  2021   3  31  19  2.68          0.0  62.12   7.64  100.38  202103
2  2021   3  31  20  2.34          0.0  66.19   7.88  100.44  202103
3  2021   3  31  21  1.88          0.0  69.12   8.09  100.48  202103
4  2021   3  31  22  1.54          0.0  67.50   8.28  100.52  202103


In [18]:
# Convert PRCP column to binary (1 for rain, 0 for no rain)
df['RAIN'] = (df['PRCP'] > 0).astype(int)

# Define function to classify weather conditions
def classify_weather(row):
    if row['RAIN'] == 1:
        return 'rainy'
    elif row['TEMP'] <= 1:
        return 'cold'
    elif row['WND_SPD'] > 4:
        return 'windy'
    else:
        return 'sunny'

# Apply the function to create a new column for weather conditions
df['WEATHER_CONDITION'] = df.apply(classify_weather, axis=1)

# Display the updated DataFrame
print(df.head())

KeyError: 'PRCP'

In [6]:
# Count the number of occurrences of each weather condition
weather_counts = df['WEATHER_CONDITION'].value_counts()

# Display the counts
print(weather_counts)


KeyError: 'WEATHER_CONDITION'

In [7]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame containing the dataset
# Generate synthetic data points for "sunny"
snowy = df[df['WEATHER_CONDITION'] == 'cold'].sample(n=1000, replace=True)

# Concatenate the additional data points with the original dataset
df = pd.concat([df, snowy], ignore_index=True)


KeyError: 'WEATHER_CONDITION'

In [8]:
# Count the number of occurrences of each weather condition
weather_counts = df['WEATHER_CONDITION'].value_counts()

# Display the counts
print(weather_counts)


KeyError: 'WEATHER_CONDITION'

In [47]:
def classify_weather_balanced(row):
    if row['RAIN'] == 1:
        return 'rainy'
    elif row['TEMP'] <= 1 and row['RAIN'] == 0:
        return 'cold'
    elif row['WND_SPD'] > 4 and row['RAIN'] == 0:
        return 'windy'
    elif row['TEMP'] > 1 and row['RAIN'] == 0:
        return 'sunny'
    else:
        return 'snowy'

# Apply the function to create a new column for weather conditions
df['WEATHER_CONDITION_BALANCED'] = df.apply(classify_weather_balanced, axis=1)

# Display the updated DataFrame
print(df['WEATHER_CONDITION_BALANCED'].value_counts())

WEATHER_CONDITION_BALANCED
windy    8551
rainy    8139
sunny    4968
cold     3450
Name: count, dtype: int64


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Select features and target variable
X = df[['TEMP', 'HMDT', 'WND_SPD']]
y = df['WEATHER_CONDITION']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Logistic Regression model
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.6704500199123855


In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Select features and target variable
X = df[['TEMP', 'HMDT', 'WND_SPD']]
y = df['WEATHER_CONDITION']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Decision Tree classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions on the test set
dt_y_pred = dt_model.predict(X_test)

# Evaluate the Decision Tree model
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print(f"Decision Tree Accuracy: {dt_accuracy}")

# Initialize and train a Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
rf_y_pred = rf_model.predict(X_test)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f"Random Forest Accuracy: {rf_accuracy}")


Decision Tree Accuracy: 0.7158502588610115
Random Forest Accuracy: 0.764436479490243


In [50]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Select features and target variable
X = df[['TEMP', 'HMDT', 'WND_SPD']]
y = df['WEATHER_CONDITION']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train an SVM classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the SVM model
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy: {accuracy}")


SVM Accuracy: 0.6917562724014337


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Select features and target variable
X = df[['TEMP', 'HMDT', 'WND_SPD']]
y = df['WEATHER_CONDITION_BALANCED']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Gaussian Naive Bayes classifier
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy: {accuracy}")


Naive Bayes Accuracy: 0.6730386300278773
