In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Read the raw daily observations; `data` is kept untouched so the original
# values (e.g. day_id) remain available later.
data = pd.read_csv('daily_data.csv')

# Fill every gap with the most frequent value of its column. The imputer is
# applied to the whole frame, so any missing 'condition_text' entries are
# filled as well.
imputer = SimpleImputer(strategy='most_frequent')
imputed_values = imputer.fit_transform(data)
data_filled = pd.DataFrame(imputed_values, columns=data.columns)

# Columns that hold continuous measurements and must be float for scaling.
numerical_columns = [
    'temperature_celsius',
    'wind_kph',
    'wind_degree',
    'pressure_mb',
    'precip_mm',
    'humidity',
    'cloud',
    'feels_like_celsius',
    'visibility_km',
    'uv_index',
    'gust_kph',
    'air_quality_us-epa-index',
]

# Cast just those columns to float and leave every other column as it is
# (presumably they are object-typed after imputation; verify if in doubt).
data_filled = data_filled.astype(dict.fromkeys(numerical_columns, float))

# Label-encode every object-typed column except the target and the
# identifier/time columns, keeping each fitted encoder keyed by column name.
skip_columns = ['condition_text', 'day_id', 'sunrise', 'sunset']
label_encoders = {}
object_columns = data_filled.select_dtypes(include=['object']).columns
for name in object_columns:
    if name in skip_columns:
        continue
    encoder = LabelEncoder()
    data_filled[name] = encoder.fit_transform(data_filled[name])
    label_encoders[name] = encoder

# The target gets its own encoder so integer class ids can be turned back
# into condition strings with inverse_transform.
target_encoder = LabelEncoder()
encoded_target = target_encoder.fit_transform(data_filled['condition_text'])
data_filled['condition_text'] = encoded_target

# Split first, then normalize, so that no validation information leaks into
# the preprocessing.
from sklearn.model_selection import train_test_split

# Rows whose label was missing in the raw file only carry the imputer's
# most-frequent placeholder; they must not be used as ground truth for
# training or validation. When no label is missing this selects every row
# and the split below is identical to splitting the full frame.
has_label = data['condition_text'].notna().to_numpy()
labeled_rows = np.flatnonzero(has_label)

# Split positional row indices (same 80/20 shuffle, same seed) so the scaler
# can be fitted on the training rows before any frame is transformed.
train_rows, val_rows = train_test_split(labeled_rows, test_size=0.2, random_state=42)

# Normalize numerical features: learn mean/std from the training rows only,
# then apply that single transform to every row (train, validation and the
# rows that are only predicted later).
scaler = StandardScaler()
scaler.fit(data_filled.iloc[train_rows][numerical_columns])
data_filled[numerical_columns] = scaler.transform(data_filled[numerical_columns])

# Separate features and target
X = data_filled.drop(columns=['condition_text', 'day_id', 'sunrise', 'sunset'])
y = data_filled['condition_text']

# Materialize the training and validation sets from the row indices
X_train, X_val = X.iloc[train_rows], X.iloc[val_rows]
y_train, y_val = y.iloc[train_rows], y.iloc[val_rows]

# Create a neural network
# The softmax layer must have exactly one unit per encoded class; take the
# count from the fitted target encoder instead of hard-coding it, otherwise
# labels outside the fixed range break sparse_categorical_crossentropy.
num_classes = len(target_encoder.classes_)

# Three Dense -> BatchNormalization -> Dropout stages of decreasing width,
# followed by the softmax classifier. The input width is declared with an
# explicit Input layer rather than `input_shape=` on the first Dense layer.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),  # one unit per weather condition
])

# Stop once validation loss has not improved for 10 epochs and roll the
# model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Integer class labels, hence the sparse variant of cross-entropy.
model.compile(
    optimizer='SGD',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for at most 100 epochs, validating on the held-out split each epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Report accuracy on the validation split.
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation accuracy: {accuracy}')

# Score every row of the feature matrix (not only rows with missing labels).
predictions = model.predict(X)

# Pick the highest-probability class per row and map the integer ids back to
# the original condition strings.
predicted_class_ids = np.argmax(predictions, axis=1)
predicted_labels = target_encoder.inverse_transform(predicted_class_ids)

# Write one (day_id, condition_text) pair per row; day_id comes from the raw
# frame as it was read from the CSV.
submission_columns = {
    'day_id': data['day_id'],
    'condition_text': predicted_labels,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Validation accuracy: 0.8981001973152161
