In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [21]:
# Root folder that holds the extracted archive. Set the DATASET_ROOT
# environment variable to run the notebook against a different location;
# the original hard-coded path is kept as the default.
extracted_folder_path = os.environ.get('DATASET_ROOT', 'C:/Users/91709/Downloads/Dataset')
# The CSV paths below are built relative to a nested 'Dataset' folder.
dataset_folder_path = os.path.join(extracted_folder_path, 'Dataset')



In [22]:
# Resolve the location of each CSV file inside the dataset folder
train_data_path, test_data_path, submission_data_path = (
    os.path.join(dataset_folder_path, csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'Submission.csv')
)


In [23]:
# Read the train, test and submission-template CSV files into DataFrames
train_data, test_data, submission_format = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, test_data_path, submission_data_path)
)


In [24]:
# Show which columns each dataset provides
for label, frame in (("Train Data Columns:", train_data), ("Test Data Columns:", test_data)):
    print(label, frame.columns)


Train Data Columns: Index(['Holiday', 'Temperature', 'Rainfall_last_hour', 'Snowfall_last_hour',
       'Cloud_Cover', 'Weather', 'Weather_Desc', 'TimeStamp', 'Date',
       'Traffic_Vol'],
      dtype='object')
Test Data Columns: Index(['Holiday', 'Temperature', 'Rainfall_last_hour', 'Snowfall_last_hour',
       'Cloud_Cover', 'Weather', 'Weather_Desc', 'TimeStamp', 'Date',
       'Traffic_Vol'],
      dtype='object')


In [25]:
def preprocess_data(train, test):
    """Clean and encode the train and test frames together.

    The two frames are stacked so that one-hot encoding produces the same
    columns for both, processed as one frame, and then split apart again.

    Processing steps:
      1. Parse 'TimeStamp' and replace it with 'hour', 'day', 'month' and
         'weekday' columns.
      2. Fill missing numeric values with the column median and missing
         values in every other column with the column's most frequent value.
      3. One-hot encode the categorical columns (first level dropped).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that each contain a 'TimeStamp' column accepted by
        ``pd.to_datetime``. Neither input frame is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` after preprocessing. Rows keep the index they had
        in the stacked frame, so the test rows are numbered after the train
        rows.

    Notes
    -----
    NOTE(review): the fill values are computed over train and test together,
    and every numeric column is imputed, including a target column if one is
    present. Confirm this is intended.
    """
    # Tag each row's origin on copies so the caller's frames stay untouched.
    combined = pd.concat(
        [train.assign(is_train=1), test.assign(is_train=0)],
        ignore_index=True,
    )

    # Derive calendar features from the timestamp, then drop the raw column.
    timestamps = pd.to_datetime(combined['TimeStamp'])
    combined = combined.assign(
        hour=timestamps.dt.hour,
        day=timestamps.dt.day,
        month=timestamps.dt.month,
        weekday=timestamps.dt.weekday,
    ).drop(columns=['TimeStamp'])

    # numeric_only=True restricts the median to numeric columns; without it,
    # pandas >= 2.0 raises while text columns are still present.
    combined = combined.fillna(combined.median(numeric_only=True))

    # Fill the remaining (non-numeric) columns with their most frequent value.
    # Assigning the result back avoids chained in-place fillna, which does not
    # update the frame under pandas Copy-on-Write.
    for column in combined.select_dtypes(exclude=['number', 'bool']).columns:
        modes = combined[column].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            combined[column] = combined[column].fillna(modes.iloc[0])

    # One-hot encode categorical variables
    combined = pd.get_dummies(combined, drop_first=True)

    # Split back into train and test sets using the origin flag
    train_out = combined[combined['is_train'] == 1].drop(columns=['is_train'])
    test_out = combined[combined['is_train'] == 0].drop(columns=['is_train'])

    return train_out, test_out


   



In [26]:
# Run the shared preprocessing step; the returned frames replace the raw ones
train_data, test_data = preprocess_data(train=train_data, test=test_data)


  combined.fillna(combined.median(), inplace=True)


In [27]:

# Separate the target from the feature columns
target_column = 'Traffic_Vol'
y_train = train_data[target_column]
X_train = train_data.drop(columns=[target_column])

# The test frame must carry feature columns only; drop the target if present
test_data = test_data.drop(columns=[target_column], errors='ignore')




In [28]:
# Hold out 20% of the training rows for validation, with a fixed seed.
# The training names are rebound to the reduced split, so re-running this
# cell splits the already-reduced data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [29]:
# Fit a 100-tree random forest regressor with a fixed seed
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)



In [30]:
# Score the held-out split with root-mean-squared error
val_predictions = model.predict(X_val)
val_mse = mean_squared_error(y_val, val_predictions)
rmse = np.sqrt(val_mse)
print(f"Validation RMSE: {rmse}")



Validation RMSE: 570.3224468896766


In [34]:
# Predict on the test dataset
test_predictions = model.predict(test_data)

# Rebuild the submission frame from the file and drop its first column.
# Starting from a fresh read, rather than from the current submission_format,
# keeps this cell idempotent: re-running it no longer strips one more column
# each time.
submission_format = pd.read_csv(submission_data_path)
submission_format = submission_format.drop(columns=submission_format.columns[:1])


In [35]:
# Attach the predictions as the 'traffic_volume' column of the submission frame
submission_format = submission_format.assign(traffic_volume=test_predictions)



In [37]:
# Folder that receives the submission file
output_dir = 'C:/Users/91709/Downloads/Dataset'
# exist_ok=True creates the folder only when it is missing, without a
# separate existence check that another process could invalidate.
os.makedirs(output_dir, exist_ok=True)

# Write the submission without the DataFrame index and report where it went
output_path = os.path.join(output_dir, 'submission.csv')
submission_format.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")



Predictions saved to C:/Users/91709/Downloads/Dataset\submission.csv
