In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn

### Pre-processing

In [2]:
# Reading the Training Data
# NOTE(review): the absolute "/content/..." path looks like a Colab upload
# location — confirm; the notebook only runs where the CSV exists at this path.
df = pd.read_csv("/content/train_set_dirty.csv")

In [None]:
# 1. Displaying the first 10 records
# Left as the cell's last expression so the notebook renders the frame.
df.head(10)

In [None]:
# 2. Inspect the raw frame: dtypes and non-null counts, then the number of
#    columns, their names, and the null count per column.
df.info()

feature_names = df.columns
n_features = len(feature_names)
print(n_features)
print(feature_names)

# One print call, two lines of output: the heading, then the per-column counts
print("Null counts for each column:", df.isna().sum(), sep="\n")

In [None]:
# 3a. Pre-processing (Cleaning): Address missing (NULL) values - drop or imputation
# Check number and percentage of missing values
def check_missing_values(df):
    """Print the count and percentage of missing values per column.

    Only columns that contain at least one null are listed. The percentage
    is relative to the number of rows in ``df`` and rounded to two decimals.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The summary is written to standard output.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Percentage': null_share.round(2)
    })
    has_nulls = summary['Missing Values'] > 0

    print("Missing Values Analysis:")
    print(summary[has_nulls])

# Report the columns of the raw training frame that contain nulls
check_missing_values(df)

In [6]:
# Check for exact duplicates (all columns must match exactly).
# `duplicates` holds only the repeat occurrences, i.e. the rows that
# drop_duplicates() removes below.
duplicates = df[df.duplicated(keep='first')]

# Alternative: compare only the columns that matter most, e.g.
#   important_columns = ['traffic_volume', 'temp', 'rain_1h', 'snow_1h',
#                        'clouds_all', 'date_time', 'weather_description']
#   duplicates = df[df.duplicated(subset=important_columns, keep='first')]

# Every row that takes part in a duplication (originals and repeats alike).
# Kept in a variable so it can be inspected, instead of being computed and
# discarded in the middle of the cell.
duplicate_groups = df[df.duplicated(keep=False)].sort_index()

print("Original shape:", df.shape)
df_no_duplicates = df.drop_duplicates()
# df_no_duplicates is already de-duplicated; no second drop_duplicates() needed
print("Shape after dropping duplicates:", df_no_duplicates.shape)

Original shape: (38563, 9)
Shape after dropping duplicates: (38550, 9)


In [7]:
# 3d. Pre-processing (Encoding): Convert categorical values to numeric
# Use df['col_name'].value_counts() to find out all the categories available per column

In [None]:
from sklearn.preprocessing import LabelEncoder
def prepare_features(df_no_duplicates, encoders=None, datetime_format=None):
    """Add datetime-derived and label-encoded columns to a copy of the input.

    The input frame is not modified. The returned copy gains ``hour``,
    ``day``, ``month`` and ``day_of_week`` (from ``date_time``) and one
    ``<column>_encoded`` integer column for each of ``weather_main``,
    ``weather_description`` and ``holiday``. Missing categorical values are
    encoded as the label ``'Missing'``.

    Args:
        df_no_duplicates: DataFrame containing ``date_time`` and the three
            categorical columns listed above.
        encoders: Optional dict mapping column name -> fitted LabelEncoder.
            Columns present in the dict are transformed with the stored
            encoder; columns absent from it are fitted here and the fitted
            encoder is stored back into the dict. Passing the same dict for
            training and test data therefore yields consistent codes.
            ``None`` (default) fits fresh encoders and discards them, which
            is the original behaviour.
        datetime_format: Optional ``strftime``-style format passed to
            ``pd.to_datetime``. ``None`` (default) lets pandas infer it.
            NOTE(review): the submission cell parses the test set as
            '%d-%m-%Y %H:%M'; if the training file uses the same layout,
            pass it here so day and month cannot be swapped — confirm.

    Returns:
        The processed copy of the input DataFrame.

    Raises:
        ValueError: if a stored encoder meets a label it was not fitted on.
    """
    # Work on a copy so the caller's frame stays untouched
    df_processed = df_no_duplicates.copy()

    # Decompose the timestamp into numeric calendar features
    df_processed['date_time'] = pd.to_datetime(df_processed['date_time'], format=datetime_format)
    df_processed['hour'] = df_processed['date_time'].dt.hour
    df_processed['day'] = df_processed['date_time'].dt.day
    df_processed['month'] = df_processed['date_time'].dt.month
    df_processed['day_of_week'] = df_processed['date_time'].dt.dayofweek

    # Label-encode the categorical columns, one encoder per column
    categorical_columns = ['weather_main', 'weather_description', 'holiday']
    if encoders is None:
        encoders = {}

    for col in categorical_columns:
        values = df_processed[col].fillna('Missing')
        if col not in encoders:
            # First sighting of this column: fit and remember the encoder
            encoders[col] = LabelEncoder().fit(values)
        df_processed[f'{col}_encoded'] = encoders[col].transform(values)

    # Alternative: one-hot encoding
    #df_processed = pd.get_dummies(df_processed, columns=categorical_columns)

    return df_processed

# Build the engineered frame from the de-duplicated training data
df_processed = prepare_features(df_no_duplicates)

# List every column now present, one name per line
# (iterating a DataFrame yields its column labels)
for column in df_processed:
    print(column)


In [None]:
# 4a. Data understanding - Find out stats regarding your data (df.describe(), df.mean(), df.median())
# info() prints dtypes and non-null counts; head(10) is left as the cell's
# last expression so the first ten processed rows are rendered.
df_processed.info()
df_processed.head(10)

In [10]:
# The raw categorical columns and the timestamp are redundant now that their
# encoded / decomposed counterparts exist.
columns_to_drop = ['holiday', 'weather_main', 'weather_description', 'date_time']

# drop() returns a new frame; df_processed itself is left intact
df_cleaned = df_processed.drop(columns_to_drop, axis=1)

# Confirm what is left
print("Remaining columns:", list(df_cleaned.columns))

Remaining columns: ['traffic_volume', 'temp', 'rain_1h', 'snow_1h', 'clouds_all', 'hour', 'day', 'month', 'day_of_week', 'weather_main_encoded', 'weather_description_encoded', 'holiday_encoded']


In [None]:
# Any other relevant pre-processing (upto your exploration)
# Handle the remaining missing values on a fresh copy of the cleaned frame.

# Target variable: a row without traffic_volume has no label to learn from.
# Filling it with the mean would invent labels, so such rows are dropped.
df_clean = df_cleaned.dropna(subset=['traffic_volume']).copy()

# Continuous weather readings and calendar features: fill with the column median
median_filled = ['temp', 'clouds_all', 'hour', 'day', 'month', 'day_of_week']
df_clean[median_filled] = df_clean[median_filled].fillna(df_clean[median_filled].median())

# Precipitation: assume no rain / no snow when the reading is missing
zero_filled = ['rain_1h', 'snow_1h']
df_clean[zero_filled] = df_clean[zero_filled].fillna(0)

# Encoded categorical columns: missing categories were already mapped to the
# 'Missing' label before encoding, so these fills are defensive only. The most
# frequent code is used because code 0 has no guaranteed meaning (it is simply
# whichever label sorts first).
encoded_columns = ['weather_main_encoded', 'weather_description_encoded', 'holiday_encoded']
for col in encoded_columns:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

# Verify no NaN values remain
print("\nRemaining NaN values after cleaning:")
print(df_clean.isnull().sum())

In [None]:
# Split the cleaned frame into the target series (y) and the feature matrix (X)
y = df_clean['traffic_volume']
X = df_clean.drop(columns=['traffic_volume'])

# Shapes confirm that both parts have the same number of rows
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

# Names of the candidate input features
print("\nFeature names:")
print(list(X.columns))

# Preview of both parts
print("\nFirst few rows of features (X):", X.head(10), sep="\n")
print("\nFirst few rows of target (y):", y.head(10), sep="\n")

In [None]:
# 4b. Data understanding - Make use of plots to build more understanding of the data
# Hint: Can use df.plot()

In [None]:
# NOTE(review): the whole cell is a single string literal, so none of the code
# inside it runs. It refers to columns (imdb_score, actor_1_name, movie_title)
# that do not appear anywhere else in this notebook — presumably leftover
# template material; confirm before removing.
'''
# 4b. (Optional) Create More plots to understand the relationship b/w different variables

# Example: Top 20 actors of movies based on the imdb rating of the movies

plt.figure(figsize=(10, 8))

# Create a new dataframe with top 20 values
new_df = df.sort_values(by ='imdb_score' , ascending=False)
new_df = new_df.head(20)

# plotting
ax=sns.pointplot(x=new_df['actor_1_name'], y=new_df['imdb_score'], hue=new_df['movie_title'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()'''

In [None]:
# 4c. Find out which input features are the most important
# Hint: Start out with df.corr(). Can visualise with seaborn library
from sklearn.feature_selection import mutual_info_regression, SelectPercentile,SelectKBest
# 1. First, explicitly define all features including holiday_encoded
features = ['temp', 'rain_1h', 'snow_1h', 'clouds_all', 'hour', 'day',
           'month', 'day_of_week', 'weather_main_encoded',
           'weather_description_encoded', 'holiday_encoded']  # Explicitly include holiday_encoded

# 2. Verify features list
print("Features to be analyzed:", features)

# 3. Create X and y ensuring holiday_encoded is included
X = df_clean[features]  # Features including holiday_encoded
y = df_clean['traffic_volume']  # Target

# 4. Calculate mutual information between each feature and the target.
#    random_state fixes the estimator's random number generation, so the
#    scores (and the ranking below) are the same on every run.
mi = mutual_info_regression(X, y, random_state=42)

# 5. Create and sort feature importance (highest score first)
feature_importance = dict(zip(features, mi))
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

# 6. Print results ensuring all features are shown
print("\nFeature Importance Ranking:")
print("-" * 40)
for feature, score in sorted_features:
    print(f"{feature:<25} {score:.4f}")

# 7. Double check holiday_encoded specifically
print("\nHoliday encoded importance score:")
print(f"holiday_encoded: {feature_importance['holiday_encoded']:.4f}")

### Model Development

In [None]:
# 1. Divide dataframe into input and output
# X = df.drop(columns=['output_class']) -> Drop the column to be predicted
# y = df['output_class'] -> Choose Output column to be predicted

In [25]:
# 2. Perform Feature Selection - Experiment with the best one!
X = df_clean.drop('traffic_volume', axis=1)

# Keep a handle on the selector itself: the fitted object is needed afterwards
# to recover which columns were kept. (Previously the selector was created
# inline and thrown away, so `selector` below was an undefined name.)
selector = SelectKBest(mutual_info_regression, k=5)
X_selected = selector.fit_transform(X, y)

# get_support() is a boolean mask over X's columns, in their original order
selected_features = X.columns[selector.get_support()].tolist()
print(selected_features)

['temp', 'hour', 'day', 'month', 'day_of_week']


In [26]:
# Usually, we do a train-test split, but, in the hackathon, we'll already provide you with the separate datasets for each
from sklearn.model_selection import train_test_split
# First split your data
# 80/20 hold-out split; the fixed random_state makes the split repeatable.
# NOTE(review): the feature selector was fitted on all rows of X/y before this
# split, so the held-out rows influenced which features were chosen and the
# hold-out score is likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [27]:
# 3. Data Normalisation
# StandardScaler is used: each feature is centred to zero mean and scaled to
# unit variance. It does not bound values to the range 0..1 or -1..1.
# NOTE(review): earlier notes here assumed the weather measurements and the
# target are normally distributed; nothing in this notebook checks that, so
# treat it as an unverified assumption.
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same fitted
# transform to the held-out split so no held-out statistics are used.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
# 4. Choose Model(s), fit
### Experiment with different models.
### https://scikit-learn.org/stable/supervised_learning.html
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# Linear baseline fitted on the scaled training split (fit() returns the estimator)
model = LinearRegression().fit(X_train_scaled, y_train)

# R² on the held-out split
y_pred = model.predict(X_test_scaled)
print(r2_score(y_test, y_pred))

0.13283176730331325


In [28]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with a fixed seed and hand-picked limits on tree depth and
# node sizes, trained on the scaled training split (fit() returns the estimator)
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
).fit(X_train_scaled, y_train)

# 5. Predict on the held-out split and report R²
y_pred = rf_model.predict(X_test_scaled)
print("R² Score:", r2_score(y_test, y_pred))

R² Score: 0.7669929336653705


The difference in R² scores between Linear Regression and Random Forest:

# ***Linear Regression: R² = 0.1328 (13.28%)***

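This indicates that the linear model explains only about 13.28% of the variance in traffic volume.

The low score suggests that the relationship between the features and the target is not well described by a straight line.

Linear Regression assumes a linear relationship between features and target. For example, it cannot represent traffic that peaks at rush hours and drops at night using the raw `hour` value alone.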


# ***Random Forest: R² = 0.7669 (76.69%)***

Much better performance, explaining about 76.69% of the variance.

Random Forest performs better because:

- It can capture non-linear relationships
- It can handle feature interactions
- It's more robust to noise in the data
- It uses multiple decision trees (ensemble method)
- The parameters we set (n_estimators=100, max_depth=10, etc.) limit how far each tree can grow; they were chosen by hand at this stage and are only tuned later with grid search

The significant improvement (from 13% to 77%) suggests that:

- The relationship between the selected features (temperature plus the hour, day, month and day-of-week features) and traffic volume is non-linear
- There are complex interactions between features
- Random Forest is more suitable for this type of prediction problem

In [None]:
# 5. Evaluate with relevant metric for your problem. Eg: accuracy_score(), r2_score()

In [None]:
# 6. After model choice is made, fine-tune with GridSearchCV, or RandomizedSearchCV()
from sklearn.model_selection import GridSearchCV

# Candidate values for the random forest hyper-parameters
# (3 x 3 x 3 = 27 combinations, each evaluated with 5-fold cross-validation)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# The estimator is seeded so that repeated searches give the same result, and
# n_jobs=-1 spreads the fits across all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the outcome of the search; without this the tuning result is never shown
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated R² score:", grid_search.best_score_)

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the random forest configuration on the scaled
# training split; one score per fold is printed.
scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=5)
print("Cross-validation scores:", scores)

### Testing and Creating Output CSV

In [36]:
# Creating output file for submission - Template Code

test_pd = pd.read_csv('/content/test_set_nogt.csv')

# saving in a new variable to modify
test = test_pd.copy(deep=True)

# Prepare data to be given as an input to your trained model
# 1. Repeat the pre-processing done above, re-using what was learned from the
#    training data (category codes, fill values, scaler) instead of
#    re-fitting anything on the test set.

# Handle datetime to extract 'hour', 'day', 'month', 'day_of_week'
test['date_time'] = pd.to_datetime(test['date_time'], format='%d-%m-%Y %H:%M')
test['hour'] = test['date_time'].dt.hour
test['day'] = test['date_time'].dt.day
test['month'] = test['date_time'].dt.month
test['day_of_week'] = test['date_time'].dt.dayofweek

# Handle categorical variables with the label -> code mapping of the training
# data. Fitting a new encoder on the test set would assign codes by the test
# set's own labels, so the same category could get a different number here
# than it had during training.
from sklearn.preprocessing import LabelEncoder
categorical_columns = ['weather_main', 'weather_description', 'holiday']

for col in categorical_columns:
    le = LabelEncoder().fit(df_no_duplicates[col].fillna('Missing'))
    code_for_label = {label: code for code, label in enumerate(le.classes_)}
    # Labels never seen in training have no code; mark them with -1
    test[f'{col}_encoded'] = (
        test[col].fillna('Missing').map(code_for_label).fillna(-1).astype(int)
    )

# Drop original categorical columns
columns_to_drop = ['holiday', 'weather_main', 'weather_description', 'date_time']
test = test.drop(columns=columns_to_drop)

# 2. Use the same features obtained in feature selection
chosen_features = selected_features # from above -> getting names of chosen features
test = test[chosen_features]

# Fill missing values with the same rules as for training: 0 for rain/snow,
# otherwise the median of the cleaned training column.
zero_filled = {'rain_1h', 'snow_1h'}
fill_values = {
    col: 0 if col in zero_filled else df_clean[col].median()
    for col in chosen_features
}
test = test.fillna(fill_values)

# 3. Normalise/Scale the features as done above.
#    `scaler` is the StandardScaler already fitted on the training split;
#    it is only applied here. Re-fitting it on the test set would scale the
#    inputs differently from what the model was trained on.
test = scaler.transform(test.to_numpy())

# 4. Predict and obtain results from the model (linear regression)
y_pred = model.predict(test)

# 5. Save results to CSV
submission = pd.DataFrame({'ID': test_pd.index, 'Traffic_volume' : y_pred})
submission.to_csv('output_submission_eval_Lr.csv', index=False)

In [37]:
# 4. Predict and obtain results from the random forest.
#    `test` is the scaled test feature array prepared in the previous cell.
#    The predictions must come from rf_model: `model` is the linear
#    regression, whose output is already saved in the "_Lr" file.
y_pred = rf_model.predict(test)

# 5. Save results to CSV
submission = pd.DataFrame({'ID': test_pd.index, 'Traffic_volume' : y_pred})
submission.to_csv('output_submission_eval_RFF.csv', index=False)