<a href="https://colab.research.google.com/github/Faiz1310/Crowd-Flow-Prediction/blob/master/Final_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load Dataset
# Keep the workbook location in one named place so it is easy to spot and change.
DATA_PATH = '/content/crowd_flow_india_2_years_new.xlsx'
df = pd.read_excel(DATA_PATH)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Date,Time,Place,Crowd_Count (in Thousands),Weather,Day_of_Week,Holiday,Event,Temperature (°C),Week_of_Year,Special_Features,Region,Transportation_Type
0,2024-01-01,14:56:00,"Sanchi Stupa, Madhya Pradesh",825,Cloudy,Monday,Yes,Cultural Event,39,0,National Holiday,West,Walking
1,2024-01-02,00:48:00,"Qutub Minar, Delhi",8962,Cloudy,Tuesday,No,Regular Day,30,0,,South,Private
2,2024-01-03,02:06:00,"Varanasi Ghats, Varanasi",10000,Clear,Wednesday,No,Regular Day,32,0,,South,Public
3,2024-01-04,20:39:00,"Qutub Minar, Delhi",18632,Sunny,Thursday,No,Festival,24,0,,West,Private
4,2024-01-05,05:38:00,"Red Fort, Delhi",5155,Rainy,Friday,No,Cultural Event,39,0,,East,Walking


In [None]:
#  Clean and Prepare Data
# Drop 'Special_Features' (only 47 non-null values)
df = df.drop(columns=['Special_Features'])

# Create Timestamp
# pd.read_excel may hand back 'Date' as datetime64 and 'Time' as datetime.time
# objects (rather than strings) depending on how the cells are stored in the
# workbook, and those cannot be concatenated with ' ' directly. Normalise both
# columns to text first so the same code works for either representation.
date_str = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')
# Keep missing times missing (astype(str) would turn NaN into the text 'nan').
time_str = df['Time'].astype(str).where(df['Time'].notna())
df['Timestamp'] = pd.to_datetime(date_str + ' ' + time_str)
df = df.drop(['Date', 'Time'], axis=1)
# Chronological order + DatetimeIndex: the calendar features below read from the index.
df = df.sort_values('Timestamp').set_index('Timestamp')

# Feature Engineering
df['Hour'] = df.index.hour
df['Day_of_Week_Num'] = df.index.dayofweek  # Monday=0 ... Sunday=6
df['Month'] = df.index.month

# Identify Columns
target = 'Crowd_Count (in Thousands)'
categorical_cols = ['Place', 'Weather', 'Day_of_Week', 'Holiday', 'Event', 'Region', 'Transportation_Type']
numerical_cols = ['Temperature (°C)', 'Week_of_Year', 'Hour', 'Day_of_Week_Num', 'Month']

#  Label Encoding
# Each categorical column is replaced in place by integer codes; the fitted
# encoder is kept per column so raw labels can be encoded the same way later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Announce, then display the first five rows of the current `df` as the cell output.
print("After Preprocessing")
df.head(n=5)

After Preprocessing


Unnamed: 0_level_0,Place,Crowd_Count (in Thousands),Weather,Day_of_Week,Holiday,Event,Temperature (°C),Week_of_Year,Region,Transportation_Type,Hour,Day_of_Week_Num,Month
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2024-01-01 14:56:00,5,825,1,1,1,0,39,0,4,2,14,0,1
2024-01-02 00:48:00,3,8962,1,5,0,3,30,0,3,0,0,1,1
2024-01-03 02:06:00,7,10000,0,6,0,3,32,0,3,1,2,2,1
2024-01-04 20:39:00,3,18632,3,4,0,1,24,0,4,0,20,3,1
2024-01-05 05:38:00,4,5155,2,0,0,0,39,0,1,2,5,4,1


In [None]:
#  Define Features and Target
features = categorical_cols + numerical_cols
X = df[features]
y = np.log1p(df[target])  # Apply log1p transformation

#  Split Data
# shuffle=False keeps row order, so the last 20% of rows form the hold-out set
# (the most recent ones when df is sorted by Timestamp). random_state is
# omitted because it only controls shuffling and has no effect here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

#  Random Forest + Hyperparameter Tuning
# Candidate values sampled by the randomized search below.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)
# The hold-out split above is chronological, so validate the same way during
# tuning: TimeSeriesSplit always trains on earlier rows and validates on later
# ones, whereas plain k-fold (cv=3) would also train on later rows to score
# earlier ones.
grid_search = RandomizedSearchCV(
    rf,
    param_grid,
    n_iter=10,
    cv=TimeSeriesSplit(n_splits=3),
    scoring='neg_mean_squared_error',
    random_state=42,
)
grid_search.fit(X_train, y_train)


In [None]:
#  Make Predictions
# Predictions come back on the log1p scale; expm1 maps both the predictions
# and the held-out targets back to the original scale before scoring.
y_test_original = np.expm1(y_test)
y_pred_log = grid_search.predict(X_test)
y_pred = np.expm1(y_pred_log)

#  Model Evaluation
mse = mean_squared_error(y_true=y_test_original, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test_original, y_pred=y_pred)
r2 = r2_score(y_true=y_test_original, y_pred=y_pred)

print("\n📈 Best Hyperparameters:", grid_search.best_params_)
# One line per metric, each rounded to two decimals.
for metric_label, metric_value in (
    (" Mean Squared Error (MSE)", mse),
    (" Mean Absolute Error (MAE)", mae),
    (" R-squared Score", r2),
):
    print(f"{metric_label}: {metric_value:.2f}")


📈 Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20}
 Mean Squared Error (MSE): 14474004.91
 Mean Absolute Error (MAE): 2770.11
 R-squared Score: 0.90


In [None]:
#  Predict on a Sample Input
# Describe the sample with human-readable values first ...
raw_sample = {
    'Place': 'Red Fort, Delhi',
    'Weather': 'Sunny',
    'Day_of_Week': 'Monday',
    'Holiday': 'Yes',
    'Event': 'Regular Day',
    'Region': 'Central',
    'Transportation_Type': 'Walking',
    'Temperature (°C)': 25,
    'Week_of_Year': 20,
    'Hour': 10,
    'Day_of_Week_Num': 0,
    'Month': 5,
}

# ... then run every column that has a fitted encoder through it; the
# remaining (numeric) columns pass through unchanged.
encoded_sample = {
    column: (
        label_encoders[column].transform([value])[0]
        if column in label_encoders
        else value
    )
    for column, value in raw_sample.items()
}

# Match feature order
sample_input = pd.DataFrame([encoded_sample])[features]

# Predict
# The prediction is on the log1p scale, so undo it with expm1.
sample_pred_log = grid_search.predict(sample_input)
sample_pred = np.expm1(sample_pred_log[0])

print(f"\n Predicted Crowd Count for sample: {sample_pred:.2f}")



 Predicted Crowd Count for sample: 7120.52
