In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Folder holding the three input CSVs. Set the UBER_DATA_DIR environment
# variable to point somewhere else; the fallback is the original author's
# local folder, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get(
    "UBER_DATA_DIR",
    "C:/Users/gupta/OneDrive/Desktop/New_folder/Internship_UBER",
))

filepath = DATA_DIR / "Dataset_Uber_Traffic.csv"   # traffic counts
filepath2 = DATA_DIR / "Weather_Data.csv"          # weather observations
filepath3 = DATA_DIR / "Calendar_Data.csv"         # calendar / holiday flags

# Raw traffic table used by the first half of the notebook.
df = pd.read_csv(filepath)

In [None]:
# Peek at the final rows of the raw traffic table to sanity-check the load.
df.tail()

In [None]:
# Tally missing entries per column and print the tally.
null_values = df.isna().sum()
print(null_values)

In [None]:
# Boolean mask flagging every row that repeats an earlier row.
duplicates = df.duplicated()
# Show only the flagged rows; an empty result means there are no duplicates.
df.loc[duplicates]

In [None]:
# Defensive cleaning pass. The checks above are expected to show no missing
# values and no duplicates, so the first two steps are safeguards:
#   1. replace any missing value with 0,
#   2. drop exact duplicate rows,
#   3. pin the integer columns to int64 (DateTime is parsed in a later cell).
df = (
    df.fillna(0)
    .drop_duplicates()
    .astype({'Junction': 'int64', 'Vehicles': 'int64', 'ID': 'int64'})
)

In [None]:
# Parse the DateTime strings (day/month/two-digit-year hour:minute). The
# explicit format fully determines the parse, so no `dayfirst` hint is needed.
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%d/%m/%y %H:%M')

# Truncate each timestamp to the start of its hour. The lowercase 'h' alias is
# used because the uppercase 'H' alias is deprecated in recent pandas releases.
hour_bucket = df['DateTime'].dt.floor('h').rename('Hour')

# Total vehicles per (hour, junction); named aggregation yields the final
# column names directly: Hour, Junction, TotalVehicles.
hourly_traffic = (
    df.groupby([hour_bucket, 'Junction'])
    .agg(TotalVehicles=('Vehicles', 'sum'))
    .reset_index()
)

# Displaying the aggregated data
hourly_traffic.head()

In [None]:
# Rescale the hourly totals into the [0, 1] range with a min-max scaler.
# NOTE(review): the scaler is fitted on all junctions and the full time span
# at once; confirm that is intended before using the column for modelling.
scaler = MinMaxScaler()
vehicle_totals = hourly_traffic[['TotalVehicles']]  # 2-D input, as the scaler expects
hourly_traffic['NormalizedVehicles'] = scaler.fit_transform(vehicle_totals)

# Preview the frame with the new column.
hourly_traffic.head()

In [None]:
# Calendar features derived from the parsed timestamp.
df['HourOfDay'] = df['DateTime'].dt.hour
df['DayOfWeek'] = df['DateTime'].dt.dayofweek
df['Month'] = df['DateTime'].dt.month

# Lag features: traffic at the same junction 1 and 24 observations earlier.
# The table holds several junctions, so the shift has to happen inside each
# junction's own chronologically ordered series; a plain df['Vehicles'].shift()
# would pull values from a different junction. The shifted values keep the
# original row index, so assigning them back aligns row by row without
# reordering df.
# NOTE(review): "24 rows back == 24 hours back" assumes one row per junction
# per hour with no gaps -- confirm against the data.
vehicles_by_junction = (
    df.sort_values(['Junction', 'DateTime'])
    .groupby('Junction')['Vehicles']
)
df['Lag1HourVehicles'] = vehicles_by_junction.shift(1)
df['Lag24HourVehicles'] = vehicles_by_junction.shift(24)

# Update special event dates as per the Indian calendar
special_event_dates = ['2015-08-15', '2015-10-22', '2015-11-11', '2016-08-15', '2016-10-30', '2016-11-07']  # Example dates for Independence Day, Dussehra, and Diwali
special_event_dates = pd.to_datetime(special_event_dates)
# Compare calendar dates only, so every hour of a listed day is flagged.
df['IsSpecialEvent'] = df['DateTime'].dt.date.isin(special_event_dates.date).astype(int)

# Saturday (5) and Sunday (6) count as weekend; vectorised instead of apply().
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype('int64')

# Display the updated dataframe
df.head()

In [None]:
# Correlation analysis for numerical features. numeric_only=True keeps the
# DateTime column out of the computation; without it, recent pandas versions
# try to correlate every column and fail on the timestamps.
correlation_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

# Prepare data for Random Forest feature importance
feature_columns = ['HourOfDay', 'DayOfWeek', 'Month', 'Lag1HourVehicles', 'Lag24HourVehicles', 'IsWeekend', 'IsSpecialEvent']

# The lag columns are NaN where no earlier observation exists; those gaps are
# filled with 0. fillna() returns a new frame, which avoids mutating a
# selection of df in place (the source of SettingWithCopyWarning).
X = df[feature_columns].fillna(0)
y = df['Vehicles']

# Train a Random Forest model (fixed seed for reproducible importances)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X, y)

# Extract feature importance
feature_importances = rf_model.feature_importances_
features = X.columns

# Plot feature importance
plt.figure(figsize=(8, 6))
sns.barplot(x=feature_importances, y=features, palette='viridis')
plt.title("Feature Importance from Random Forest")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

In [None]:
# Score the fitted forest on the same rows it was trained on. These are
# in-sample figures, so they describe fit quality rather than how well the
# model generalises.
y_pred = rf_model.predict(X)

mae = mean_absolute_error(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))  # square root of MSE
r2 = r2_score(y, y_pred)

# Report the three metrics.
print(f"R^2 Score: {r2}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

In [None]:
# Load the weather CSV with its 'time' column as the row index, for a quick
# structural inspection in the next cell.
df1 = pd.read_csv(filepath2, index_col='time')


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df1.info()

In [None]:
# Fresh, unmodified copies of the three sources (traffic, weather, calendar)
# for the merge pipeline below.
traffic_data, weather_data, events_data = (
    pd.read_csv(source_path) for source_path in (filepath, filepath2, filepath3)
)

In [None]:
# Parse each source's timestamp column so all join keys share a datetime dtype.
traffic_data['DateTime'] = pd.to_datetime(traffic_data['DateTime'], format='%d/%m/%y %H:%M')
weather_data['time'] = pd.to_datetime(weather_data['time'])
events_data['DateTime'] = pd.to_datetime(events_data['DateTime'], format='%d-%m-%Y %H:%M')

# Inner-join traffic -> weather -> calendar on the timestamp, then discard the
# weather table's 'time' key, which duplicates 'DateTime' after the join.
merged_data = (
    traffic_data
    .merge(weather_data, how='inner', left_on='DateTime', right_on='time')
    .merge(events_data, how='inner', on='DateTime')
    .drop(columns=['time'])
)

# Display the merged dataset
merged_data

In [None]:
# Friendlier labels for the weather measurements (raw export name -> display name).
weather_column_labels = {
    "temperature_2m (°C)": "Temperature (°C)",
    "wind_speed_10m (km/h)": "Wind Speed (km/h)",
    "precipitation (mm)": "Precipitation (mm)",
    "relative_humidity_2m (%)": "Humidity (%)",
}
merged_data = merged_data.rename(columns=weather_column_labels)

In [None]:
# Weekend flag from the timestamp: day-of-week 5 (Saturday) or 6 (Sunday) -> 1, else 0.
weekend_mask = merged_data['DateTime'].dt.dayofweek.isin([5, 6])
merged_data['IsWeekend'] = weekend_mask.astype('int64')

In [None]:
# Display the updated DataFrame (now carrying the renamed weather columns and
# the IsWeekend flag).
merged_data

In [None]:
# STEP 1: Assign distance based on Junction number
# Junctions 1-4 are modelled as lying 15 km apart, so junction k is 15 * k km out.
junction_distance_map = {junction: 15 * junction for junction in range(1, 5)}
merged_data['Distance_km'] = merged_data['Junction'].map(junction_distance_map)

# STEP 2: Define Fare Estimation Function
def calculate_estimated_fare(row):
    """Estimate the fare for one record, rounded to 2 decimals.

    `row` is any mapping-like object (e.g. a DataFrame row) that provides the
    keys 'Distance_km', 'Vehicles', 'Temperature (°C)', 'Precipitation (mm)',
    'Is Holiday' and 'IsWeekend'.

    The fare is a 150 base amount plus:
      * 10 per km of distance,
      * 0.6 per vehicle of traffic,
      * 0.4 per degree above 20 °C (a discount below 20 °C),
      * 1.2 per mm of precipitation,
      * a flat 75 when 'Is Holiday' equals 1,
      * a flat 50 when 'IsWeekend' equals 1.
    """
    fare = 150                                      # base minimum fare
    fare += row['Distance_km'] * 10                 # ₹10 per km
    fare += row['Vehicles'] * 0.6                   # moderate traffic impact
    fare += (row['Temperature (°C)'] - 20) * 0.4    # temperature deviation from 20 °C
    fare += row['Precipitation (mm)'] * 1.2         # rain surcharge

    if row['Is Holiday'] == 1:
        fare += 75
    if row['IsWeekend'] == 1:
        fare += 50

    return round(fare, 2)

# STEP 3: Apply Fare Calculation (row by row)
merged_data['EstimatedFare'] = merged_data.apply(calculate_estimated_fare, axis=1)

# STEP 4: Plot Fare vs Distance
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=merged_data, x='Distance_km', y='EstimatedFare', alpha=0.5, ax=ax)
ax.set_title("Estimated Fare vs Distance (Junction-Based Distance)")
ax.set_xlabel("Distance (km)")
ax.set_ylabel("Estimated Fare")
fig.tight_layout()
plt.show()

# STEP 5: Check correlation (Optional)
# Off-diagonal entry of the 2x2 correlation matrix of distance vs. fare.
distance_fare_corr = merged_data[['Distance_km', 'EstimatedFare']].corr()
corr = distance_fare_corr.loc['Distance_km', 'EstimatedFare']
print(f"Correlation between Distance and Estimated Fare: {corr:.3f}")


In [None]:
# Mean estimated fare for each distinct distance value.
avg_fare = merged_data.groupby('Distance_km')['EstimatedFare'].mean().reset_index()

fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(data=avg_fare, x='Distance_km', y='EstimatedFare', marker='o', ax=ax)
ax.set_title("Average Estimated Fare by Distance")
ax.set_xlabel("Distance (km)")
ax.set_ylabel("Avg Fare")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# One point per merged record: traffic volume against the estimated fare.
ax = sns.scatterplot(data=merged_data, x='Vehicles', y='EstimatedFare', alpha=0.4)
ax.set_title("Traffic Volume vs Estimated Fare")
ax.set_xlabel("Number of Vehicles (Traffic)")
ax.set_ylabel("Estimated Fare")
plt.tight_layout()
plt.show()

In [None]:
# merged_data is built from the freshly loaded CSVs, and this notebook never
# adds an 'HourOfDay' column to it (that feature was only created on `df`), so
# grouping by the column name relies on a column that may not exist. Derive
# the hour from the timestamp and group by that Series instead.
hour_of_day = merged_data['DateTime'].dt.hour.rename('HourOfDay')
hourly = merged_data.groupby(hour_of_day)[['Vehicles', 'EstimatedFare']].mean().reset_index()

# Two y-axes sharing one x-axis: average traffic on the left, average fare on the right.
fig, ax1 = plt.subplots(figsize=(10, 6))
sns.lineplot(data=hourly, x='HourOfDay', y='Vehicles', ax=ax1, label='Avg Traffic', color='red')
ax2 = ax1.twinx()
sns.lineplot(data=hourly, x='HourOfDay', y='EstimatedFare', ax=ax2, label='Avg Fare', color='blue')
ax1.set_xlabel("Hour of Day")
ax1.set_ylabel("Avg Traffic (Vehicles)", color='red')
ax2.set_ylabel("Avg Estimated Fare", color='blue')
plt.title("Hourly Traffic vs Fare Pattern")
plt.tight_layout()
plt.show()

In [None]:
# Average traffic and average fare per junction (rows sorted by junction number).
junction_avg = merged_data.groupby('Junction')[['Vehicles', 'EstimatedFare']].mean().reset_index()

fig, ax1 = plt.subplots(figsize=(8, 5))
sns.barplot(data=junction_avg, x='Junction', y='Vehicles', color='orange', ax=ax1)
ax2 = ax1.twinx()

# barplot treats x as categorical and draws the bars at positions 0..n-1,
# whereas a line plotted against the numeric junction ids (1..n) on the shared
# x-axis lands one slot to the right of its bar. Plot the fare line at the
# bars' own positions so each marker sits above the matching junction.
bar_positions = range(len(junction_avg))
ax2.plot(bar_positions, junction_avg['EstimatedFare'], marker='o', color='blue')

ax1.set_xlabel("Junction")
ax1.set_ylabel("Avg Vehicles", color='orange')
ax2.set_ylabel("Avg Estimated Fare", color='blue')
plt.title("Junction-wise Avg Traffic and Fare")
plt.tight_layout()
plt.show()


In [None]:
# Final look at the merged table, including the Distance_km and EstimatedFare
# columns added above.
merged_data

In [None]:
# Persist the merged, fare-augmented table to the current working directory.
# With the default arguments the row index is written as well, as a first
# column with an empty header.
merged_data.to_csv('merged_data.csv')

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Reuse the random forest created earlier. `model` is the same object as
# `rf_model`, so every fit below also refits `rf_model`.
model = rf_model


def evaluate_model(y_true, y_pred):
    """Return the tuple (MAE, RMSE, R^2) for predictions against ground truth."""
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )


# Expanding-window cross-validation: each fold trains on an initial stretch of
# rows and tests on the rows that immediately follow it.
# NOTE(review): this is only chronological if the rows of X are in time order -- confirm.
tscv = TimeSeriesSplit(n_splits=5)
fold_metrics = []

for train_index, test_index in tscv.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Fit on the training window, then score the following window.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae, rmse, r2 = evaluate_model(y_test, y_pred)
    fold_metrics.append((mae, rmse, r2))

# Regroup the per-fold tuples into one list per metric.
mae_scores, rmse_scores, r2_scores = (list(values) for values in zip(*fold_metrics))

# Display cross-validation results
print(f"Mean MAE: {np.mean(mae_scores):.4f}, Std: {np.std(mae_scores):.4f}")
print(f"Mean RMSE: {np.mean(rmse_scores):.4f}, Std: {np.std(rmse_scores):.4f}")
print(f"Mean R^2: {np.mean(r2_scores):.4f}, Std: {np.std(r2_scores):.4f}")

# Refit on every row, then score on those same rows (an in-sample figure).
model.fit(X, y)
y_pred_final = model.predict(X)
final_mae, final_rmse, final_r2 = evaluate_model(y, y_pred_final)

# Display final evaluation metrics
print("Final Model Evaluation:")
print(f"MAE: {final_mae:.4f}")
print(f"RMSE: {final_rmse:.4f}")
print(f"R^2: {final_r2:.4f}")

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Define the Gradient Boosting Regressor model (fixed seed for reproducibility)
gbr = GradientBoostingRegressor(random_state=42)

# Define hyperparameter grid for tuning (2 x 2 x 2 x 2 = 16 combinations)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0]
}

# Time-based cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Build the train / hold-out split explicitly rather than relying on whatever
# X_train / X_test / y_train / y_test another cell's loop happened to leave in
# the kernel. The final fold of a 5-way TimeSeriesSplit over X is used: the
# trailing block of rows is the hold-out and everything before it is training.
# NOTE(review): this is only chronological if the rows of X are in time order -- confirm.
train_index, test_index = list(tscv.split(X))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Perform grid search for hyperparameter tuning; the search runs its own
# time-series folds inside the training portion only.
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Best model from grid search (refit on all of X_train by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate the best model on the held-out block
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Display evaluation metrics
print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation R^2: {r2:.4f}")