In [None]:
import pandas as pd
!pip install seaborn

import seaborn as sns
import numpy as np
from sklearn import set_config
set_config(display = 'diagram')
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Load the raw dataset into `df1`.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location) — adjust when running elsewhere.
df1 = pd.read_csv('/content/dataset (2).csv')

In [None]:
# Display the column labels of the raw frame.
df1.columns

Index(['ID', 'SystemCodeNumber', 'Capacity', 'Latitude', 'Longitude',
       'Occupancy', 'VehicleType', 'TrafficConditionNearby', 'QueueLength',
       'IsSpecialDay', 'LastUpdatedDate', 'LastUpdatedTime'],
      dtype='object')

In [None]:
# prompt: find null values and duplicates,
# and fill missing values column by column

# Report how many nulls each column contains
print("Null values per column:")
print(df1.isnull().sum())

# Report how many fully duplicated rows exist
print("\nNumber of duplicate rows:")
print(df1.duplicated().sum())

# Fill missing values in int64/float64 columns with that column's mean.
# Columns of any other dtype are left untouched; for categorical data a
# mode-based fill (df1[col].mode()[0]) would be the usual alternative.
for col in df1.columns:
    if not df1[col].isnull().any():
        continue
    if df1[col].dtype in ['int64', 'float64']:
        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on the df1[col] selection: the in-place
        # form operates on an intermediate object and is not guaranteed to
        # write through to df1 (it does not under pandas Copy-on-Write).
        df1[col] = df1[col].fillna(df1[col].mean())

print("\nNull values after filling:")
print(df1.isnull().sum())

Null values per column:
ID                        0
SystemCodeNumber          0
Capacity                  0
Latitude                  0
Longitude                 0
Occupancy                 0
VehicleType               0
TrafficConditionNearby    0
QueueLength               0
IsSpecialDay              0
LastUpdatedDate           0
LastUpdatedTime           0
dtype: int64

Number of duplicate rows:
0

Null values after filling:
ID                        0
SystemCodeNumber          0
Capacity                  0
Latitude                  0
Longitude                 0
Occupancy                 0
VehicleType               0
TrafficConditionNearby    0
QueueLength               0
IsSpecialDay              0
LastUpdatedDate           0
LastUpdatedTime           0
dtype: int64


In [None]:
import pandas as pd

# Drop columns that are not used by the later cells
df2 = df1.drop(['ID', 'Latitude', 'Longitude'], axis=1)

# Parse LastUpdatedDate as day-first and keep only month and day-of-month.
# NOTE(review): the year is discarded here, so later sorting by Month/Date
# is only chronological if the data stays within one calendar year — confirm.
df2['LastUpdatedDate'] = pd.to_datetime(df2['LastUpdatedDate'], dayfirst=True)
df2['Month'] = df2['LastUpdatedDate'].dt.month
df2['Date'] = df2['LastUpdatedDate'].dt.day

# Parse LastUpdatedTime (HH:MM:SS) and keep hour and minute; seconds are dropped
df2['LastUpdatedTime'] = pd.to_datetime(df2['LastUpdatedTime'], format='%H:%M:%S')
df2['Hour'] = df2['LastUpdatedTime'].dt.hour
df2['Minute'] = df2['LastUpdatedTime'].dt.minute

# Drop the original datetime columns now that their parts are extracted
df2 = df2.drop(['LastUpdatedDate', 'LastUpdatedTime'], axis=1)

# Output
print(df2.head())
print(df2.columns)
# DataFrame.info() writes its report itself and returns None, so it is
# called directly rather than wrapped in print() (which would emit "None").
df2.info()


  SystemCodeNumber  Capacity  Occupancy VehicleType TrafficConditionNearby  \
0      BHMBCCMKT01       577         61         car                    low   
1      BHMBCCMKT01       577         64         car                    low   
2      BHMBCCMKT01       577         80         car                    low   
3      BHMBCCMKT01       577        107         car                    low   
4      BHMBCCMKT01       577        150        bike                    low   

   QueueLength  IsSpecialDay  Month  Date  Hour  Minute  
0            1             0     10     4     7      59  
1            1             0     10     4     8      25  
2            2             0     10     4     8      59  
3            2             0     10     4     9      32  
4            2             0     10     4     9      59  
Index(['SystemCodeNumber', 'Capacity', 'Occupancy', 'VehicleType',
       'TrafficConditionNearby', 'QueueLength', 'IsSpecialDay', 'Month',
       'Date', 'Hour', 'Minute'],
      dty

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# NOTE(review): this cell reads `df`, but in file order `df` is first assigned
# by the pricing cell further down (`df = df2.sort_values(...)`). On a fresh
# kernel that cell has to run before this one — confirm the intended order.

# Normalize TrafficConditionNearby values to match expected ordinal categories
df['TrafficConditionNearby'] = df['TrafficConditionNearby'].str.capitalize()

# Label encode VehicleType (must be done outside the ColumnTransformer)
df['VehicleType'] = LabelEncoder().fit_transform(df['VehicleType'])

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['SystemCodeNumber']),
        ('ordinal', OrdinalEncoder(categories=[['Low', 'Average', 'High']]), ['TrafficConditionNearby'])
    ],
    remainder='passthrough',  # Keep the rest of the columns
    # Always return a dense array: the result is wrapped in a DataFrame with
    # explicit column names below, which does not work for a sparse matrix.
    sparse_threshold=0
)

# Apply the transformations
transformed_data = preprocessor.fit_transform(df)

# Retrieve column names
onehot_feature_names = preprocessor.named_transformers_['onehot'].get_feature_names_out(['SystemCodeNumber'])

# Output order is: one-hot columns, the ordinal column, then the passthrough
# columns in their original order.
final_columns = list(onehot_feature_names) + ['TrafficConditionNearby'] + [
    col for col in df.columns if col not in ['SystemCodeNumber', 'TrafficConditionNearby']
]


def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    # Explicit try/except replaces pd.to_numeric(errors='ignore'), which is deprecated.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Create the final DataFrame and convert to numeric dtypes where possible.
df_transformed = pd.DataFrame(transformed_data, columns=final_columns)
df_transformed = df_transformed.apply(_to_numeric_or_keep)

# Bind `df` only after the conversion so both names refer to the same frame.
df = df_transformed

# Output
print(df_transformed.head())
print(df_transformed.columns)
# info() prints its own report and returns None; do not wrap it in print().
df_transformed.info()


   SystemCodeNumber_BHMBCCMKT01  SystemCodeNumber_BHMBCCTHL01  \
0                           1.0                           0.0   
1                           0.0                           1.0   
2                           0.0                           0.0   
3                           0.0                           0.0   
4                           0.0                           0.0   

   SystemCodeNumber_BHMEURBRD01  SystemCodeNumber_BHMMBMMBX01  \
0                           0.0                           0.0   
1                           0.0                           0.0   
2                           1.0                           0.0   
3                           0.0                           1.0   
4                           0.0                           0.0   

   SystemCodeNumber_BHMNCPHST01  SystemCodeNumber_BHMNCPNST01  \
0                           0.0                           0.0   
1                           0.0                           0.0   
2                      

In [None]:
import pandas as pd

# Example input: replace this with your actual DataFrame
# df = pd.read_csv("your_data.csv")

# 1. Define dynamic alpha function
def get_dynamic_alpha(hour, is_special_day, traffic_condition):
    """
    Return the price-increment coefficient for one observation.

    The coefficient is a time-based factor multiplied by a traffic-based
    scale:

    * time factor: 3.5 when ``is_special_day == 1``; otherwise 2.5 for
      hours 7-10 inclusive, 2.0 for hours 17-20 inclusive, and 1.5 for
      every other hour.
    * traffic scale: 1.2 for 'low', 0.8 for 'high', 1.0 for anything else.
      The value is compared after ``str()``, ``strip()`` and ``lower()``.
    """
    # A special day takes precedence over every time-of-day band, so the
    # hour is not even inspected in that case.
    time_factor = (
        3.5 if is_special_day == 1
        else 2.5 if 7 <= hour <= 10
        else 2.0 if 17 <= hour <= 20
        else 1.5
    )

    # Unrecognised traffic labels fall back to a neutral scale of 1.0.
    normalized_traffic = str(traffic_condition).strip().lower()
    traffic_scale = {'low': 1.2, 'high': 0.8}.get(normalized_traffic, 1.0)

    return time_factor * traffic_scale

# 2. Define initial price function based on vehicle type
def get_initial_price(vehicle_type):
    """
    Return the starting price for a vehicle type.

    The type is matched by substring after ``str()``, ``strip()`` and
    ``lower()``. Keywords are tried in a fixed order and the first hit wins:
    'truck' -> 10.00, 'car' -> 5.00, 'bike' -> 3.00, 'cycle' -> 2.00.
    Anything that matches no keyword gets the default of 5.00.
    """
    label = str(vehicle_type).strip().lower()

    # Order matters: an earlier keyword shadows later ones when both occur.
    keyword_prices = (
        ('truck', 10.00),
        ('car', 5.00),
        ('bike', 3.00),
        ('cycle', 2.00),
    )
    return next(
        (price for keyword, price in keyword_prices if keyword in label),
        5.00,  # Default
    )

# 3. Define the price update function
def calculate_next_price(current_price, alpha, occupancy, capacity):
    """
    Return the price for the next step of the linear model.

    The next price is ``current_price + alpha * (occupancy / capacity)``.
    When ``capacity`` equals 0 the ratio is undefined, so the current price
    is returned unchanged.
    """
    if capacity != 0:
        fill_fraction = occupancy / capacity
        return current_price + alpha * fill_fraction
    return current_price

# -------------------------
# MAIN PRICING CALCULATION
# -------------------------

# Sort DataFrame chronologically (year is not available in df2's columns used here)
df = df2.sort_values(by=['Month', 'Date', 'Hour', 'Minute']).reset_index(drop=True)

# Running state for the sequential price update
prices = []               # price recorded for each row, in row order
current_prices = {}       # latest price per vehicle type
previous_dates = {}       # last (vehicle_type, month, day) seen per vehicle type

# NOTE(review): the running price is keyed by vehicle type only, so rows from
# every SystemCodeNumber feed the same price series within a day. If prices
# are meant to evolve per location, the state keys need the location too —
# confirm the intended model before changing this.

# itertuples() avoids building a Series per row (as iterrows() does) while
# yielding the same values; the loop must stay sequential because each price
# depends on the previous one.
for row in df.itertuples(index=False):
    vehicle_type = row.VehicleType.lower()
    date_key = (vehicle_type, row.Month, row.Date)

    # First row of a new day for this vehicle type: restart from the base price
    if date_key != previous_dates.get(vehicle_type):
        current_prices[vehicle_type] = get_initial_price(vehicle_type)
        previous_dates[vehicle_type] = date_key

    # Record the price in effect for this row before updating it
    prices.append(round(current_prices[vehicle_type], 2))

    # Get dynamic alpha based on hour, special day, and traffic
    alpha = get_dynamic_alpha(
        row.Hour,
        row.IsSpecialDay,
        row.TrafficConditionNearby
    )

    # Advance the running price; it applies to the next row of this vehicle type
    current_prices[vehicle_type] = calculate_next_price(
        current_prices[vehicle_type],
        alpha,
        row.Occupancy,
        row.Capacity
    )

# Add the computed prices back to DataFrame
df['Price'] = prices

# Display final result
print("--- Model 1: Dynamic Linear Pricing Model (Vehicle & Traffic Sensitive) ---")
print(df[['Month', 'Date', 'Hour', 'Minute', 'VehicleType', 'TrafficConditionNearby',
          'IsSpecialDay', 'Capacity', 'Occupancy', 'Price']])


--- Model 1: Dynamic Linear Pricing Model (Vehicle & Traffic Sensitive) ---
       Month  Date  Hour  Minute VehicleType TrafficConditionNearby  \
0         10     4     7      59         car                    low   
1         10     4     7      59         car                    low   
2         10     4     7      59         car                    low   
3         10     4     7      59         car                    low   
4         10     4     7      59        bike                    low   
...      ...   ...   ...     ...         ...                    ...   
18363     12    19    16      30         car                    low   
18364     12    19    16      30         car                    low   
18365     12    19    16      30         car                average   
18366     12    19    16      30         car                    low   
18367     12    19    16      30         car                    low   

       IsSpecialDay  Capacity  Occupancy   Price  
0                 0 

In [None]:
# Display the current `df` as the cell's output.
df

Unnamed: 0,SystemCodeNumber_BHMBCCMKT01,SystemCodeNumber_BHMBCCTHL01,SystemCodeNumber_BHMEURBRD01,SystemCodeNumber_BHMMBMMBX01,SystemCodeNumber_BHMNCPHST01,SystemCodeNumber_BHMNCPNST01,SystemCodeNumber_Broad Street,SystemCodeNumber_Others-CCCPS105a,SystemCodeNumber_Others-CCCPS119a,SystemCodeNumber_Others-CCCPS135a,...,Capacity,Occupancy,VehicleType,QueueLength,IsSpecialDay,Month,Date,Hour,Minute,Price
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,577.0,61.0,1.0,1.0,0.0,10.0,4.0,7.0,59.0,5.00
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,387.0,120.0,1.0,2.0,0.0,10.0,4.0,7.0,59.0,5.32
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,470.0,117.0,1.0,2.0,0.0,10.0,4.0,7.0,59.0,6.25
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,687.0,264.0,1.0,2.0,0.0,10.0,4.0,7.0,59.0,6.99
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1200.0,237.0,0.0,2.0,0.0,10.0,4.0,7.0,59.0,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3883.0,2533.0,1.0,3.0,0.0,12.0,19.0,16.0,30.0,179.54
18364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2937.0,1184.0,1.0,2.0,0.0,12.0,19.0,16.0,30.0,180.72
18365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1322.0,806.0,1.0,3.0,0.0,12.0,19.0,16.0,30.0,181.44
18366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3103.0,1671.0,1.0,3.0,0.0,12.0,19.0,16.0,30.0,182.36


In [None]:
# prompt: Please export the df file in CSV.

# Write `df` to 'processed_data.csv' (relative path, so it lands in the
# current working directory); index=False leaves the row index out of the file.
df.to_csv('processed_data.csv', index=False)

print("\nDataFrame exported to 'processed_data.csv'")


DataFrame exported to 'processed_data.csv'


In [None]:
import pandas as pd
import numpy as np
import pickle


from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer


# Load the frame written by the earlier `df.to_csv('processed_data.csv', ...)`
# cell. The same relative path is used for reading as for writing, so the
# notebook does not depend on the working directory being /content.
df = pd.read_csv('processed_data.csv')

# Separate features from the target; a missing 'Price' column fails right here
X = df.drop(columns=['Price'])
y = df['Price']

# Hold out 20% of the rows for the final test metrics
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns for PowerTransformer; every other feature column is standard-scaled
power_transform_cols = ['Occupancy', 'QueueLength']
numeric_cols = X.columns

# Preprocessing pipeline: the two transformers together cover every column of X
preprocessor = ColumnTransformer(transformers=[
    ('power', PowerTransformer(), power_transform_cols),
    ('scale', StandardScaler(), numeric_cols.difference(power_transform_cols))
])

# Evaluation function
def evaluate_regression_model(X_train, y_train, X_test, y_test, pipeline_model, param_grid, model_name):
    """
    Tune a pipeline with RandomizedSearchCV and report hold-out metrics.

    Parameters
    ----------
    X_train, y_train : training features and target used for the search.
    X_test, y_test : hold-out features and target used for the reported
        test metrics.
    pipeline_model : estimator (here a Pipeline) passed to the search.
    param_grid : dict of parameter distributions for the search; may be
        empty, in which case the estimator is evaluated as-is.
    model_name : label used in the progress message.

    Returns
    -------
    tuple
        ``(best_model, test_r2)``: the refitted best estimator and its R²
        on the test set.

    The search samples up to 10 candidates, scores them by R² with 5-fold
    shuffled cross-validation (seed 42), and runs on all cores. Best
    parameters, CV R², and test R² / adjusted R² / MSE / MAE are printed.
    """
    search = RandomizedSearchCV(
        pipeline_model,
        param_distributions=param_grid,
        n_iter=10,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='r2',
        n_jobs=-1,
        verbose=1,
        random_state=42
    )

    print(f"\nRunning RandomizedSearchCV for {model_name}")
    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    best_params = search.best_params_
    best_cv_score = search.best_score_

    y_pred = best_model.predict(X_test)

    test_r2 = r2_score(y_test, y_pred)
    test_mse = mean_squared_error(y_test, y_pred)
    test_mae = mean_absolute_error(y_test, y_pred)

    # Adjusted R² uses the sample and predictor counts of the data actually
    # passed in, not a notebook-level global.
    n = len(y_test)
    p = X_test.shape[1]
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom > 0:
        test_adjusted_r2 = 1 - (1 - test_r2) * (n - 1) / degrees_of_freedom
    else:
        # Too few test rows for the number of predictors: undefined.
        test_adjusted_r2 = float('nan')

    print(f"\nBest Params: {best_params}")
    print(f"CV R² Score: {best_cv_score:.4f}")
    print(f"Test R² Score: {test_r2:.4f}")
    print(f"Test Adjusted R²: {test_adjusted_r2:.4f}")
    print(f"Test MSE: {test_mse:.4f}")
    print(f"Test MAE: {test_mae:.4f}")

    return best_model, test_r2

# Maps model name -> (best fitted pipeline, test R²)
models_info = {}

# --- Linear Regression ---
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
lr_param_grid = {}  # Nothing to tune: the search evaluates a single candidate

best_lr_model, lr_r2 = evaluate_regression_model(
    X_train, y_train, X_test, y_test, lr_pipeline, lr_param_grid, 'LinearRegression'
)
models_info['LinearRegression'] = (best_lr_model, lr_r2)

# --- Random Forest Regressor ---
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])
rf_param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2],
    # 1.0 means "use all features", which is what 'auto' meant for regressors.
    # The 'auto' string is no longer accepted by recent scikit-learn releases,
    # and with warnings silenced those candidates would just fail unnoticed.
    'regressor__max_features': [1.0, 'sqrt']
}

best_rf_model, rf_r2 = evaluate_regression_model(
    X_train, y_train, X_test, y_test, rf_pipeline, rf_param_grid, 'RandomForestRegressor'
)
models_info['RandomForestRegressor'] = (best_rf_model, rf_r2)

# --- XGBoost Regressor ---
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42, verbosity=0))
])
xgb_param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [3, 6, 10],
    'regressor__learning_rate': [0.01, 0.1, 0.3],
    'regressor__subsample': [0.7, 1.0],
    'regressor__colsample_bytree': [0.7, 1.0]
}

best_xgb_model, xgb_r2 = evaluate_regression_model(
    X_train, y_train, X_test, y_test, xgb_pipeline, xgb_param_grid, 'XGBRegressor'
)
models_info['XGBRegressor'] = (best_xgb_model, xgb_r2)

# --- Summary and Save Best Model ---
print("\n--- Final Model Comparison ---")
for model_name, (_, r2) in models_info.items():
    print(f"{model_name}: R² = {r2:.4f}")

# Pick the model with the highest test R²
best_model_name = max(models_info, key=lambda k: models_info[k][1])
best_model = models_info[best_model_name][0]
print(f"\nBest Model: {best_model_name}")

# Save best model (the whole fitted pipeline, preprocessing included)
with open(f'best_model_{best_model_name}.pkl', 'wb') as file:
    pickle.dump(best_model, file)
print(f"Best model saved as 'best_model_{best_model_name}.pkl'")



Running RandomizedSearchCV for LinearRegression
Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best Params: {}
CV R² Score: 0.4467
Test R² Score: 0.4414
Test Adjusted R²: 0.4377
Test MSE: 1378.0407
Test MAE: 30.1877

Running RandomizedSearchCV for RandomForestRegressor
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Params: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': None}
CV R² Score: 0.9277
Test R² Score: 0.9314
Test Adjusted R²: 0.9309
Test MSE: 169.2914
Test MAE: 9.3766

Running RandomizedSearchCV for XGBRegressor
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Params: {'regressor__subsample': 0.7, 'regressor__n_estimators': 200, 'regressor__max_depth': 6, 'regressor__learning_rate': 0.1, 'regressor__colsample_bytree': 1.0}
CV R² Score: 0.9865
Test R² Score: 0.9869
Test Adjusted R²: 0.9868
Test MSE: 32.4122
Test MAE

In [None]:
# prompt: Please convert the dfs file Into csv.

# Write whatever `df` currently refers to into 'output_dataframe.csv'
# (relative path, row index omitted).
#
# NOTE(review): in the cells visible in this notebook, `df` was last assigned
# by `pd.read_csv('/content/processed_data.csv')` in the model-training cell,
# and no cell adds a 'PredictedPrice' column to it. As shown, this export
# therefore re-saves the processed data rather than model predictions.
# If predictions are the intended content, build that frame first and export
# it instead — confirm which frame was meant.

df.to_csv('output_dataframe.csv', index=False)

print("DataFrame successfully converted and saved to 'output_dataframe.csv'")


DataFrame successfully converted and saved to 'output_dataframe.csv'


In [None]:
# prompt: Convert the latest dfs file into csv Name that file. Final predicted price model 2.

# Write the current `df` to 'Final predicted price model 2.csv' (row index omitted).
# NOTE(review): no visible cell stores model predictions in `df`, so despite the
# file name this appears to contain the same data as the earlier exports — confirm.
df.to_csv('Final predicted price model 2.csv', index=False)