In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset paths used in this
# notebook all live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# STEP 1: Load the combined dataset without treating "None" as NaN
# keep_default_na=False disables pandas' default NaN markers, so a literal
# "None" label is kept as text. The trade-off is that blank cells are read as
# empty strings ("") instead of NaN.
combined_file = "/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Collection/merged_dataset.csv"
merged_df = pd.read_csv(combined_file, keep_default_na=False)

# STEP 2: Basic info about the combined file
print("Combined dataset shape:", merged_df.shape)
print("\nCombined dataset info:")
merged_df.info()

# STEP 3: Fill missing categorical columns
# Because of keep_default_na=False a missing text value shows up as "" rather
# than NaN, so fillna() on its own never matches anything. Blank and
# whitespace-only cells are therefore replaced with the placeholder as well.
categorical_placeholders = {
    "Special_Event": "None",
    "Time_Slot": "unknown",
    "Day_Type": "unknown",
}
for column_name, placeholder in categorical_placeholders.items():
    merged_df[column_name] = (
        merged_df[column_name]
        .fillna(placeholder)
        .replace(r"^\s*$", placeholder, regex=True)
    )
# STEP 4: Clean key columns (date, time slot, locations)
def clean_columns(df):
    """Normalise the date and the free-text key columns of ``df``.

    The frame is modified in place and the same object is returned.

    * ``Date`` is parsed with ``pd.to_datetime`` (unparseable values are
      coerced to missing) and rewritten as ``YYYY-MM-DD`` strings.
    * ``Time_Slot``, ``From_Location`` and ``To_Location`` are cast to ``str``,
      stripped of surrounding whitespace and converted to title case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cleaned columns.
    """
    parsed_dates = pd.to_datetime(df["Date"], errors='coerce')
    df["Date"] = parsed_dates.dt.strftime("%Y-%m-%d")

    # The three text columns share exactly the same normalisation.
    for text_column in ("Time_Slot", "From_Location", "To_Location"):
        as_text = df[text_column].astype(str)
        df[text_column] = as_text.str.strip().str.title()

    return df

# Apply the cleaning helper: Date becomes a "YYYY-MM-DD" string and the
# time-slot / location columns are stripped and title-cased.
merged_df = clean_columns(merged_df)

# STEP 5: Feature engineering - calculate Total_Revenue
# Revenue = passengers * ticket price, reduced by the percentage discount.
merged_df["Total_Revenue"] = (
    merged_df["Passenger_Count"] *
    merged_df["Ticket_Price"] *
    (1 - merged_df["Discount_Offered (%)"] / 100)
)

# STEP 6: Create High_Demand flag (Passenger_Count > 250)
# Vectorised comparison instead of a per-row Python lambda; the result is the
# same 0/1 integer column.
HIGH_DEMAND_THRESHOLD = 250
merged_df["High_Demand"] = (
    merged_df["Passenger_Count"] > HIGH_DEMAND_THRESHOLD
).astype(int)

# STEP 7: Date transformations - convert to datetime and extract weekday
# clean_columns() wrote Date as "%Y-%m-%d" strings, so parse with that exact
# format rather than relying on format inference.
merged_df["Date"] = pd.to_datetime(
    merged_df["Date"], format="%Y-%m-%d", errors='coerce'
)
# errors='coerce' turns bad dates into NaT silently; report them so they are
# not overlooked (their Weekday will be missing as well).
unparsed_date_count = int(merged_df["Date"].isna().sum())
if unparsed_date_count:
    print(f"Warning: {unparsed_date_count} rows have an unparseable Date (set to NaT).")
merged_df["Weekday"] = merged_df["Date"].dt.day_name()

# STEP 8: Label encoding for categorical columns
# One encoder per column: re-fitting a single shared LabelEncoder overwrites
# its classes_, which makes the earlier columns impossible to decode later.
label_encoders = {}
for categorical_column in ("Time_Slot", "Day_Type", "Special_Event"):
    column_encoder = LabelEncoder()
    merged_df[f"{categorical_column}_Encoded"] = column_encoder.fit_transform(
        merged_df[categorical_column].astype(str)
    )
    label_encoders[categorical_column] = column_encoder
# Keep the original name bound to the last-fitted encoder (Special_Event) so
# any later cell that still refers to `le` sees the same state as before.
le = label_encoders["Special_Event"]

# STEP 9: Save processed dataset
output_path = "/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Preprocessing/merged_bus_data.csv"
merged_df.to_csv(output_path, index=False)

# STEP 10: Summary print
print("\nFinal merged dataset saved to:", output_path)
print("Final dataset shape:", merged_df.shape)
print("Sample data:")
print(merged_df.head())


Combined dataset shape: (2000, 10)

Combined dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date                  2000 non-null   object 
 1   Time_Slot             2000 non-null   object 
 2   From_Location         2000 non-null   object 
 3   To_Location           2000 non-null   object 
 4   Passenger_Count       2000 non-null   int64  
 5   Day_Type              2000 non-null   object 
 6   Distance_km           2000 non-null   float64
 7   Ticket_Price          2000 non-null   int64  
 8   Discount_Offered (%)  2000 non-null   int64  
 9   Special_Event         2000 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 156.4+ KB

Final merged dataset saved to: /content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Preprocessing/merged_bus_data.csv
Final dataset shape: (2000, 1

In [None]:
Combined dataset shape: (2000, 10)

Combined dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Date                  2000 non-null   object
 1   Time_Slot             2000 non-null   object
 2   From_Location         2000 non-null   object
 3   To_Location           2000 non-null   object
 4   Passenger_Count       2000 non-null   int64
 5   Day_Type              2000 non-null   object
 6   Distance_km           2000 non-null   float64
 7   Ticket_Price          2000 non-null   int64
 8   Discount_Offered (%)  2000 non-null   int64
 9   Special_Event         2000 non-null   object
dtypes: float64(1), int64(3), object(6)
memory usage: 156.4+ KB

Final merged dataset saved to: /content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Preprocessing/merged_bus_data.csv
Final dataset shape: (2000, 16)
Sample data:
        Date  Time_Slot  From_Location   To_Location  Passenger_Count  \
0 2024-01-01    Evening  Visakhapatnam  Secunderabad              233
1 2024-01-01    Evening      Nizamabad        Guntur              195
2 2024-01-02      Night      Hyderabad       Kurnool              136
3 2024-01-02    Morning         Guntur   Rajahmundry              241
4 2024-01-02  Afternoon       Tirupati  Secunderabad              184

  Day_Type  Distance_km  Ticket_Price  Discount_Offered (%) Special_Event  \
0  Weekday       610.32           732                     5      No Event
1  Weekday       441.78           530                     0      No Event
2  Weekday       213.32           256                     3      No Event
3  Weekday       195.41           234                     1      No Event
4  Weekday       559.94           672                     2      No Event

   Total_Revenue  High_Demand  Weekday  Time_Slot_Encoded  Day_Type_Encoded  \
0      162028.20            0   Monday                  1                 0
1      103350.00            0   Monday                  1                 0
2       33771.52            0  Tuesday                  3                 0
3       55830.06            0  Tuesday                  2                 0
4      121175.04            0  Tuesday                  0                 0

   Special_Event_Encoded
0                      2
1                      2
2                      2
3                      2
4                      2

In [8]:
import pandas as pd

# Load the raw price table from the "Data Collection" folder.
# keep_default_na=False keeps blank cells as empty strings instead of NaN.
# NOTE: the variable is called `merged` even though it holds price_data.csv,
# not the processed merged_bus_data.csv written by the preprocessing cell.
price_data_path = "/content/drive/MyDrive/ML PROJECT/Bus_Service_Predictions/Data Collection/price_data.csv"
merged = pd.read_csv(price_data_path, keep_default_na=False)
merged

Unnamed: 0,Date,Time_Slot,From_Location,To_Location,Distance_km,Ticket_Price,Discount_Offered (%),Special_Event
0,1/1/2024,Evening,Visakhapatnam,Secunderabad,610.32,732,5,
1,1/1/2024,Evening,Nizamabad,Guntur,441.78,530,0,
2,2/1/2024,Night,Hyderabad,Kurnool,213.32,256,3,
3,2/1/2024,Morning,Guntur,Rajahmundry,195.41,234,1,
4,2/1/2024,Afternoon,Tirupati,Secunderabad,559.94,672,2,
...,...,...,...,...,...,...,...,...
1995,5/30/2025,Morning,Tenali,Vijayawada,33.07,40,0,
1996,5/30/2025,Evening,Nizamabad,Kurnool,385.09,462,2,
1997,5/31/2025,Evening,Mahbubnagar,Rajahmundry,495.83,595,10,Weekend
1998,5/31/2025,Morning,Rajahmundry,Vijayawada,158.45,190,10,Weekend
