In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Reload the original dataset
df = pd.read_csv("Port_Data.csv")

# -----------------------------
# Reapply preprocessing and feature engineering
# -----------------------------
df_clean = df.copy()
df_clean['UN Code'].fillna('Unknown', inplace=True)
df_clean['Port Name'] = df_clean['Port Name'].str.upper().str.strip()
df_clean['Also known as'] = (
    df_clean['Also known as']
    .str.replace(r"[\[\]']", "", regex=True)
    .str.replace(",", ";")
    .str.strip()
)

df_clean['Active Ratio'] = (
    (df_clean['Arrivals(Last 24 Hours)'] + df_clean['Departures(Last 24 Hours)']) 
    / df_clean['Expected Arrivals']
).round(2)

df_clean['Traffic Density'] = (
    df_clean['Vessels in Port'] 
    / (df_clean['Arrivals(Last 24 Hours)'] + df_clean['Departures(Last 24 Hours)'])
).round(2)

df_clean['Port Activity Index'] = (
    0.4 * df_clean['Arrivals(Last 24 Hours)'] +
    0.4 * df_clean['Departures(Last 24 Hours)'] +
    0.2 * df_clean['Expected Arrivals']
)

quantiles = df_clean['Port Activity Index'].quantile([0.33, 0.66])
def categorize_activity(x):
    if x <= quantiles.iloc[0]:
        return "Low"
    elif x <= quantiles.iloc[1]:
        return "Medium"
    else:
        return "High"

df_clean['Traffic Category'] = df_clean['Port Activity Index'].apply(categorize_activity)

# -----------------------------
# Generate synthetic Shipments and Daily Report tables
# -----------------------------

np.random.seed(42)
num_shipments = 1000
sample_ports = df_clean["Port Name"].unique()

shipment_data = {
    "Container_ID": [f"CNT{10000+i}" for i in range(num_shipments)],
    "Port_Name": np.random.choice(sample_ports, num_shipments),
    "Status": np.random.choice(["Arrived", "Departed", "Delayed", "In Transit"], num_shipments, p=[0.3, 0.3, 0.2, 0.2]),
    "ETA": [datetime.now() + timedelta(hours=np.random.randint(-72, 72)) for _ in range(num_shipments)],
    "Delay_Hours": np.random.choice([0, 1, 2, 3, 5, 8, 12, 24], num_shipments, p=[0.5, 0.1, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05]),
    "Cargo_Type": np.random.choice(["Electronics", "Machinery", "Textiles", "Food", "Chemicals", "Automobiles", "Oil & Gas"], num_shipments),
}
shipments_df = pd.DataFrame(shipment_data)

num_days = 7
daily_records = []
for port in random.sample(list(sample_ports), 60):
    for day_offset in range(num_days):
        record_date = datetime.now().date() - timedelta(days=day_offset)
        vessels = np.random.randint(100, 2500)
        avg_delay = round(np.random.uniform(0.5, 12), 2)
        weather = np.random.choice(["Clear", "Cloudy", "Stormy", "Rainy", "Windy"], p=[0.4, 0.3, 0.1, 0.1, 0.1])
        remark = "Heavy congestion" if vessels > 1800 else "Moderate activity" if vessels > 1000 else "Smooth operations"
        daily_records.append([record_date, port, vessels, avg_delay, weather, remark])

daily_report_df = pd.DataFrame(daily_records, columns=["Date", "Port_Name", "Vessels_in_Port", "Avg_Delay", "Weather", "Remarks"])

# -----------------------------
# Save all datasets
# -----------------------------
port_data_path = "Port_Data_Clean.csv"
shipments_path = "Shipments.csv"
daily_report_path = "Daily_Report.csv"

df_clean.to_csv(port_data_path, index=False)
shipments_df.to_csv(shipments_path, index=False)
daily_report_df.to_csv(daily_report_path, index=False)

port_data_path, shipments_path, daily_report_path


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['UN Code'].fillna('Unknown', inplace=True)


('Port_Data_Clean.csv', 'Shipments.csv', 'Daily_Report.csv')