In [None]:

import pandas as pd

# Build a per-line yearly ridership table (one row per rail line, one
# `ridership_<year>` column per year) from the daily headline ridership CSV
# and save it as transport_yearly_sum.csv.

# Load the daily ridership CSV
df = pd.read_csv("ridership_headline.csv")

# 1. Fill empty values (NaN) with 0 in every column except `date`.
#    Filling a missing date with 0 would hand `pd.to_datetime` a bogus value
#    (either misparsed as a date or rejected); left as NaN it becomes NaT and
#    the row simply drops out of the per-year grouping below.
df = df.fillna({col: 0 for col in df.columns if col != "date"})

# 2. Remove all bus columns
df = df.drop(columns=[c for c in df.columns if c.startswith("bus_")])

# 3. Combine ETS, Intercity, Komuter_Utara, Tebrau, and Komuter into one column
df["rail_komuter"] = (
    df["rail_ets"]
    + df["rail_intercity"]
    + df["rail_komuter_utara"]
    + df["rail_tebrau"]
    + df["rail_komuter"]
)

# 4. Drop the component columns that are now folded into `rail_komuter`
df = df.drop(columns=["rail_ets", "rail_intercity", "rail_komuter_utara", "rail_tebrau"])

# 5. Give the remaining rail columns their display names.
#    Non-inplace rename so the step is a plain reassignment like the others.
df = df.rename(columns={
    "rail_lrt_ampang": "Ampang Line",
    "rail_lrt_kj": "Kelana Jaya Line",
    "rail_monorail": "KL Monorail",
    "rail_mrt_pjy": "Putrajaya Line",
    "rail_mrt_kajang": "Kajang Line",
    "rail_komuter": "KTM"
})

# 6. Parse the date and extract the year
df["date"] = pd.to_datetime(df["date"])
df["year"] = df["date"].dt.year

# 7. Group by year and sum numeric columns (the datetime column is excluded
#    by numeric_only=True; rows whose year is missing are not grouped)
df_yearly = df.groupby("year", as_index=False).sum(numeric_only=True)

# 8. Reshape so that each line is a row and each year is a column:
#    first to long form (year, route_line, total_ridership), then pivot back
#    out with the years as columns.
df_yearly = df_yearly.melt(
    id_vars=["year"],
    var_name="route_line",
    value_name="total_ridership"
)
df_yearly = df_yearly.pivot(
    index="route_line",
    columns="year",
    values="total_ridership"
)

# int(...) keeps the column label clean even if the year came through as a float
df_yearly.columns = [f"ridership_{int(c)}" for c in df_yearly.columns]

df_yearly = df_yearly.reset_index()

# 9. Save to new file
df_yearly.to_csv("transport_yearly_sum.csv", index=False)



In [1]:
import pandas as pd

# Build a long-format daily ridership table (one row per date and rail line)
# for 2020-2024 and save it as ridership_filtered.csv.

# Load the original daily CSV
df = pd.read_csv("ridership_headline.csv")

# Fill empty values with 0 in every column except `date`: a missing date
# must stay missing so it parses to NaT and is removed by the year filter,
# instead of being turned into the bogus value 0.
df = df.fillna({col: 0 for col in df.columns if col != "date"})

# Combine ETS, Intercity, Komuter_Utara, Tebrau, and Komuter into one commuter column
df["rail_komuter_total"] = (
    df["rail_ets"] +
    df["rail_intercity"] +
    df["rail_komuter_utara"] +
    df["rail_tebrau"] +
    df["rail_komuter"]
)

# Keep only the date and the rail lines of interest.
# .copy() makes this an independent frame, so the rename and the `date`
# assignment below do not operate on a selection of the original frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = df[["date","rail_lrt_ampang","rail_mrt_kajang","rail_lrt_kj","rail_monorail","rail_mrt_pjy","rail_komuter_total"]].copy()

# Give the rail columns their display names (non-inplace rename)
df = df.rename(columns={
    "rail_lrt_ampang": "Ampang Line",
    "rail_lrt_kj": "Kelana Jaya Line",
    "rail_monorail": "KL Monorail",
    "rail_mrt_pjy": "Putrajaya Line",
    "rail_mrt_kajang": "Kajang Line",
    "rail_komuter_total": "KTM"
})

# Parse dates and keep 2020-2024 inclusive
df["date"] = pd.to_datetime(df["date"])
year = df["date"].dt.year
df = df[(year >= 2020) & (year <= 2024)]

# Pivot longer so each row = route + day
df_long = df.melt(
    id_vars=["date"],
    value_vars=["Ampang Line","Kajang Line","Kelana Jaya Line","KL Monorail","Putrajaya Line","KTM"],
    var_name="route_line",
    value_name="ridership"
)

# Save CSV
df_long.to_csv("ridership_filtered.csv", index=False)


In [2]:
import geopandas as gpd

# Filter the Selangor route geometries down to the rail lines of interest,
# normalise one line name, and write the result to routes_selangor2.geojson.
gdf = gpd.read_file("../map_data/routes_selangor.geojson")

# Case-insensitive regex match on the English route name; rows with no
# English name are treated as non-matching (na=False).
wanted = gdf["name:en"].str.contains("Kelana Jaya|Ampang|Putrajaya|Kajang|KTM|Monorail", case=False, na=False)

# .copy() detaches the filtered rows from the original frame, so the column
# assignment below is not a write into a slice (SettingWithCopyWarning).
gdf = gdf[wanted].copy()

# Rename the combined Ampang / Sri Petaling entry to "Ampang Line"
gdf["name:en"] = gdf["name:en"].replace(
    {"Ampang and Sri Petaling Lines" : "Ampang Line"}
)

# Show the distinct line names that remain, as a quick sanity check
print(gdf["name:en"].unique())
gdf.to_file("routes_selangor2.geojson", driver="GeoJSON")


['Putrajaya Line' 'KTM' 'Kajang Line' 'Kelana Jaya Line' 'KL Monorail'
 'Ampang Line']


In [2]:
import pandas as pd

# Extract the 2020-2024 'level' rows of the fuel price data, keeping only
# the date and the RON95 / RON97 columns, and save them to a new CSV.
df = pd.read_csv('fuelprice.csv')

# Parse dates; values that cannot be parsed become NaT (errors='coerce')
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Columns carried into the output
columns_to_keep = ['date', 'ron95', 'ron97']

# Row conditions: series_type must be 'level' and the year must fall in
# 2020-2024 inclusive. NaT dates fail both year comparisons and are dropped.
is_level = df['series_type'] == 'level'
year = df['date'].dt.year
in_range = (year >= 2020) & (year <= 2024)

# Apply row and column selection in one step (original index is preserved)
df = df.loc[is_level & in_range, columns_to_keep]

# Save filtered data
df.to_csv('fuelprice_filtered.csv', index=False)

print(df.head())


         date  ron95  ron97
41 2024-12-26   2.05   3.25
42 2024-12-19   2.05   3.22
43 2024-12-12   2.05   3.19
44 2024-12-05   2.05   3.19
45 2024-11-28   2.05   3.19


In [15]:
import pandas as pd

# Add absolute population and population density to the district population
# table, keep 2020-2024, and write the result back to
# population_district_filtered.csv.

# Read CSV files
df = pd.read_csv("population_district_filtered.csv")
df_area = pd.read_csv("district_area.csv")

# Clean column names: trim, lower-case, spaces -> underscores
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df_area.columns = df_area.columns.str.strip().str.lower().str.replace(" ", "_")

# This cell overwrites the file it reads, so on a re-run the input already
# contains the columns derived below. Merging that with the area table would
# produce `area_km2_x` / `area_km2_y` and the density line would fail with a
# KeyError. Dropping any previously derived columns first makes the cell safe
# to run repeatedly; on a first run this is a no-op.
derived_cols = ["population_absolute", "area_km2", "population_density"]
df = df.drop(columns=derived_cols, errors="ignore")

print("Population columns:", df.columns.tolist())
print("Area columns:", df_area.columns.tolist())

# Convert population to absolute numbers
# NOTE(review): the x1000 factor assumes `population` is in thousands - confirm against the data source
df["population_absolute"] = df["population"] * 1000

# Merge and calculate density.
# Left join: districts with no match in the area table keep their rows and
# get a missing area (and therefore a missing density).
merged_df = pd.merge(df, df_area, on="district", how="left")
merged_df["population_density"] = merged_df["population_absolute"] / merged_df["area_km2"]

# Filter to 2020-2024 inclusive and export
filtered_df = merged_df[(merged_df["year"] >= 2020) & (merged_df["year"] <= 2024)]
filtered_df.to_csv("population_district_filtered.csv", index=False)


Population columns: ['year', 'state', 'district', 'sex', 'age', 'ethnicity', 'population']
Area columns: ['district', 'area_km2']
