In [1]:
import pandas as pd

# Read the DataFrame produced by the previous notebook step.
df = pd.read_parquet("data_01_10.parquet")

# Column examined and translated in this cell.
var = "Intersection"

# Step 1: Summarise the raw column before it is modified.
# The column and its missing-value mask are computed once and reused below.
raw_column = df[var]
missing_mask = raw_column.isna()
missing_count = missing_mask.sum()
missing_percent = missing_mask.mean() * 100

print("Data type:", raw_column.dtype)
print("Number of unique non-null values:", raw_column.nunique())
print("\nSample unique values:")
print(raw_column.dropna().unique()[:20])
print(f"\nMissing values: {missing_count}")
print(f"Missing percentage: {missing_percent:.2f}%")

# Step 2: Map the Polish category labels to English.
# Values that are not keys of the mapping (including NaN) pass through unchanged.
translation_dict = {
    "Z drogą z pierwsz.": "With priority road",
    "O ruchu okrężnym": "Roundabout",
    "Równorzędne": "Equal priority",
}
df[var] = raw_column.replace(translation_dict)

# Step 3: Build the distribution table from the translated column.
# dropna=False keeps a row for missing values, so shares are relative to all rows.
value_counts = df[var].value_counts(dropna=False)
distribution_table = value_counts.rename_axis("Value").reset_index(name="Count")
total_rows = len(df)
distribution_table["Share [%]"] = (
    distribution_table["Count"] / total_rows * 100
).round(2)

# Step 4: Persist the full DataFrame (with the translated column) to Parquet.
df.to_parquet("data_01_14.parquet", index=False)
print("Saved file: data_01_14.parquet")

# Step 5: Export the distribution table to an Excel workbook.
distribution_table.to_excel("01_14_intersection_distribution.xlsx", index=False)
print("Saved table: 01_14_intersection_distribution.xlsx")


Data type: object
Number of unique non-null values: 3

Sample unique values:
['Z drogą z pierwsz.' 'O ruchu okrężnym' 'Równorzędne']

Missing values: 5217858
Missing percentage: 73.11%
Saved file: data_01_14.parquet
Saved table: 01_14_intersection_distribution.xlsx
