In [1]:
import pandas as pd

# 1. Load the dataset from the CSV file.
file_path = 'transportation.csv'
df = pd.read_csv(file_path)
print("Initial data shape:", df.shape)

# 2. Take a quick look at the data.
print("First few rows:")
print(df.head())
print("\nDataFrame info:")
print(df.info())

# 3. Check for missing values.
print("\nMissing values per column:")
print(df.isnull().sum())

# 4. (Optional) If there is a date column, convert it to a datetime format.
# Replace 'date' with the actual column name if it exists.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

# 5. Remove duplicate rows, if any.
df = df.drop_duplicates()
print("\nData shape after removing duplicates:", df.shape)

# 6. Impute or fill missing values.
# For numerical columns, fill missing values using the median.
# For categorical columns, fill missing values using the mode.
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
    else:
        # For non-numerical columns, use the most frequent value (mode)
        mode_val = df[col].mode()[0] if not df[col].mode().empty else "Unknown"
        df[col].fillna(mode_val, inplace=True)

print("\nMissing values after imputation:")
print(df.isnull().sum())

# 7. Optional: Save the cleaned dataset to a new CSV file.
df.to_csv('cleaned_transportation.csv', index=False)
print("\nPreprocessing complete. Cleaned data saved to 'cleaned_transportation.csv'.")


Initial data shape: (50000, 5)
First few rows:
   route_id  vehicle_id       departure_time         arrival_time   status
0       135        2324  2023-01-01 00:00:00  2023-01-01 00:32:00  delayed
1       111        2867  2023-01-01 00:01:00  2023-01-01 00:38:00  on_time
2       102        4183  2023-01-01 00:02:00  2023-01-01 01:02:00  delayed
3       197        4853  2023-01-01 00:03:00  2023-01-01 00:15:00  delayed
4       192        1885  2023-01-01 00:04:00  2023-01-01 00:25:00  on_time

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   route_id        50000 non-null  int64 
 1   vehicle_id      50000 non-null  int64 
 2   departure_time  50000 non-null  object
 3   arrival_time    50000 non-null  object
 4   status          50000 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.9+ MB
None

Missing values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_val, inplace=True)



Missing values after imputation:
route_id          0
vehicle_id        0
departure_time    0
arrival_time      0
status            0
dtype: int64

Preprocessing complete. Cleaned data saved to 'cleaned_transportation.csv'.
