# Data Cleaning

In [5]:
import pandas as pd

data = pd.read_csv('raw_data.csv')

print(data)

           Date    Time Direction Type Occupancy Colour       Brand
0    25/10/2023  8.20am        in  car         1  White        Mini
1    25/10/2023  8.20am        in  car         1   Grey       Volvo
2    25/10/2023  8.20am        in  car         1   Blue        Ford
3    25/10/2023  8.20am        in  car         1    Red     Hyundai
4    25/10/2023  8.20am        in  car         1   Blue        Ford
..          ...     ...       ...  ...       ...    ...         ...
151  25/10/2023  8.45am        in  car         2  White     Hyundai
152  25/10/2023  8.45am        in  car         1  Black       Skoda
153  25/10/2023  8.45am        in  car         1  Black        Ford
154  25/10/2023  8.45am        in  car         2  Black       Volvo
155  25/10/2023  8.50am        in  car         1   Grey  Volkswagen

[156 rows x 7 columns]


# Convert Date To The Correct Format
From dd/mm/yyyy to dd-mm-yyyy

In [6]:
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y').dt.strftime('%d-%m-%Y')

print(data)

           Date    Time Direction Type Occupancy Colour       Brand
0    25-10-2023  8.20am        in  car         1  White        Mini
1    25-10-2023  8.20am        in  car         1   Grey       Volvo
2    25-10-2023  8.20am        in  car         1   Blue        Ford
3    25-10-2023  8.20am        in  car         1    Red     Hyundai
4    25-10-2023  8.20am        in  car         1   Blue        Ford
..          ...     ...       ...  ...       ...    ...         ...
151  25-10-2023  8.45am        in  car         2  White     Hyundai
152  25-10-2023  8.45am        in  car         1  Black       Skoda
153  25-10-2023  8.45am        in  car         1  Black        Ford
154  25-10-2023  8.45am        in  car         2  Black       Volvo
155  25-10-2023  8.50am        in  car         1   Grey  Volkswagen

[156 rows x 7 columns]


# Missing value checking

In [7]:
missing_values = data.isna()


#check if there is any true value (missing value)
for i in missing_values:
    missing_values_check = missing_values[i].unique()
print(missing_values_check)
print()    
    
#show row data that has missing value    
row_missing = missing_values.any(axis =1)
print(data[row_missing])

[False  True]

           Date    Time Direction       Type Occupancy Colour Brand
8    25-10-2023  8.20am        in        bus       90%   Blue   NaN
12   25-10-2023  8.20am        in        bus       75%   Blue   NaN
13   25-10-2023  8.20am        in        bus       80%  White   NaN
15   25-10-2023  8.20am        in    bicycle         1  Black   NaN
42   25-10-2023  8.30am        in        bus       25%   Blue   NaN
46   25-10-2023  8.30am        in        bus       20%   Blue   NaN
62   25-10-2023  8.35am        in        bus      100%   Blue   NaN
66   25-10-2023  8.35am        in        bus      100%   Blue   NaN
70   25-10-2023  8.35am        in    bicycle         1  Black   NaN
83   25-10-2023  8.35am        in  motorbike         1  Black   NaN
85   25-10-2023  8.35am        in        bus       80%   Blue   NaN
92   25-10-2023  8.40am        in        bus      100%   Blue   NaN
117  25-10-2023  8.45am        in        bus      100%   Blue   NaN
123  25-10-2023  8.45am        in

# Vehicle Type Check for Brand missing data 
To check if the missing data has vehicle type as 'Bus' or 'Bicycle' or 'Motorbike'

As we do not focus on the brand of those vehicle types

Thus, the brand data is not necessary to collect for those vehicle types

In [8]:
#check missing value for vehicle type apart from bus, bicycle, and motorbike 
#should not be missing
type_missing = (data['Brand'].isna()) & (data['Type'] != 'bus') & (data['Type'] != 'bicycle') & (data['Type'] != 'motorbike')
alltype_missing = data[type_missing]

print(alltype_missing)

Empty DataFrame
Columns: [Date, Time, Direction, Type, Occupancy, Colour, Brand]
Index: []


# Write to the new file

In [9]:
data.to_csv('completed_data.csv', index=False) 