# Exercise 02 : Preprocessing

In [1]:
import pandas as pd

## Read the CSV file and make ID the index column

In [2]:
# Load the dataset; the ID column becomes the row index instead of a regular column
csv_path = '../auto.csv'
df = pd.read_csv(csv_path, index_col='ID')

## Count the number of observations using the method count()

In [3]:
# count() returns the number of non-null values per column; the largest of those
# is taken as the number of observations (exact when some column has no missing values)
non_null_per_column = df.count()
non_null_per_column.max()

931

## Drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, Fines

- of two equal observations, keep the last one
- check the number of observations again

In [4]:
# Rows are duplicates when they agree on all three key columns;
# of each group of duplicates only the last occurrence is kept
duplicate_keys = ['CarNumber', 'Make_n_model', 'Fines']
df.drop_duplicates(
    subset=duplicate_keys,
    keep='last',
    inplace=True,
)

# Re-check the number of observations after de-duplication
remaining_non_null = df.count()
remaining_non_null.max()

725

## Work with missing values

- check how many values are missing from each column
- drop all the columns with over 500 missing values using the argument thresh, then check how many missing values are in each column
- replace each missing value in the Refund column with the previous value in that column (use the argument method), then check how many values are missing from each column
- replace all the missing values in the Fines column with the mean value of this column (exclude NA/null values when computing the mean value), then check how many values are missing from each column

In [5]:
# Baseline: number of missing values in each column before any cleaning
missing_values_before = df.isnull().sum()
print(f"Missing values before:\n{missing_values_before}")

# Drop columns with over 500 missing values.
# `thresh` is the minimum number of non-NA values a column must have to be kept,
# so "no more than 500 missing" is expressed as "at least len(df) - 500 present".
df.dropna(thresh=len(df) - 500, axis=1, inplace=True)
print(f"Missing values after drop:\n{df.isnull().sum()}")

# Replace missing values in Refund column with previous value (forward fill).
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# df['Refund'] is a chained in-place operation, which warns on pandas 2.x and
# leaves df unmodified under Copy-on-Write. Series.ffill() is the replacement
# for the deprecated fillna(method='ffill').
df['Refund'] = df['Refund'].ffill()
print(f"Missing values after Refund fill:\n{df.isnull().sum()}")

# Replace missing values in Fines column with mean value.
# Series.mean() skips NA/null values by default (skipna=True).
fines_mean = df['Fines'].mean()
df['Fines'] = df['Fines'].fillna(fines_mean)
print(f"Missing values after Fines fill:\n{df.isnull().sum()}")

Missing values before:
CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64
Missing values after drop:
CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64
Missing values after Refund fill:
CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64
Missing values after Fines fill:
CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64


## Split and parse the make and model

- use the method apply both for splitting and for extracting the values to the new columns Make and Model
- drop the column Make_n_model
- save the dataframe in the JSON file auto.json

In [6]:
# Break "Make Model" apart at the first space only (n=1), so everything after
# that space stays together in the second part
make_model_parts = df['Make_n_model'].str.split(' ', n=1, expand=True)
df[['Make', 'Model']] = make_model_parts

# The combined column is redundant once Make and Model exist
df.drop('Make_n_model', axis=1, inplace=True)

# Write one JSON object per row; reset_index() turns the ID index back into
# a regular column so that it is included in each record
records_frame = df.reset_index()
records_frame.to_json('auto.json', orient='records')

In [7]:
# Show the resulting dataframe; the rendered output abbreviates the middle rows
df

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus
