In [3]:
import pandas as pd

## • download and read the CSV file and make ID the index column

In [27]:
# Read the raw CSV and key every row by its ID column in a single expression.
df = pd.read_csv('../data/auto.csv', delimiter=',').set_index('ID')

## • count the number of observations using the method count()

In [28]:
# count() reports non-null cells per column, so columns with gaps show fewer than the row total.
df.count()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## • drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, Fines

In [29]:
# Rows are duplicates when these three fields all match; the last occurrence is kept.
df = df.drop_duplicates(
    subset=['CarNumber', 'Make_n_model', 'Fines'],
    keep='last',
)

In [30]:
# Re-count the non-null cells per column now that duplicate rows are gone.
df.count()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

In [None]:
# Column labels passed to print_isna; labels no longer present in the frame are skipped there.
names = ['CarNumber', 'Make_n_model', 'Refund', 'Fines', 'History']

## • work with missing values
## [How To Get Number of Missing Values in Each Column in Pandas](https://cmdlinetips.com/2020/11/how-to-get-number-of-missing-values-in-each-column-in-pandas/)

In [36]:
def print_isna(names, dframe):
    """Print how many values are missing in each requested column.

    Parameters
    ----------
    names : iterable of str
        Column labels to report on, in the order they should be printed.
    dframe : pandas.DataFrame
        Frame to inspect. Labels that are not among its columns are
        skipped without any message.

    Returns
    -------
    None
        One line per reported column is written to standard output.
    """
    present = [col for col in names if col in dframe]
    for col in present:
        missing = dframe[col].isna().sum()
        print(f'In {col} missing {missing} values')

In [32]:
# Baseline report: missing values per column before any of them are handled.
print_isna(names, df)

In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 12 values
In Fines missing 60 values
In History missing 660 values


## [Remove missing values.](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html)

In [37]:
# Keep only the columns that hold at least 501 non-missing values.
df = df.dropna(axis=1, thresh=501)

In [38]:
# Report again; any column that dropna removed is skipped by print_isna.
print_isna(names, df)


In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 12 values
In Fines missing 60 values


## [Replace all the missing values in the Refund column with the previous value](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html)

In [39]:
# Forward-fill: each missing Refund takes the nearest preceding non-missing value.
# The result is assigned back to the column because an in-place fillna on
# df['Refund'] acts on a selected Series and may leave df itself unchanged
# (always the case under pandas Copy-on-Write). ffill() also replaces the
# deprecated fillna(method="ffill") spelling.
df['Refund'] = df['Refund'].ffill()

In [41]:
# Check that Refund no longer reports missing values after the forward fill.
print_isna(names, df)

In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 0 values
In Fines missing 60 values


In [42]:
# Impute missing Fines with the column mean (skipna=True leaves the gaps out of the mean).
mean_fines = df['Fines'].mean(skipna=True)
# Assign the filled Series back: an in-place fillna on df['Fines'] acts on a
# selected Series and may leave df itself unchanged (always the case under
# pandas Copy-on-Write).
df['Fines'] = df['Fines'].fillna(mean_fines)

In [43]:
# Final check: every remaining column should now report zero missing values.
print_isna(names, df)

In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 0 values
In Fines missing 0 values


## • split and parse the make and model

## [apply](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html)

In [46]:
# Split "Make Model" on the first space only: the text before it becomes Make,
# everything after it becomes Model. Limiting the split with n=1 always yields
# at most two columns, so a model name that itself contains a space no longer
# breaks the two-column assignment. The vectorised .str accessor also avoids
# building one pd.Series per row.
df[['Make', 'Model']] = df['Make_n_model'].str.split(' ', n=1, expand=True)



Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,Toyota Camry,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,Ford Focus,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,Ford Focus,2.0,2000.000000,Ford,Focus
5,92918M178RUS,Ford Focus,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,Ford Focus,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,Ford Focus,2.0,600.000000,Ford,Focus
929,8610T8154RUS,Ford Focus,1.0,2000.000000,Ford,Focus


In [47]:
# The combined column is redundant once Make and Model exist separately.
df = df.drop(columns='Make_n_model')

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


In [49]:
# orient='records' writes a JSON array with one object per row; the ID index is not part of that output.
df.to_json('../data/auto.json', orient='records')