In [3]:
import pandas as pd

## • download and read the CSV file and make ID the index column

In [4]:
# Load the CSV and use its ID column as the row index in one step.
# read_csv already returns a DataFrame and splits on commas by default, so the
# extra DataFrame wrapper and the separate in-place set_index are unnecessary.
df = pd.read_csv('../data/auto.csv', index_col='ID')

## • count the number of observations using the method count()

In [5]:
# Number of non-missing observations in each column of the raw data.
df.count()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## • drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, Fines

In [6]:
# Two rows count as duplicates when they agree on all of these columns;
# of each duplicate group only the last occurrence is kept.
dedup_columns = ['CarNumber', 'Make_n_model', 'Fines']
df.drop_duplicates(subset=dedup_columns, keep='last', inplace=True)

In [7]:
# Non-missing observations per column after the duplicates were dropped.
df.count()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

In [8]:
# Column labels whose missing-value counts are reported by the cells below.
names = [
    'CarNumber',
    'Make_n_model',
    'Refund',
    'Fines',
    'History',
]

## • work with missing values
## [How To Get Number of Missing Values in Each Column in Pandas](https://cmdlinetips.com/2020/11/how-to-get-number-of-missing-values-in-each-column-in-pandas/)

In [9]:
def print_isna(names, dframe):
    """Print how many values are missing in each requested column.

    Parameters
    ----------
    names : iterable
        Column labels to report on, in the order they should be printed.
    dframe : pandas.DataFrame
        Frame to inspect. Labels that are not columns of this frame are
        skipped silently.

    Returns
    -------
    None
        The report is written to standard output, one line per column.
    """
    present = (label for label in names if label in dframe)
    for label in present:
        missing = dframe[label].isna().sum()
        print(f'In {label} missing {missing} values')

In [10]:
# Missing-value counts per column before any missing data is handled.
print_isna(names, df)

In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 12 values
In Fines missing 60 values
In History missing 660 values


## [Remove missing values.](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html)

In [11]:
# Drop every column that has fewer than `min_non_missing` non-missing values.
# NOTE(review): `thresh` counts NON-missing values, not missing ones — confirm
# that 501 is the intended cut-off for this dataset.
min_non_missing = 501
df.dropna(axis=1, thresh=min_non_missing, inplace=True)

In [12]:
# Re-check the counts; labels that are no longer columns of df are skipped.
print_isna(names, df)


In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 12 values
In Fines missing 60 values


## [Replace all the missing values in the Refund column with the previous value](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html)

In [13]:
# Forward-fill: each missing Refund takes the previous row's value.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form is deprecated, does not update
# `df` under pandas Copy-on-Write, and `fillna(method=...)` is deprecated too.
# A missing value in the very first row has no predecessor and stays missing.
df['Refund'] = df['Refund'].ffill()

In [14]:
# Re-check the missing-value counts after forward-filling Refund.
print_isna(names, df)

In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 0 values
In Fines missing 60 values


In [15]:
# Replace missing Fines with the mean of the values that are present.
mean_fines = df['Fines'].mean(skipna=True)
# Assign the filled column back: an in-place fillna on the selected column is a
# chained assignment that does not update `df` under pandas Copy-on-Write.
df['Fines'] = df['Fines'].fillna(mean_fines)

In [16]:
# Re-check the missing-value counts after filling Fines with its mean.
print_isna(names, df)

In CarNumber missing 0 values
In Make_n_model missing 0 values
In Refund missing 0 values
In Fines missing 0 values


## • split and parse the make and model

## [apply](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html)

In [17]:
# Split on the first space only: the leading token is the make and everything
# after it is the model. Limiting the split to one cut keeps the result at two
# columns even when a model name itself contains spaces, which would otherwise
# make the two-column assignment fail. The vectorised .str accessor also avoids
# building one Series per row.
make_model_parts = (
    df['Make_n_model']
    .str.split(' ', n=1, expand=True)
    .reindex(columns=[0, 1])  # guarantee both columns even if no value has a space
)
df[['Make', 'Model']] = make_model_parts



In [18]:
# The combined column is redundant once Make and Model exist separately.
df.drop(columns=['Make_n_model'], inplace=True)

In [19]:
# Export the cleaned table as a JSON array with one object per row.
# With orient='records' the index (ID) is not written to the file.
json_path = '../data/auto.json'
df.to_json(path_or_buf=json_path, orient='records')