In [1]:
import pandas as pd

# 1. Download and read the CSV file, making ID the index column.

In [2]:
# Load the raw data; the ID column is used as the row index rather than kept as a regular column.
csv_path = '../data/auto.csv'
auto = pd.read_csv(csv_path, index_col='ID')

# 2. Count the number of observations using the method count().

In [3]:
# Number of non-missing values in each column (missing entries are not counted).
auto.count()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

# 3. Drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, and Fines.
- Between two equal observations, choose the last one.
- Check the number of observations again.

In [4]:
# Two rows count as duplicates when they agree on all of these columns;
# keep='last' retains the final occurrence of each duplicated group.
dedup_keys = ['CarNumber', 'Make_n_model', 'Fines']
auto = auto.drop_duplicates(subset=dedup_keys, keep='last')
auto.count()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

# 4. Work with missing values. 
- Check how many values are missing from each column.

In [5]:
# Missing values per column: flag every missing cell, then total the flags column-wise.
auto.isna().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

- Drop all columns with over 500 missing values using the argument thresh. Check how many missing values are in each column.

In [6]:
# `thresh` is the minimum number of non-missing values a column needs in order to be kept,
# so "at most 500 missing" translates to "at least len(auto) - 500 present".
max_missing = 500
min_present = len(auto) - max_missing
auto = auto.dropna(axis='columns', thresh=min_present)
auto.isna().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

- Replace all the missing values in the Refund column with the previous value in that column for that cell. Use forward fill (`ffill()`, which replaces the deprecated `method='ffill'` argument of `fillna`) and check how many values are missing from each column.

In [7]:
# Forward fill: each missing Refund takes the closest non-missing value above it.
refund_filled = auto['Refund'].ffill()
auto['Refund'] = refund_filled
auto.isna().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

- Replace all the missing values in the Fines column with the mean value of this column (excluding NA/NULL values when computing the mean). Check how many values are missing from each column.

In [8]:
# The mean is computed over the non-missing fines only (skipna is the default, spelled out here),
# and that single value then replaces every missing entry.
mean_fines = auto['Fines'].mean(skipna=True)
auto['Fines'] = auto['Fines'].fillna(value=mean_fines)
auto.isna().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

# 5. Split and parse the make and model.
- Use the apply method for both splitting and extracting values to the new columns, Make and Model.

In [9]:
def split_make_model(value):
    """Split a combined "make model" string into its two parts.

    The text is cut at the first space only, so everything after that
    space (further spaces included) is returned as the model.

    Parameters
    ----------
    value : str or missing value
        The combined make-and-model text.

    Returns
    -------
    pandas.Series
        Two elements, ``[make, model]``. Both are ``pd.NA`` when ``value``
        is missing; the model is ``pd.NA`` when ``value`` contains no space.
    """
    if pd.isna(value):
        return pd.Series([pd.NA, pd.NA])

    make, separator, model = value.partition(' ')
    # An empty separator means no space was found, i.e. there is no model part.
    return pd.Series([make, model if separator else pd.NA])
# Each row yields a two-element Series, so apply() produces a two-column frame
# whose columns are assigned, in order, to Make and Model.
make_model_parts = auto['Make_n_model'].apply(split_make_model)
auto[['Make', 'Model']] = make_model_parts
auto[['Make', 'Model']].head(5)

Unnamed: 0_level_0,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Ford,Focus
1,Toyota,Camry
2,Ford,Focus
3,Ford,Focus
5,Ford,Focus


- Drop the column Make_n_model.

In [10]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result has to be assigned back for the column to actually be removed from `auto`.
auto = auto.drop(columns='Make_n_model')
auto

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


- Save the dataframe in the auto.json JSON file in the format below

In [11]:
# Keep only the columns to export, in this order.
export_columns = ['CarNumber', 'Refund', 'Fines', 'Make', 'Model']
auto = auto[export_columns]

# The ID index is discarded before writing; orient='records' emits one JSON object per row.
records_frame = auto.reset_index(drop=True)
records_frame.to_json('../data/auto.json', orient='records', indent=4)

In [12]:
# Read the exported file back so the saved data can be inspected.
json_path = '../data/auto.json'
df2 = pd.read_json(json_path, orient='records')

In [13]:
# Non-missing values per column in the frame read back from the JSON file.
df2.count()

CarNumber    725
Refund       725
Fines        725
Make         725
Model        716
dtype: int64

In [14]:
# Mean of the Fines column as read back from the JSON file.
df2['Fines'].mean(skipna=True)

np.float64(8594.586466165412)

In [15]:
# Mean of the Refund column as read back from the JSON file.
df2['Refund'].mean(skipna=True)

np.float64(1.5172413793103448)