In [281]:
import pandas as pd
import numpy as np

# 1. Read the JSON file that you saved in ex02.
- One of the columns is a float, so let's define its format in Pandas using pd.options.display.float_format: floats should be displayed with two decimals.
- There are missing values from the Model; do not do anything with them.

In [282]:
df = pd.read_json('../data/auto.json')
pd.options.display.float_format = '{:.2f}'.format

# 2. Enrich the dataframe using a sample from that dataframe.
- Create a sample with 200 new observations using random_state = 21. 
    - The sample should not contain new combinations of the car number, make, and model, so the entire dataset will be consistent in this regard.

In [283]:
sample = df.sample(n=200, random_state=21)
sample = sample[['CarNumber', 'Make', 'Model']]
sample

Unnamed: 0,CarNumber,Make,Model
445,M0299X197RUS,Ford,Focus
22,83298C154RUS,Ford,Focus
93,H957HY161RUS,Ford,Focus
173,T941CC96RUS,Ford,Focus
697,H966HY161RUS,Ford,Focus
...,...,...,...
14,8182XX154RUS,Ford,Focus
623,X796TH96RUS,Ford,Focus
498,T011MY163RUS,Ford,Focus
536,T341CC96RUS,Volkswagen,Passat


    - There are no restrictions on the refund and fines columns. You can randomly select a value from these columns and apply it to any car number.

In [284]:
refund_fines = df[['Refund', 'Fines']].sample(n=len(sample), random_state=21, replace=True)
sample = sample.reset_index(drop=True)
refund_fines = refund_fines.reset_index(drop=True)
sample[['Refund', 'Fines']] = refund_fines
sample

Unnamed: 0,CarNumber,Make,Model,Refund,Fines
0,M0299X197RUS,Ford,Focus,1,1500.00
1,83298C154RUS,Ford,Focus,2,4000.00
2,H957HY161RUS,Ford,Focus,1,4500.00
3,T941CC96RUS,Ford,Focus,2,2000.00
4,H966HY161RUS,Ford,Focus,2,1300.00
...,...,...,...,...,...
195,8182XX154RUS,Ford,Focus,1,2000.00
196,X796TH96RUS,Ford,Focus,2,400.00
197,T011MY163RUS,Ford,Focus,1,12800.00
198,T341CC96RUS,Volkswagen,Passat,2,800.00


- Concatenate the sample with the initial dataframe to create a new dataframe, concat_rows.

In [285]:
concat_rows = pd.concat([df, sample], ignore_index=True)
concat_rows.count()

CarNumber    925
Refund       925
Fines        925
Make         925
Model        914
dtype: int64

# 3. Enrich the concat_rows dataframe with a new column containing generated data.
- Create a series named "Year" with random integers from 1980 to 2019.
- Use np.random.seed(21) before generating the years.

In [286]:
np.random.seed(21)
year = np.random.randint(1980, 2019, size=len(concat_rows))
year = pd.Series(year, name = 'Year')
year

0      1989
1      1995
2      1984
3      2015
4      2014
       ... 
920    1996
921    2002
922    1996
923    2012
924    1984
Name: Year, Length: 925, dtype: int32

- Concatenate the series with the data frame and name it fines.

In [287]:
fines = pd.concat([concat_rows, year], axis=1)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,1989
1,E432XX77RUS,1,6500.00,Toyota,Camry,1995
2,7184TT36RUS,1,2100.00,Ford,Focus,1984
3,X582HE161RUS,2,2000.00,Ford,Focus,2015
4,92918M178RUS,1,5700.00,Ford,Focus,2014
...,...,...,...,...,...,...
920,8182XX154RUS,1,2000.00,Ford,Focus,1996
921,X796TH96RUS,2,400.00,Ford,Focus,2002
922,T011MY163RUS,1,12800.00,Ford,Focus,1996
923,T341CC96RUS,2,800.00,Volkswagen,Passat,2012


# 4. Enrich the dataframe with data from another dataframe.
- Create a new dataframe with car numbers and their owners. 
    - Get the most popular surnames in the US (you can find the file surname.json in the folder datasets).

In [288]:
surnames_data = pd.read_json('../../datasets/surname.json')
surnames_list = surnames_data[0].tolist()[1:]
surnames_list

['ADAMS',
 'ALLEN',
 'ALVAREZ',
 'ANDERSON',
 'BAILEY',
 'BAKER',
 'BENNETT',
 'BROOKS',
 'BROWN',
 'CAMPBELL',
 'CARTER',
 'CASTILLO',
 'CHAVEZ',
 'CLARK',
 'COLLINS',
 'COOK',
 'COOPER',
 'COX',
 'CRUZ',
 'DAVIS',
 'DIAZ',
 'EDWARDS',
 'EVANS',
 'FLORES',
 'FOSTER',
 'GARCIA',
 'GOMEZ',
 'GONZALEZ',
 'GRAY',
 'GREEN',
 'GUTIERREZ',
 'HALL',
 'HARRIS',
 'HERNANDEZ',
 'HILL',
 'HOWARD',
 'HUGHES',
 'JACKSON',
 'JAMES',
 'JIMENEZ',
 'JOHNSON',
 'JONES',
 'KELLY',
 'KIM',
 'KING',
 'LEE',
 'LEWIS',
 'LONG',
 'LOPEZ',
 'MARTIN',
 'MARTINEZ',
 'MENDOZA',
 'MILLER',
 'MITCHELL',
 'MOORE',
 'MORALES',
 'MORGAN',
 'MORRIS',
 'MURPHY',
 'MYERS',
 'NELSON',
 'NGUYEN',
 'ORTIZ',
 'PARKER',
 'PATEL',
 'PEREZ',
 'PETERSON',
 'PHILLIPS',
 'PRICE',
 'RAMIREZ',
 'RAMOS',
 'REED',
 'REYES',
 'RICHARDSON',
 'RIVERA',
 'ROBERTS',
 'ROBINSON',
 'RODRIGUEZ',
 'ROGERS',
 'ROSS',
 'RUIZ',
 'SANCHEZ',
 'SANDERS',
 'SCOTT',
 'SMITH',
 'STEWART',
 'TAYLOR',
 'THOMAS',
 'THOMPSON',
 'TORRES',
 'TURNER',
 'WALKE

      - Create a new series with the surnames. They should not contain special characters, such as commas or brackets. The count should equal the number of unique car numbers in the sample (use `random_state = 21`).


In [289]:
unique_CarNumbers = fines['CarNumber'].unique()
np.random.seed(21)
surnames = pd.Series(np.random.choice(surnames_list, size=len(unique_CarNumbers)))
surnames

0      RICHARDSON
1            ROSS
2          MORGAN
3          BAILEY
4           LOPEZ
          ...    
526      CAMPBELL
527          HALL
528         BAKER
529          DIAZ
530        MORGAN
Length: 531, dtype: object

    - Create the dataframe `owners` with two columns: `CarNumber` and `SURNAME`.

In [290]:
owners = pd.DataFrame({
    'CarNumber': unique_CarNumbers,
    'SURNAME': surnames
})
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
526,O136HO197RUS,CAMPBELL
527,O22097197RUS,HALL
528,M0309X197RUS,BAKER
529,O673E8197RUS,DIAZ


   - Append five more observations to the `fines` dataframe. Come up with your own ideas for `CarNumber`, etc.

In [291]:
fines_1 = pd.DataFrame({
    'CarNumber': ['X999XX99RUS', 'Y888YY88RUS', 'Z777ZZ77RUS', 'C444CC44RUS', 'D333DD33RUS'],
    'Refund': [1.0, 2.0, 1.0, 2.0, 1.0],
    'Fines': [5000.0, 3000.0, 7000.0, 2000.0, 6000.0],
    'Make': ['Ford', 'Toyota', 'Skoda', 'Volkswagen', 'Ford'],
    'Model': ['Focus', 'Camry', 'Octavia', 'Golf', 'Focus'],
})
fines = pd.concat([fines, fines_1], ignore_index=True)

In [292]:
fines.count()

CarNumber    930
Refund       930
Fines        930
Make         930
Model        919
Year         925
dtype: int64

   - Delete the last 20 observations from the `owners` dataframe and add three new observations that are not the same as those added to the `fines` dataframe.

In [293]:
len(owners)

531

In [294]:
owners = owners[:-20]
new_owners = pd.DataFrame({
    'CarNumber': ['UNIQUE001', 'UNIQUE002', 'UNIQUE003'],
    'SURNAME': ['SMITH', 'JOHNSON', 'WILLIAMS']
})
owners = pd.concat([owners, new_owners], ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
511,UNIQUE001,SMITH
512,UNIQUE002,JOHNSON


- Join the two dataframes.
     - The new dataframe should contain **only** the car numbers that exist in **both** dataframes.

In [295]:
inner_join = pd.merge(fines, owners, on='CarNumber', how='inner')
inner_join

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
3,X582HE161RUS,2.00,2000.00,Ford,Focus,2015.00,BAILEY
4,92918M178RUS,1.00,5700.00,Ford,Focus,2014.00,LOPEZ
...,...,...,...,...,...,...,...
894,8182XX154RUS,1.00,2000.00,Ford,Focus,1996.00,SMITH
895,X796TH96RUS,2.00,400.00,Ford,Focus,2002.00,WATSON
896,T011MY163RUS,1.00,12800.00,Ford,Focus,1996.00,SANDERS
897,T341CC96RUS,2.00,800.00,Volkswagen,Passat,2012.00,PEREZ


     - The new dataframe should contain **all** the car numbers from **both** dataframes. (Outer Join)

In [296]:
outer_join = pd.merge(fines, owners, on='CarNumber', how='outer')
outer_join

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,704687163RUS,2.00,1400.00,Ford,Focus,2014.00,ADAMS
1,704787163RUS,2.00,2800.00,Ford,Focus,2005.00,MORGAN
2,704987163RUS,2.00,8594.59,Ford,Focus,2014.00,MITCHELL
3,705287163RUS,2.00,2000.00,Ford,Focus,1990.00,GOMEZ
4,705387163RUS,2.00,700.00,Ford,Focus,2005.00,STEWART
...,...,...,...,...,...,...,...
928,Y973O8197RUS,1.00,34800.00,Ford,Focus,2013.00,YOUNG
929,Y973O8197RUS,1.00,69600.00,Ford,Focus,1989.00,YOUNG
930,Y973O8197RUS,2.00,15300.00,Ford,Focus,2009.00,YOUNG
931,Y973O8197RUS,2.00,6800.00,Ford,Focus,2014.00,YOUNG


     - The new dataframe should contain **only** the car numbers from the `fines` dataframe.

In [297]:
left_join = pd.merge(fines, owners, on='CarNumber', how='left')
left_join

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
2,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
3,X582HE161RUS,2.00,2000.00,Ford,Focus,2015.00,BAILEY
4,92918M178RUS,1.00,5700.00,Ford,Focus,2014.00,LOPEZ
...,...,...,...,...,...,...,...
925,X999XX99RUS,1.00,5000.00,Ford,Focus,,
926,Y888YY88RUS,2.00,3000.00,Toyota,Camry,,
927,Z777ZZ77RUS,1.00,7000.00,Skoda,Octavia,,
928,C444CC44RUS,2.00,2000.00,Volkswagen,Golf,,


    - The new dataframe should contain **only** the car numbers from the `owners` dataframe. (Right Join)

In [298]:
right_join = pd.merge(fines, owners, on='CarNumber', how='right')
right_join

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,1989.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,1999.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1995.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,1992.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1984.00,MORGAN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2013.00,HILL
898,7608EE777RUS,2.00,20600.00,Skoda,Octavia,1987.00,HILL
899,UNIQUE001,,,,,,SMITH
900,UNIQUE002,,,,,,JOHNSON


In [299]:
len(fines)

930

# 5. Create a pivot table from the `fines` dataframe. It should look like this (the values are the sums of the fines), but with all the years. The values may be different for you.

In [300]:
pivot_table = fines.pivot_table(
    values='Fines',
    index=['Make', 'Model'],
    columns='Year',
    aggfunc='sum',
    fill_value=0
)
pivot_table

Unnamed: 0_level_0,Year,1980.00,1981.00,1982.00,1983.00,1984.00,1985.00,1986.00,1987.00,1988.00,1989.00,...,2009.00,2010.00,2011.00,2012.00,2013.00,2014.00,2015.00,2016.00,2017.00,2018.00
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ford,Focus,74994.59,306394.59,88994.59,163994.59,112500.0,262694.59,62200.0,161494.59,77772.93,168594.59,...,128794.59,111100.0,116994.59,143789.17,292183.76,74083.76,171500.0,102789.17,102894.59,99500.0
Ford,Mondeo,0.0,0.0,46200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,41100.0,0.0,0.0,0.0,8600.0,0.0
Skoda,Octavia,9094.59,1900.0,8894.59,0.0,500.0,10894.59,0.0,22600.0,5100.0,8594.59,...,0.0,4000.0,3000.0,1700.0,11800.0,52600.0,16394.59,35700.0,2400.0,153200.0
Toyota,Camry,12000.0,0.0,43600.0,1000.0,1000.0,0.0,19800.0,0.0,0.0,800.0,...,0.0,22400.0,0.0,7500.0,0.0,0.0,0.0,8594.59,0.0,0.0
Toyota,Corolla,0.0,6800.0,0.0,12800.0,0.0,6300.0,0.0,54300.0,0.0,7800.0,...,8594.59,6000.0,1000.0,0.0,0.0,0.0,7500.0,0.0,9200.0,0.0
Volkswagen,Golf,20800.0,8594.59,5000.0,200.0,0.0,168000.0,0.0,500.0,0.0,300.0,...,0.0,0.0,0.0,0.0,0.0,13900.0,5300.0,0.0,0.0,5000.0
Volkswagen,Jetta,0.0,1000.0,0.0,0.0,0.0,9000.0,0.0,0.0,46000.0,500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Volkswagen,Passat,900.0,4000.0,0.0,1100.0,8594.59,0.0,16000.0,2000.0,8594.59,0.0,...,21800.0,9500.0,0.0,800.0,1600.0,3800.0,0.0,0.0,0.0,0.0
Volkswagen,Touareg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 6. Save both the `fines` and `owners` dataframes to CSV files without an index.

In [301]:
fines['Year'].iloc[-5:] = [2020, 2021, 2022, 2023, 2024]

fines.to_csv('../data/fines.csv', index=False)
owners.to_csv('../data/owners.csv', index=False)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  fines['Year'].iloc[-5:] = [2020, 2021, 2022, 2023, 2024]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fines[