### Shuffling a Dataset

In [1]:
import os
import pandas as pd
import numpy as np

# Load the Auto MPG data set; 'NA' and '?' entries are parsed as missing values.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'],
)

# Seed NumPy's global generator so the shuffle below is identical on every run.
# Comment this line out to get a different shuffle each time.
np.random.seed(42)

# Draw a random ordering of the existing row labels, then reorder the rows to
# match it. The original labels are kept, so the index appears scrambled.
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
display(df)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
198,33.0,4,91.0,53.0,1795,17.4,76,3,honda civic
396,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger
33,19.0,6,232.0,100.0,2634,13.0,71,1,amc gremlin
208,13.0,8,318.0,150.0,3940,13.2,76,1,plymouth volare premier v8
93,14.0,8,318.0,150.0,4237,14.5,73,1,plymouth fury gran sedan
...,...,...,...,...,...,...,...,...,...
71,19.0,3,70.0,97.0,2330,13.5,72,3,mazda rx2 coupe
106,12.0,8,350.0,180.0,4499,12.5,73,1,oldsmobile vista cruiser
270,21.1,4,134.0,95.0,2515,14.8,78,3,toyota celica gt liftback
348,37.7,4,89.0,62.0,2050,17.3,81,3,toyota tercel


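The following code resets the index after the shuffle.  Notice how `reset_index` renumbers the rows sequentially from 0 while keeping them in their shuffled order; `drop=True` discards the old row labels instead of keeping them as a new column.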

In [2]:
# Replace the shuffled row labels with a fresh 0..n-1 index, keeping the rows in
# their current (shuffled) order. drop=True discards the old labels rather than
# inserting them as a new column.
# Reassigning the result (instead of inplace=True) avoids mutating the frame
# object that the previous cell displayed.
df = df.reset_index(drop=True)
display(df)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,33.0,4,91.0,53.0,1795,17.4,76,3,honda civic
1,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger
2,19.0,6,232.0,100.0,2634,13.0,71,1,amc gremlin
3,13.0,8,318.0,150.0,3940,13.2,76,1,plymouth volare premier v8
4,14.0,8,318.0,150.0,4237,14.5,73,1,plymouth fury gran sedan
...,...,...,...,...,...,...,...,...,...
393,19.0,3,70.0,97.0,2330,13.5,72,3,mazda rx2 coupe
394,12.0,8,350.0,180.0,4499,12.5,73,1,oldsmobile vista cruiser
395,21.1,4,134.0,95.0,2515,14.8,78,3,toyota celica gt liftback
396,37.7,4,89.0,62.0,2050,17.3,81,3,toyota tercel


### Sorting a Data Set


In [3]:
import os
import pandas as pd

# Reload the data set ('NA' and '?' parsed as missing) and order the rows
# alphabetically by car name; ascending order is the sort_values default.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'],
).sort_values('name')

# After sorting, position 0 holds the alphabetically first name. The row labels
# still carry their original values, hence iloc (positional) rather than loc.
print(f"The first car is: {df['name'].iloc[0]}")
display(df)

The first car is: amc ambassador brougham


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
96,13.0,8,360.0,175.0,3821,11.0,73,1,amc ambassador brougham
9,15.0,8,390.0,190.0,3850,8.5,70,1,amc ambassador dpl
66,17.0,8,304.0,150.0,3672,11.5,72,1,amc ambassador sst
315,24.3,4,151.0,90.0,3003,20.1,80,1,amc concord
257,19.4,6,232.0,90.0,3210,17.2,78,1,amc concord
...,...,...,...,...,...,...,...,...,...
394,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup
309,41.5,4,98.0,76.0,2144,14.7,80,2,vw rabbit
197,29.0,4,90.0,70.0,1937,14.2,76,2,vw rabbit
325,44.3,4,90.0,48.0,2085,21.7,80,2,vw rabbit c (diesel)


### Grouping a Data Set

In [4]:
import os
import pandas as pd

# Reload the data set; 'NA' and '?' entries are parsed as missing values.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/auto-mpg.csv",
    na_values=['NA', '?'],
)

# Average mpg for each distinct cylinder count. The result is a Series named
# 'mpg' whose index holds the cylinder counts.
g = df['mpg'].groupby(df['cylinders']).mean()
g

cylinders
3    20.550000
4    29.286765
5    27.366667
6    19.985714
8    14.963107
Name: mpg, dtype: float64

It might be useful to have these **mean** values as a dictionary.

In [None]:
# Convert the grouped Series `g` into a plain dict: each index label of `g`
# (a cylinder count) becomes a key and the corresponding mean mpg its value.
d = g.to_dict()
# Bare last expression so the notebook renders the dictionary as the cell output.
d

{3: 20.55,
 4: 29.28676470588236,
 5: 27.366666666666664,
 6: 19.985714285714284,
 8: 14.963106796116508}