In [2]:
import pandas as pd
import numpy as np

In [2]:
## Load example datasets

# Every dataset is downloaded from a short URL, so this cell needs network access.
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
movies = pd.read_csv('http://bit.ly/imdbratings')
orders = pd.read_csv('http://bit.ly/chiporders', sep='\t')
# '$' is a regex metacharacter, so pass regex=False to strip it as a literal
# character regardless of the default used by the installed pandas version.
orders['item_price'] = orders.item_price.str.replace('$', '', regex=False).astype('float')
stocks = pd.read_csv('http://bit.ly/smallstocks', parse_dates=['Date'])
titanic = pd.read_csv('http://bit.ly/kaggletrain')
ufo = pd.read_csv('http://bit.ly/uforeports', parse_dates=['Time'])

## 1. Show installed versions

In [3]:
pd.__version__

'2.2.1'

In [4]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit                : bdc79c146c2e32f2cab629be240f01658cfb6cc2
python                : 3.12.2.final.0
python-bits           : 64
OS                    : Windows
OS-release            : 10
Version               : 10.0.19045
machine               : AMD64
processor             : AMD64 Family 23 Model 96 Stepping 1, AuthenticAMD
byteorder             : little
LC_ALL                : None
LANG                  : None
LOCALE                : English_United States.1252

pandas                : 2.2.1
numpy                 : 1.26.4
pytz                  : 2024.1
dateutil              : 2.9.0.post0
setuptools            : 69.2.0
pip                   : 24.0
Cython                : None
pytest                : None
hypothesis            : None
sphinx                : None
blosc                 : None
feather               : None
xlsxwriter            : None
lxml.etree            : None
html5lib              : None
pymysql               : None
psycopg2     

## 2. Create an example DataFrame

In [5]:
df = pd.DataFrame({'col one':[100, 200], 'col two':[300, 400]})
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [6]:
df = pd.DataFrame(np.random.rand(4, 8))
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.918815,0.169847,0.743877,0.722636,0.643721,0.320729,0.800669,0.992813
1,0.485331,0.636395,0.769286,0.274947,0.178173,0.494972,0.222654,0.047439
2,0.14794,0.562052,0.922748,0.31074,0.460856,0.602303,0.980179,0.824746
3,0.695853,0.273832,0.787563,0.406262,0.780232,0.614678,0.601713,0.005522


In [7]:
df = pd.DataFrame(np.random.rand(4, 8), columns=list('abcdefgh'))
df

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.657502,0.733391,0.980932,0.714253,0.007784,0.108489,0.015486,0.172645
1,0.575237,0.028112,0.012031,0.95772,0.761077,0.590146,0.873312,0.941893
2,0.656916,0.469147,0.335731,0.670765,0.822912,0.586249,0.133884,0.558152
3,0.708255,0.631925,0.530021,0.377662,0.466848,0.444636,0.514931,0.378267


## 3. Rename columns

In [8]:
df = pd.DataFrame({'col one':[100, 200], 'col two':[300, 400]})
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [9]:
df1 = df.rename({'col one':'col_one', 'col two':'col_two'}, axis='columns')
df1

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [10]:
df.columns = df.columns.str.replace(' ', '_')
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [11]:
df.add_prefix('X_')

Unnamed: 0,X_col_one,X_col_two
0,100,300
1,200,400


In [12]:
df.add_suffix('_Y')

Unnamed: 0,col_one_Y,col_two_Y
0,100,300
1,200,400


## 4. Reverse row order

In [13]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [14]:
drinks.loc[::-1].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia
189,Vietnam,111,2,1,2.0,Asia
188,Venezuela,333,100,3,7.7,South America


In [15]:
# to reset the index so that it starts at zero
drinks.loc[::-1].reset_index(drop=True).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia
3,Vietnam,111,2,1,2.0,Asia
4,Venezuela,333,100,3,7.7,South America


## 5. Reverse column order

In [16]:
drinks.loc[:, ::-1].head()

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria
3,Europe,12.4,312,138,245,Andorra
4,Africa,5.9,45,57,217,Angola


## 6. Select columns by data type

In [17]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [18]:
# This includes both int and float columns.
drinks.select_dtypes(include='number').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


In [19]:
drinks.select_dtypes(include='object').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


In [20]:
drinks.select_dtypes(include=['number', 'object', 'category', 'datetime']).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [21]:
drinks.select_dtypes(exclude='number').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


## 7. Convert strings to numbers

In [22]:
df = pd.DataFrame({'col_one':['1.1', '2.2', '3.3'],
                   'col_two':['4.4', '5.5', '6.6'],
                   'col_three':['7.7', '8.8', '-']})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,-


In [23]:
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [24]:
df.astype({'col_one':'float', 'col_two':'float'}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [25]:
pd.to_numeric(df.col_three, errors='coerce')

0    7.7
1    8.8
2    NaN
Name: col_three, dtype: float64

In [26]:
pd.to_numeric(df.col_three, errors='coerce').fillna(0)

0    7.7
1    8.8
2    0.0
Name: col_three, dtype: float64

In [27]:
df = pd.DataFrame({'col_one':['1.1', '2.2', '3.3'],
                   'col_two':['4.4', '5.5', '6.6'],
                   'col_three':['7.7', '8.8', '-']})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,-


In [28]:
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,0.0


In [29]:
df.dtypes

col_one      float64
col_two      float64
col_three    float64
dtype: object

## 8. Reduce DataFrame size

### pandas DataFrames are designed to fit into memory, and so sometimes you need to reduce the DataFrame size in order to work with it on your system.
### Here's the size of the drinks DataFrame:

In [30]:
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 27.5 KB


In [31]:
# The first step is to only read in the columns that you actually need, which we specify with the "usecols" parameter:
cols = ['beer_servings', 'continent']
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   beer_servings  193 non-null    int64 
 1   continent      193 non-null    object
dtypes: int64(1), object(1)
memory usage: 12.2 KB


In [32]:
# The second step is to convert any object columns containing categorical data to the category data type, which we specify with the "dtype" parameter:
# Keep in mind that the category data type will only reduce memory usage if you have a small number of categories relative to the number of rows.
dtypes = {'continent':'category'}
smaller_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols, dtype=dtypes)
smaller_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   beer_servings  193 non-null    int64   
 1   continent      193 non-null    category
dtypes: category(1), int64(1)
memory usage: 2.3 KB


## 9. Build a DataFrame from multiple files (row-wise)

In [33]:
from glob import glob

In [34]:
stock_files = sorted(glob('data/stocks*.csv'))
stock_files

['data\\stocks1.csv', 'data\\stocks2.csv', 'data\\stocks3.csv']

In [35]:
pd.concat((pd.read_csv(file) for file in stock_files))

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
0,2016-10-04,113.0,29736800,AAPL
1,2016-10-04,57.24,20085900,MSFT
2,2016-10-04,31.35,18460400,CSCO
0,2016-10-05,57.64,16726400,MSFT
1,2016-10-05,31.59,11808600,CSCO
2,2016-10-05,113.05,21453100,AAPL


In [36]:
# To avoid duplicate values in the index, we can tell the concat() function to ignore the index and instead use the default integer index:
pd.concat((pd.read_csv(file) for file in stock_files), ignore_index=True)

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


## 10. Build a DataFrame from multiple files (column-wise)

In [37]:
pd.read_csv('data/drinks1.csv').head()

Unnamed: 0,country,beer_servings,spirit_servings
0,Afghanistan,0,0
1,Albania,89,132
2,Algeria,25,0
3,Andorra,245,138
4,Angola,217,57


In [38]:
pd.read_csv('data/drinks2.csv').head()

Unnamed: 0,wine_servings,total_litres_of_pure_alcohol,continent
0,0,0.0,Asia
1,54,4.9,Europe
2,14,0.7,Africa
3,312,12.4,Europe
4,45,5.9,Africa


In [39]:
drink_files = sorted(glob('data/drinks*.csv'))

In [40]:
pd.concat((pd.read_csv(file) for file in drink_files), axis='columns').head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


## 11. Create a DataFrame from the clipboard

In [41]:
# read_clipboard() parses whatever text is currently on the system clipboard,
# so copy a table (for example, cells from a spreadsheet) before running this cell.
# NOTE(review): the output recorded below has the words of a sentence as column
# names and no rows -- presumably the clipboard held plain text, not a table.
df = pd.read_clipboard()
df

Unnamed: 0,"Thus,",only,Drama,and,Comedy,and.1,Action,movies,remain,in,the,DataFrame.


In [42]:
df.dtypes

Thus,         object
only          object
Drama         object
and           object
Comedy        object
and.1         object
Action        object
movies        object
remain        object
in            object
the           object
DataFrame.    object
dtype: object

In [43]:
df = pd.read_clipboard()
df

Unnamed: 0,"Thus,",only,Drama,and,Comedy,and.1,Action,movies,remain,in,the,DataFrame.


In [44]:
df.index

RangeIndex(start=0, stop=0, step=1)

## 12. Split a DataFrame into two random subsets

In [45]:
len(movies)

979

In [46]:
movies_1 = movies.sample(frac=0.75, random_state=1234)
movies_1

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
387,8.0,Midnight Cowboy,X,Drama,113,"[u'Dustin Hoffman', u'Jon Voight', u'Sylvia Mi..."
653,7.7,Fearless,PG-13,Action,104,"[u'Jet Li', u'Li Sun', u'Yong Dong']"
40,8.5,The Green Mile,R,Crime,189,"[u'Tom Hanks', u'Michael Clarke Duncan', u'Dav..."
913,7.5,Suspiria,X,Horror,92,"[u'Jessica Harper', u'Stefania Casini', u'Flav..."
766,7.6,The Little Mermaid,G,Animation,83,"[u'Jodi Benson', u'Samuel E. Wright', u'Rene A..."
...,...,...,...,...,...,...
368,8.0,Planet of the Apes,G,Adventure,112,"[u'Charlton Heston', u'Roddy McDowall', u'Kim ..."
505,7.8,About Time,R,Drama,123,"[u'Domhnall Gleeson', u'Rachel McAdams', u'Bil..."
595,7.7,The Purple Rose of Cairo,PG,Comedy,82,"[u'Mia Farrow', u'Jeff Daniels', u'Danny Aiello']"
940,7.4,Much Ado About Nothing,PG-13,Comedy,111,"[u'Kenneth Branagh', u'Emma Thompson', u'Keanu..."


In [47]:
# Then we can use the drop() method to drop all rows that are in "movies_1" and assign the remaining rows to "movies_2":
movies_2 = movies.drop(movies_1.index)

In [48]:
len(movies_1) + len(movies_2)

979

In [49]:
movies_1.index.sort_values()

Index([  0,   2,   5,   6,   7,   8,   9,  11,  13,  16,
       ...
       966, 967, 969, 971, 972, 974, 975, 976, 977, 978],
      dtype='int64', length=734)

In [50]:
movies_2.index.sort_values()

Index([  1,   3,   4,  10,  12,  14,  15,  18,  26,  30,
       ...
       931, 934, 937, 941, 950, 954, 960, 968, 970, 973],
      dtype='int64', length=245)

## 13. Filter a DataFrame by multiple categories

In [51]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [52]:
movies.genre.unique()

array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography',
       'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi',
       'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)

In [53]:
movies[(movies.genre == 'Action') |
       (movies.genre == 'Drama') |
       (movies.genre == 'Western')].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."


In [54]:
# Preferred: isin() selects the same rows as the chained OR conditions above, but more concisely
movies[movies.genre.isin(['Action', 'Drama', 'Western'])].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."


In [55]:
movies[~movies.genre.isin(['Action', 'Drama', 'Western'])].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."


## 14. Filter a DataFrame by largest categories

In [56]:
counts = movies.genre.value_counts()
counts

genre
Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Sci-Fi         5
Thriller       5
Film-Noir      3
Family         2
History        1
Fantasy        1
Name: count, dtype: int64

In [57]:
counts.nlargest(3)

genre
Drama     278
Comedy    156
Action    136
Name: count, dtype: int64

In [58]:
counts.nlargest(3).index

Index(['Drama', 'Comedy', 'Action'], dtype='object', name='genre')

In [59]:
counts.nlargest(3).index.to_list()

['Drama', 'Comedy', 'Action']

In [60]:
# we can pass the index object to isin(), and it will be treated like a list of genres:
# Thus, only Drama and Comedy and Action movies remain in the DataFrame.
movies[movies.genre.isin(counts.nlargest(3).index)].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."
12,8.8,Star Wars: Episode V - The Empire Strikes Back,PG,Action,124,"[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi..."


## 15. Handle missing values

In [61]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,1930-06-01 22:00:00
1,Willingboro,,OTHER,NJ,1930-06-30 20:00:00
2,Holyoke,,OVAL,CO,1931-02-15 14:00:00
3,Abilene,,DISK,KS,1931-06-01 13:00:00
4,New York Worlds Fair,,LIGHT,NY,1933-04-18 19:00:00


In [62]:
# To find out how many values are missing in each column, you can use the isna() method and then take the sum():
ufo.isna().sum()

City                  26
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [63]:
# Similarly, you can find the fraction of values that are missing in each column (multiply by 100 for a percentage) by taking the mean() of isna():
ufo.isna().mean()

City               0.001425
Colors Reported    0.842004
Shape Reported     0.144948
State              0.000000
Time               0.000000
dtype: float64

In [65]:
# If you want to drop the columns that have any missing values, you can use the dropna() method:
ufo.dropna(axis='columns').head()

Unnamed: 0,State,Time
0,NY,1930-06-01 22:00:00
1,NJ,1930-06-30 20:00:00
2,CO,1931-02-15 14:00:00
3,KS,1931-06-01 13:00:00
4,NY,1933-04-18 19:00:00


In [66]:
# if you want to drop columns in which more than 10% of the values are missing, you can set a threshold for dropna():
ufo.dropna(thresh=len(ufo)*0.9, axis='columns').head()

Unnamed: 0,City,State,Time
0,Ithaca,NY,1930-06-01 22:00:00
1,Willingboro,NJ,1930-06-30 20:00:00
2,Holyoke,CO,1931-02-15 14:00:00
3,Abilene,KS,1931-06-01 13:00:00
4,New York Worlds Fair,NY,1933-04-18 19:00:00


## 16. Split a string into multiple columns

In [67]:
df = pd.DataFrame({'name':['John Arthur Doe', 'Jane Ann Smith'],
                   'location':['Los Angeles, CA', 'Washington, DC']})
df

Unnamed: 0,name,location
0,John Arthur Doe,"Los Angeles, CA"
1,Jane Ann Smith,"Washington, DC"


In [68]:
df[['first', 'middle', 'last']] = df.name.str.split(' ', expand=True)
df

Unnamed: 0,name,location,first,middle,last
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith


In [69]:
## What if we wanted to split a string, but only keep one of the resulting columns? For example, let's split the location column on "comma space".
## If we only care about saving the city name, which ends up in column 0, we can select just that column and save it to the DataFrame:

df['city'] = df.location.str.split(', ', expand=True)[0]
df

Unnamed: 0,name,location,first,middle,last,city
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe,Los Angeles
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith,Washington


## 17. Expand a Series of lists into a DataFrame

In [70]:
df = pd.DataFrame({'col_one':['a', 'b', 'c'], 'col_two':[[10, 40], [20, 50], [30, 60]]})
df

Unnamed: 0,col_one,col_two
0,a,"[10, 40]"
1,b,"[20, 50]"
2,c,"[30, 60]"


In [71]:
# Build the frame directly from the list of lists instead of apply(pd.Series),
# which constructs a separate Series for every row. The result has the same
# integer column labels (0, 1, ...); index=df.index keeps its rows aligned with
# df so the two frames can be concatenated column-wise.
df_new = pd.DataFrame(df.col_two.tolist(), index=df.index)
df_new

Unnamed: 0,0,1
0,10,40
1,20,50
2,30,60


In [72]:
pd.concat([df, df_new], axis='columns')

Unnamed: 0,col_one,col_two,0,1
0,a,"[10, 40]",10,40
1,b,"[20, 50]",20,50
2,c,"[30, 60]",30,60


## 18. Aggregate by multiple functions

In [73]:
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98
6,3,1,Side of Chips,,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25


##### Each order has an order_id and consists of one or more rows. To figure out the total price of an order, you sum the item_price for that order_id. 
##### For example, here's the total price of order number 1:

In [75]:
orders[orders.order_id == 1].item_price.sum()

11.56

In [76]:
## If you wanted to calculate the total price of every order, you would groupby() order_id and then take the sum of item_price for each group:

orders.groupby('order_id').item_price.sum().head()

order_id
1    11.56
2    16.98
3    12.67
4    21.00
5    13.70
Name: item_price, dtype: float64

In [77]:
## To aggregate by multiple functions, you use the agg() method and pass it a list of functions such as sum() and count():

orders.groupby('order_id').item_price.agg(['sum', 'count']).head()

Unnamed: 0_level_0,sum,count
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,11.56,4
2,16.98,1
3,12.67,2
4,21.0,2
5,13.7,2


##### That gives us the total price of each order as well as the number of items in each order.

## 19. Combine the output of an aggregation with a DataFrame

In [79]:
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98
6,3,1,Side of Chips,,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25


### What if we wanted to create a new column listing the total price of each order? Recall that we calculated the total price using the sum() method:

In [80]:
orders.groupby('order_id').item_price.sum().head()

order_id
1    11.56
2    16.98
3    12.67
4    21.00
5    13.70
Name: item_price, dtype: float64

In [81]:
len(orders.groupby('order_id').item_price.sum())

1834

In [82]:
len(orders.item_price)

4622

In [83]:
# transform('sum') computes the same per-order sum, but returns it aligned to
# the original rows: one value per row of orders rather than one per order_id,
# which is why its length matches len(orders.item_price).
total_price = orders.groupby('order_id').item_price.transform('sum')
len(total_price)

4622

In [84]:
orders['total_price'] = total_price
orders.head(5)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39,11.56
1,1,1,Izze,[Clementine],3.39,11.56
2,1,1,Nantucket Nectar,[Apple],3.39,11.56
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,11.56
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,16.98


## 20. Select a slice of rows and columns

In [85]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [86]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [87]:
titanic.describe().loc['min':'max']

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


##### To only show the "five-number summary", you can use the loc accessor and pass it a slice of the "min" through the "max" row labels

In [89]:
titanic.describe().loc['min':'max', 'Pclass':'Parch']

Unnamed: 0,Pclass,Age,SibSp,Parch
min,1.0,0.42,0.0,0.0
25%,2.0,20.125,0.0,0.0
50%,3.0,28.0,0.0,0.0
75%,3.0,38.0,1.0,0.0
max,3.0,80.0,8.0,6.0


##### If you are only interested in a few columns, you can also pass it a slice of column labels.

## 21. Reshape a MultiIndexed Series

In [91]:
titanic.Survived.mean()

0.3838383838383838

In [92]:
titanic.groupby('Sex').Survived.mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

##### To calculate the survival rate across two different categories at once, you would groupby() both of those categories:

In [93]:
titanic.groupby(['Sex', 'Pclass']).Survived.mean()

Sex     Pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: Survived, dtype: float64

##### It can be hard to read and interact with data in the above format,
##### so it's often more convenient to reshape a MultiIndexed Series into a DataFrame by using the unstack() method:

In [94]:
titanic.groupby(['Sex', 'Pclass']).Survived.mean().unstack()

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


## 22. Create a pivot table

In [95]:
## To create DataFrames like the one above, you might find it more convenient to use the pivot_table() method instead:

titanic.pivot_table(index='Sex', columns='Pclass', values='Survived', aggfunc='mean')

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [96]:
## An added benefit of a pivot table is that you can easily add a margin row and column (labelled "All") by setting margins=True; each margin applies the same aggregation function across all categories:

titanic.pivot_table(index='Sex', columns='Pclass', values='Survived', aggfunc='mean', margins=True)

Pclass,1,2,3,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


#### This shows the overall survival rate as well as the survival rate by Sex and Passenger Class.

In [97]:
## You can also create a cross-tabulation just by changing the aggregation function from "mean" to "count":

titanic.pivot_table(index='Sex', columns='Pclass', values='Survived', aggfunc='count',
                    margins=True)

Pclass,1,2,3,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,94,76,144,314
male,122,108,347,577
All,216,184,491,891


#### This shows the number of records that appear in each combination of categories.

## 23. Convert continuous data into categorical data

In [98]:
titanic.Age.head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [100]:
## Age is currently continuous data, but what if you wanted to convert it into categorical data?

## One solution would be to label the age ranges, such as "child", "young adult", and "adult". The best way to do this is by using the cut() function:

pd.cut(titanic.Age, bins=[0, 18, 25, 99], labels=['child', 'young adult', 'adult']).head(10)

0    young adult
1          adult
2          adult
3          adult
4          adult
5            NaN
6          adult
7          child
8          adult
9          child
Name: Age, dtype: category
Categories (3, object): ['child' < 'young adult' < 'adult']

In [102]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [107]:
titanic['AgeGroup'] = pd.cut(titanic['Age'], bins=[0, 18, 25, 99], labels=['child', 'young adult', 'adult'])

In [108]:
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,young adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,adult
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,child
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,adult
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,child


## 24. Change display options

In [109]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,young adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult


In [110]:
pd.set_option('display.float_format', '{:.2f}'.format)

##### Note that this did not change the underlying data, only the display of the data.

In [111]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,young adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S,adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult


In [112]:
pd.reset_option('display.float_format')

## 25. Style a DataFrame

In [113]:
stocks

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


In [114]:
format_dict = {'Date':'{:%m/%d/%y}', 'Close':'${:.2f}', 'Volume':'{:,}'}

In [115]:
stocks.style.format(format_dict)

Unnamed: 0,Date,Close,Volume,Symbol
0,10/03/16,$31.50,14070500,CSCO
1,10/03/16,$112.52,21701800,AAPL
2,10/03/16,$57.42,19189500,MSFT
3,10/04/16,$113.00,29736800,AAPL
4,10/04/16,$57.24,20085900,MSFT
5,10/04/16,$31.35,18460400,CSCO
6,10/05/16,$57.64,16726400,MSFT
7,10/05/16,$31.59,11808600,CSCO
8,10/05/16,$113.05,21453100,AAPL


In [117]:
## We can apply more styling by chaining additional methods:

(stocks.style.format(format_dict)
 .highlight_min('Close', color='red')
 .highlight_max('Close', color='lightgreen')
)

Unnamed: 0,Date,Close,Volume,Symbol
0,10/03/16,$31.50,14070500,CSCO
1,10/03/16,$112.52,21701800,AAPL
2,10/03/16,$57.42,19189500,MSFT
3,10/04/16,$113.00,29736800,AAPL
4,10/04/16,$57.24,20085900,MSFT
5,10/04/16,$31.35,18460400,CSCO
6,10/05/16,$57.64,16726400,MSFT
7,10/05/16,$31.59,11808600,CSCO
8,10/05/16,$113.05,21453100,AAPL


In [123]:
(stocks.style.format(format_dict)
                .hide(axis='index')  # Hide the index
                .bar(subset='Volume', color='lightblue', align='zero')  # Bar chart for 'Volume'
                .set_caption('Stock Prices from October 2016'))  # Add caption

Date,Close,Volume,Symbol
10/03/16,$31.50,14070500,CSCO
10/03/16,$112.52,21701800,AAPL
10/03/16,$57.42,19189500,MSFT
10/04/16,$113.00,29736800,AAPL
10/04/16,$57.24,20085900,MSFT
10/04/16,$31.35,18460400,CSCO
10/05/16,$57.64,16726400,MSFT
10/05/16,$31.59,11808600,CSCO
10/05/16,$113.05,21453100,AAPL


In [130]:
# pip install matplotlib

In [137]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors

def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    """Return one ``background-color`` CSS declaration per value in ``s``.

    Written to be passed to ``Styler.apply``, which calls it with a Series
    and expects a list of CSS strings of the same length.

    Parameters
    ----------
    s : pandas.Series
        Values to colour; only ``s.values`` is read.
    m, M : number
        Lower and upper reference values that define the colour scale.
    cmap : str, default 'PuBu'
        Name of the Matplotlib colormap to sample.
    low, high : number, default 0
        Fractions of the range ``M - m`` by which the normalization bounds
        are extended below ``m`` and above ``M`` respectively.

    Returns
    -------
    list of str
        ``'background-color: #rrggbb'`` for each value, in order.
    """
    rng = M - m
    # Widen the normalization interval by the requested fractions of the range.
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    # plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
    # pyplot.get_cmap is the supported way to look a colormap up by name.
    colormap = plt.get_cmap(cmap)
    c = [colors.rgb2hex(x) for x in colormap(normed)]
    return ['background-color: %s' % color for color in c]

# Small integer frame used to demonstrate the custom gradient.
df = pd.DataFrame([[3,2,10,4],[20,1,3,2],[5,4,6,1]])
# m and M are the minimum and maximum over the whole frame, so every column is
# normalized against the same value range. high=0.2 pushes the upper
# normalization bound 20% of that range above M.
df.style.apply(background_gradient,
               cmap='PuBu',
               m=df.min().min(),
               M=df.max().max(),
               low=0,
               high=0.2)

  c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]


Unnamed: 0,0,1,2,3
0,3,2,10,4
1,20,1,3,2
2,5,4,6,1
