### This notebook covers:
* Structure and content of dataframe
* new data: nutrition info
* hands on data munging and preparation
* sampling, random numbers and more...

### Revision:
* Creating a dataframe in 5 ways
* attributes like df.ndim, df.shape, df.size, df.name, df.dtypes
* methods like 
    - df.info(verbose=True, memory_usage=True, max_col=2), 
    - pd.read_csv(use_cols=[list_of_columns], index_col=index of column to be used as index), 
    - drop('column_name', axis=1), 
    - set_index(column/list_of_columns, drop=False, append=False, verify_integrity=False)
* sample() -> parameters: 
    - n= number of records to get, 
    - frac=0.05 to get percentage samples, 
    - random_state=int to keep random state same, 
    - replace= True to get samples with replacement, 
    - weights = pd.Series for weighted samplings
* axes and index: df.axes df.index, difference between int64Index and RangeIndex
* Exctrating elements with -
    - labels(loc) 
    - indexes(iloc)
    - boolean mask
    - at/iat
* getting index from label - getloc(labelname), getting lable from index - index(int)
* astype(type) - for all columns and for specific columns 
* df.replace(to_replace='', value='', regex=False)
* df.rename(index={}, columns={}, inplace=True), df.rename(mapper={}, axis=0/1)
* dropna(how='any',axis=0/1, subset=[index/columns depends on axis parameter]), dropna(thresh=int, axis=1/0)
* filter(like='example', regex = '(?!)octopus' , items = ['a','b'], axis=0/1)
* sort_values(by=['a','b'], ascending=[False,True])
* between(15,20) - on series
* max(axis=0), min(), idxmax(), idxmin() - only on series
* nlargest(n, columns=['a','b']), nsmallest(n)

In [1]:
import pandas as pd
import numpy as np

In [2]:
names = ['John','Mak','Chetan','Alex']
ages = [23,33, 25, 28]
married = [True, False, False, True]

In [9]:
df = pd.DataFrame({'name': names, 'ages': ages, 'married': married})
df

Unnamed: 0,name,ages,married
0,John,23,True
1,Mak,33,False
2,Chetan,25,False
3,Alex,28,True


In [10]:
# Dataframes are 2D. They have labeled indices and columns

In [11]:
df.iloc[2,0]

'Chetan'

In [12]:
df.ndim, df.shape, df.size

(2, (4, 3), 12)

In [13]:
# Each column in a dataframe is a series

In [14]:
print(type(df.name))
df.name

<class 'pandas.core.series.Series'>


0      John
1       Mak
2    Chetan
3      Alex
Name: name, dtype: object

In [15]:
# Unlike series dataframe can be heterogenious

In [16]:
df.dtypes

name       object
ages        int64
married      bool
dtype: object

### Creating DataFrame:

In [17]:
# 1. with dictionary of lists:
pd.DataFrame({'name':names, 'age': ages, 'married': married})  # names, ages, and married should be of equal length

Unnamed: 0,name,age,married
0,John,23,True
1,Mak,33,False
2,Chetan,25,False
3,Alex,28,True


In [18]:
# 2. with dictionary of tuples:
pd.DataFrame({'name': tuple(names), 'age': tuple(ages), 'married': tuple(married)})  # names, ages, and married should be of equal length

Unnamed: 0,name,age,married
0,John,23,True
1,Mak,33,False
2,Chetan,25,False
3,Alex,28,True


In [19]:
# 3. with dict of series:
pd.DataFrame({'name': pd.Series(names), 'age': pd.Series(ages), 'married': pd.Series(married)})

Unnamed: 0,name,age,married
0,John,23,True
1,Mak,33,False
2,Chetan,25,False
3,Alex,28,True


In [20]:
# 4. with dict of dicts:
pd.DataFrame({'name': {x:y for x, y in enumerate(names)}, 'age': {x:y for x, y in enumerate(ages)}, 'married': {x:y for x, y in enumerate(married)}})

Unnamed: 0,name,age,married
0,John,23,True
1,Mak,33,False
2,Chetan,25,False
3,Alex,28,True


In [21]:
# 5. list of dicts:
pd.DataFrame([{'name': name, 'age': age, 'married': m} for name,age,m in zip(names, ages, married)])

Unnamed: 0,name,age,married
0,John,23,True
1,Mak,33,False
2,Chetan,25,False
3,Alex,28,True


### Info method:

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   ages     4 non-null      int64 
 2   married  4 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 196.0+ bytes


In [23]:
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Columns: 3 entries, name to married
dtypes: bool(1), int64(1), object(1)
memory usage: 196.0+ bytes


In [24]:
df.info(memory_usage=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   ages     4 non-null      int64 
 2   married  4 non-null      bool  
dtypes: bool(1), int64(1), object(1)

In [27]:
df.info(max_cols=2)  # truncated if columns are more than max_cols parameter

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Columns: 3 entries, name to married
dtypes: bool(1), int64(1), object(1)
memory usage: 196.0+ bytes


In [28]:
df.info(memory_usage='deep')  #exact memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   ages     4 non-null      int64 
 2   married  4 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 409.0 bytes


In [29]:
nutrition_url = 'https://andybek.com/pandas-nutrition'
nutrition_data = pd.read_csv(nutrition_url)

In [30]:
nutrition_data.head()

Unnamed: 0.1,Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [31]:
nutrition_data.info(verbose=False, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8789 entries, 0 to 8788
Columns: 77 entries, Unnamed: 0 to water
dtypes: int64(3), object(74)
memory usage: 39.2 MB


In [32]:
# Removing duplicate index:
nutrition_data.drop('Unnamed: 0', axis=1).head()  # axis=1 (column) , axis=0 (row)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [33]:
nutrition_data.set_index('Unnamed: 0').head()

Unnamed: 0_level_0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [43]:
nutrition_df = pd.read_csv(nutrition_url, index_col=0)
nutrition_df.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [44]:
nutrition_df.set_index('name',drop=True,inplace=True)
nutrition_df.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


### Sample() method:

In [45]:
nutrition_df.sample()   # gives a sample record from df

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Cookies, unenriched, commercially prepared, butter",100 g,467,19g,11g,117mg,351.00 mg,0,6.00 mcg,0.00 mcg,0.320 mg,...,18.80 g,11.051 g,5.522 g,0.982 g,117.00 mg,0,1.50 g,0,0,4.60 g


In [47]:
nutrition_df.sample(random_state=12)   # this will keep value same according to random_state

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Thuringer, pork, beef, summer sausage, cervelat",100 g,362,30g,12g,74mg,1300.00 mg,78.9 mg,2.00 mcg,0.00 mcg,4.310 mg,...,30.43 g,11.510 g,12.970 g,1.200 g,74.00 mg,0.0 g,3.63 g,0.00 mg,0.00 mg,45.18 g


In [48]:
nutrition_df.sample(n=3)

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Olives, green, canned or bottled, pickled",100 g,145,15g,2g,0,1556.00 mg,14.2 mg,3.00 mcg,0.00 mcg,0.237 mg,...,15.32 g,2.029 g,11.314 g,1.307 g,0.00 mg,0.0 g,4.53 g,0.00 mg,0.00 mg,75.28 g
"Nuts, smoke flavor, with salt added, oil roasted, almonds",100 g,607,56g,3.6g,0,548.00 mg,0,0,0,0,...,55.89 g,3.571 g,0,0,0.00 mg,0,2.03 g,0,0,2.80 g
"Salad dressing, regular, honey mustard",100 g,464,41g,5g,29mg,512.00 mg,20.7 mg,5.00 mcg,0.00 mcg,0.061 mg,...,40.83 g,5.000 g,11.458 g,22.839 g,29.00 mg,0.0 g,2.32 g,0.00 mg,0.00 mg,32.64 g


In [59]:
nutrition_df.sample(frac=0.0005)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
1740,"Babyfood, strained, turkey, meat",100 g,111,6.2g,1.6g,58mg,49.00 mg,39.8 mg,8.00 mcg,0.00 mcg,...,6.20 g,1.641 g,2.384 g,1.741 g,58.00 mg,0.0 g,0.58 g,0.00 mg,0.00 mg,80.32 g
4331,"Salad dressing, with salt, regular, mayonnaise...",100 g,250,22g,3.4g,19mg,653.00 mg,14.6 mg,6.00 mcg,0.00 mcg,...,21.60 g,3.396 g,5.053 g,12.187 g,19.00 mg,0.0 g,1.76 g,0.00 mg,0.00 mg,61.22 g
8731,"Fast foods, regular patty; double decker bun w...",100 g,261,14g,5g,36mg,485.00 mg,31.5 mg,27.00 mcg,18.00 mcg,...,14.10 g,5.045 g,5.088 g,3.826 g,36.00 mg,0.0 g,1.89 g,0.00 mg,0.00 mg,50.54 g
2939,"Beans, raw, mature seeds, royal red, kidney",100 g,329,0.5g,0.1g,0,13.00 mg,0,393.00 mcg,0.00 mcg,...,0.45 g,0.065 g,0.035 g,0.249 g,0.00 mg,0,4.00 g,0,0,11.90 g


In [62]:
# sample with replacement: probablity of choosing same record remains same
nutrition_df.sample(3,replace=True)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
7386,"Lamb, braised, cooked, separable lean only, br...",100 g,270,18g,6g,102mg,86.00 mg,0,0,0,...,17.53 g,6.004 g,4.317 g,0.775 g,102.00 mg,0.0 g,0.75 g,0.00 mg,0.00 mg,54.77 g
326,Vital wheat gluten,100 g,370,1.9g,0.3g,0,29.00 mg,0,0.00 mcg,0.00 mcg,...,1.85 g,0.272 g,0.156 g,0.810 g,0.00 mg,0.0 g,1.00 g,0.00 mg,0.00 mg,8.20 g
3377,"Fast foods, with beans and beef, burrito",100 g,191,7.5g,2.9g,26mg,570.00 mg,31.7 mg,73.00 mcg,24.00 mcg,...,7.47 g,2.905 g,3.257 g,0.577 g,26.00 mg,0.0 g,1.51 g,0.00 mg,0.00 mg,60.00 g


In [71]:
# weighted sampling:
weights = pd.Series(data=[10,10,10,1,2], index=[7,25,17,55,66])
nutrition_df.sample(n=3,weights=weights)

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
17,"Peppers, raw, jalapeno",100 g,29,0.4g,0.1g,0,3.00 mg,7.5 mg,27.00 mcg,0.00 mcg,...,0.37 g,0.092 g,0.029 g,0.112 g,0.00 mg,0.0 g,0.53 g,0.00 mg,0.00 mg,91.69 g
25,"PACE, Green Taco Sauce",100 g,25,0g,,0,0,0,0,0,...,0.00 g,0.000 g,0,0,0.00 mg,0,3.05 g,0,0,90.70 g
7,"Lamb, raw, ground",100 g,282,23g,10g,73mg,59.00 mg,69.3 mg,18.00 mcg,0.00 mcg,...,23.41 g,10.190 g,9.600 g,1.850 g,73.00 mg,0.0 g,0.87 g,0.00 mg,0.00 mg,59.47 g


### Axes

In [74]:
nutrition_df.axes     # rows=0 and columns=1

[Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
             ...
             8779, 8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788],
            dtype='int64', length=8789),
 Index(['name', 'serving_size', 'calories', 'total_fat', 'saturated_fat',
        'cholesterol', 'sodium', 'choline', 'folate', 'folic_acid', 'niacin',
        'pantothenic_acid', 'riboflavin', 'thiamin', 'vitamin_a',
        'vitamin_a_rae', 'carotene_alpha', 'carotene_beta',
        'cryptoxanthin_beta', 'lutein_zeaxanthin', 'lucopene', 'vitamin_b12',
        'vitamin_b6', 'vitamin_c', 'vitamin_d', 'vitamin_e', 'tocopherol_alpha',
        'vitamin_k', 'calcium', 'copper', 'irom', 'magnesium', 'manganese',
        'phosphorous', 'potassium', 'selenium', 'zink', 'protein', 'alanine',
        'arginine', 'aspartic_acid', 'cystine', 'glutamic_acid', 'glycine',
        'histidine', 'hydroxyproline', 'isoleucine', 'leucine', 'lysine',
        'methionine', 'phenylalanine', 'proline', 'ser

### Index

In [75]:
nutrition_df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            8779, 8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788],
           dtype='int64', length=8789)

In [77]:
# RangeIndex vs. int64Index
# RangeIndex is special case of int64index
# both are immutable, sequence of numbers
# RangeIndex is an optimized alternative
print(type(nutrition_df.index))

<class 'pandas.core.indexes.numeric.Int64Index'>


In [73]:
nutrition_df.index = pd.RangeIndex(start=0,stop=8789,step=1)
print(type(nutrition_df.index))
nutrition_df.set_index('name', drop=True, inplace=False, append=False, verify_integrity=False).head()
# append will create multi index, drop=False will also keep column in data
# verify_integrity=True will check if all are unique values

<class 'pandas.core.indexes.range.RangeIndex'>


KeyError: "None of ['name'] are in the columns"

### Extracting by Labels:

In [49]:
# nutrition_df.set_index('name', inplace=True)
nutrition_df.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [50]:
print(nutrition_df.loc['Eggplant, raw'])
print(nutrition_df.loc['Eggplant, raw']['calories'])
print(nutrition_df.loc['Eggplant, raw','calories'])

serving_size       100 g
calories              25
total_fat           0.2g
saturated_fat        NaN
cholesterol            0
                  ...   
alcohol            0.0 g
ash               0.66 g
caffeine         0.00 mg
theobromine      0.00 mg
water            92.30 g
Name: Eggplant, raw, Length: 75, dtype: object
25
25


In [91]:
nutrition_df.loc['Eggplant, raw':'Sherbet, orange','calories':'sodium']

Unnamed: 0_level_0,calories,total_fat,saturated_fat,cholesterol,sodium
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Eggplant, raw",25,0.2g,,0,2.00 mg
"Teff, uncooked",367,2.4g,0.4g,0,12.00 mg
"Sherbet, orange",144,2g,1.2g,1mg,46.00 mg


In [92]:
nutrition_df.loc[['Eggplant, raw','Sherbet, orange'],['calories','sodium'] ]

Unnamed: 0_level_0,calories,sodium
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Eggplant, raw",25,2.00 mg
"Sherbet, orange",144,46.00 mg


### Extracting by Position

In [95]:
print(nutrition_df.iloc[2])
print(nutrition_df.iloc[2][1])

serving_size       100 g
calories              25
total_fat           0.2g
saturated_fat        NaN
cholesterol            0
                  ...   
alcohol            0.0 g
ash               0.66 g
caffeine         0.00 mg
theobromine      0.00 mg
water            92.30 g
Name: Eggplant, raw, Length: 75, dtype: object
25


In [97]:
nutrition_df.iloc[2:5,1:5]

Unnamed: 0_level_0,calories,total_fat,saturated_fat,cholesterol
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Eggplant, raw",25,0.2g,,0
"Teff, uncooked",367,2.4g,0.4g,0
"Sherbet, orange",144,2g,1.2g,1mg


In [98]:
nutrition_df.iloc[[2,4],[1,3]]

Unnamed: 0_level_0,calories,saturated_fat
name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Eggplant, raw",25,
"Sherbet, orange",144,1.2g


### Boolean mask

In [100]:
nutrition_df.iloc[
    [True if i%2==0 else False for i in range(8789)],
    [True if i%2==0 else False for i in range(75)]
].head()

Unnamed: 0_level_0,serving_size,total_fat,cholesterol,choline,folic_acid,pantothenic_acid,thiamin,vitamin_a_rae,carotene_beta,lutein_zeaxanthin,...,carbohydrate,sugars,galactose,lactose,sucrose,saturated_fatty_acids,polyunsaturated_fatty_acids,alcohol,caffeine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,0.1g,0,0.4 mg,0.00 mcg,0.000 mg,0.000 mg,0.00 mcg,0.00 mcg,0.00 mcg,...,91.27 g,0.00 g,0,0,0,0.009 g,0.025 g,0.0 g,0.00 mg,8.32 g
"Eggplant, raw",100 g,0.2g,0,6.9 mg,0.00 mcg,0.281 mg,0.039 mg,1.00 mcg,14.00 mcg,36.00 mcg,...,5.88 g,3.53 g,0,0,0.26 g,0.034 g,0.076 g,0.0 g,0.00 mg,92.30 g
"Sherbet, orange",100 g,2g,1mg,7.7 mg,0.00 mcg,0.224 mg,0.027 mg,12.00 mcg,1.00 mcg,7.00 mcg,...,30.40 g,24.32 g,0,0,0,1.160 g,0.080 g,0.0 g,0.00 mg,66.10 g
"Taro leaves, raw",100 g,0.7g,0,12.8 mg,0.00 mcg,0.084 mg,0.209 mg,241.00 mcg,2895.00 mcg,1932.00 mcg,...,6.70 g,3.01 g,0,0,0,0.151 g,0.307 g,0.0 g,0.00 mg,85.66 g
"Cheese, camembert",100 g,24g,72mg,15.4 mg,0.00 mcg,1.364 mg,0.028 mg,241.00 mcg,12.00 mcg,0.00 mcg,...,0.46 g,0.46 g,0,0,0,15.259 g,0.724 g,0.0 g,0.00 mg,51.80 g


### Single value extraction with .at and .iat

In [102]:
# faster than loc/iloc

In [107]:
%timeit nutrition_df.loc['Eggplant, raw','calories']

12.5 µs ± 388 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [108]:
%timeit nutrition_df.at['Eggplant, raw','calories']

6.12 µs ± 264 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [109]:
%timeit nutrition_df.iloc[2,1]

27.6 µs ± 1.26 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [110]:
%timeit nutrition_df.iat[2,1]

19.1 µs ± 695 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


### get_loc() method

In [115]:
# Approach 1 - knowing label from index
index_label = nutrition_df.index[2]
print(index_label)

Eggplant, raw


In [117]:
# Approach 2 - knowing index from label
column_label = nutrition_df.columns.get_loc('calories')
print(column_label)

1


In [132]:
# Challenge: 
# 1 
nutr_mini = nutrition_df.sample(10)
nutr_mini

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Cookies, fortune",100 g,378,2.7g,0.7g,2mg,31.00 mg,6.0 mg,66.00 mcg,56.00 mcg,1.840 mg,...,2.70 g,0.669 g,1.345 g,0.466 g,2.00 mg,0.0 g,0.90 g,0.00 mg,0.00 mg,8.00 g
"Pork, roasted, heated, separable lean only, bone-in, rump, ham with natural juices, cured",100 g,137,4.3g,1.4g,72mg,861.00 mg,110.6 mg,1.00 mcg,0.00 mcg,7.620 mg,...,4.25 g,1.418 g,1.995 g,0.711 g,72.00 mg,0.0 g,3.29 g,0.00 mg,0.00 mg,67.90 g
"Beef, broiled, cooked, all grades, trimmed to 0"" fat, separable lean only, porterhouse steak, short loin",100 g,212,11g,3.9g,62mg,69.00 mg,99.3 mg,8.00 mcg,0.00 mcg,4.630 mg,...,11.18 g,3.912 g,5.192 g,0.379 g,62.00 mg,0.0 g,1.28 g,0.00 mg,0.00 mg,60.38 g
"WORTHINGTON Multigrain Cutlets, unprepared, canned",100 g,117,1.5g,0.3g,0,371.00 mg,0,2.00 mcg,0.00 mcg,26.290 mg,...,1.50 g,0.300 g,0.200 g,0.800 g,0.00 mg,0,1.22 g,0,0,66.69 g
"MORNINGSTAR FARMS Veggie Dog, unprepared, frozen",100 g,126,1.3g,0.2g,0,1076.00 mg,0,2.00 mcg,0,25.700 mg,...,1.30 g,0.200 g,0.200 g,0.600 g,0.00 mg,0,6.10 g,0,0,64.00 g
"Cereals ready-to-eat, Apple ZINGS, MALT-O-MEAL",100 g,390,2.7g,0.7g,0,454.00 mg,0,606.00 mcg,587.00 mcg,15.140 mg,...,2.71 g,0.730 g,0.550 g,0.670 g,0.00 mg,0,2.98 g,0.00 mg,0,2.50 g
"Cookies, coconut macaroon",100 g,460,23g,20g,0,241.00 mg,9.1 mg,3.00 mcg,0.00 mcg,0.220 mg,...,22.55 g,20.099 g,1.610 g,0.810 g,0.00 mg,0.0 g,1.71 g,0.00 mg,0.00 mg,11.50 g
"Milk, without added vitamin A and vitamin D, 3.25% milkfat, whole",100 g,61,3.3g,1.9g,10mg,43.00 mg,14.3 mg,5.00 mcg,0.00 mcg,0.089 mg,...,3.27 g,1.865 g,0.812 g,0.195 g,10.00 mg,0.0 g,0.67 g,0.00 mg,0.00 mg,88.13 g
"Onions, tops only, young green",100 g,27,0.5g,0.1g,0,15.00 mg,4.3 mg,30.00 mcg,0.00 mcg,0.330 mg,...,0.47 g,0.087 g,0.022 g,0.064 g,0.00 mg,0.0 g,0.51 g,0.00 mg,0.00 mg,92.32 g
"Cereals, salt, prepared with water, QUAKER MultiGrain Oatmeal, QUAKER",100 g,61,0.5g,0.1g,0,70.00 mg,0,5.00 mcg,4.00 mcg,0.655 mg,...,0.47 g,0.095 g,0.105 g,0.235 g,0.00 mg,0,0.54 g,0,0,83.44 g


In [120]:
# 2
print(nutr_mini.loc[:,['total_fat','cholesterol']])

                                                   total_fat cholesterol
name                                                                    
MOTHER'S, Coconut Cocadas Cookies                        25g        11mg
Soup, Chinese restaurant, wonton                        0.3g         4mg
Beef, braised, cooked, liver, variety meats and...      5.3g       396mg
CAMPBELL'S Red and White, condensed, Chicken an...        2g         8mg
Margarine, without salt, stick, composite, 80% ...       81g           0
Cereals ready-to-eat, KELLOGG'S SPECIAL K Red B...      1.3g           0
Melons, raw, honeydew                                   0.1g           0
Salad dressing, home recipe, french                      70g           0
Beef, raw, choice, trimmed to 1/8" fat, separab...       22g        91mg
Fast foods, hard shell, cheese and lettuce, tac...       13g        28mg


In [130]:
# 3
column_index = nutrition_df.columns.get_loc('vitamin_b12')
print(column_index)
print(nutr_mini.iloc[:3,column_index:])

20
                                                   vitamin_b12 vitamin_b6  \
name                                                                        
MOTHER'S, Coconut Cocadas Cookies                            0   0.060 mg   
Soup, Chinese restaurant, wonton                      0.09 mcg   0.076 mg   
Beef, braised, cooked, liver, variety meats and...   70.58 mcg   1.017 mg   

                                                   vitamin_c vitamin_d  \
name                                                                     
MOTHER'S, Coconut Cocadas Cookies                          0         0   
Soup, Chinese restaurant, wonton                      0.7 mg   0.00 IU   
Beef, braised, cooked, liver, variety meats and...    1.9 mg  49.00 IU   

                                                   vitamin_e tocopherol_alpha  \
name                                                                            
MOTHER'S, Coconut Cocadas Cookies                    0.45 mg          0.45 mg 

In [131]:
# 4

nutrition_df.iat[2,1]

25

In [9]:
nutrition_df.set_index('name', inplace=True)
nutrition_df.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


### astype()

In [51]:
people = pd.DataFrame({
    'names': ['John','Mak','Chetan'],
    'ages': [24,25,26],
    'languages': ['HTML','JS','PYTHON']
})
print(people.info())
people = people.astype(str)                   # for all columns
print(people.info())
people = people.astype({'ages':int})          # for specific column
print(people.info())
people = people.astype({'ages': np.int16})    # for specific column
print(people.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   names      3 non-null      object
 1   ages       3 non-null      int64 
 2   languages  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   names      3 non-null      object
 1   ages       3 non-null      object
 2   languages  3 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   names      3 non-null      object
 1   ages       3 non-null      int32 
 2   languages  3 non-null      object
dtype

### DataFrame replace() and Regex

In [14]:
nutrition_df.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [16]:
nutrition_df['serving_size'].replace(to_replace=r'\sg', value="", regex=True)
nutrition_df.head()

Unnamed: 0_level_0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100,381,0.1g,,0,9.00 mg,0.4 mg,0.00 mcg,0.00 mcg,0.000 mg,...,0.05 g,0.009 g,0.016 g,0.025 g,0.00 mg,0.0 g,0.09 g,0.00 mg,0.00 mg,8.32 g
"Nuts, pecans",100,691,72g,6.2g,0,0.00 mg,40.5 mg,22.00 mcg,0.00 mcg,1.167 mg,...,71.97 g,6.180 g,40.801 g,21.614 g,0.00 mg,0.0 g,1.49 g,0.00 mg,0.00 mg,3.52 g
"Eggplant, raw",100,25,0.2g,,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,0.649 mg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
"Teff, uncooked",100,367,2.4g,0.4g,0,12.00 mg,13.1 mg,0,0,3.363 mg,...,2.38 g,0.449 g,0.589 g,1.071 g,0,0,2.37 g,0,0,8.82 g
"Sherbet, orange",100,144,2g,1.2g,1mg,46.00 mg,7.7 mg,4.00 mcg,0.00 mcg,0.063 mg,...,2.00 g,1.160 g,0.530 g,0.080 g,1.00 mg,0.0 g,0.40 g,0.00 mg,0.00 mg,66.10 g


In [56]:
units = nutrition_df.astype(str).replace(r"[^a-zA-Z]", "",regex=True)
print(units.saturated_fat.value_counts())
print(units.saturated_fat.mode())
units.mode()

g      7199
nan    1590
Name: saturated_fat, dtype: int64
0    g
Name: saturated_fat, dtype: object


Unnamed: 0,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,niacin,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
0,g,,g,g,mg,mg,mg,mcg,mcg,mg,...,g,g,g,g,mg,g,g,mg,mg,g


### raname()

In [27]:
people = pd.DataFrame({
    'names': ['John','Mak','Chetan'],
    'ages': [24,25,26],
    'languages': ['HTML','JS','PYTHON']
})
people.head()
people.rename(index={0:'pickachu',1:'example'}, columns={'names': 'Names','ages': 'Ages'}, inplace=True)
people.head()

Unnamed: 0,Names,Ages,languages
pickachu,John,24,HTML
example,Mak,25,JS
2,Chetan,26,PYTHON


In [29]:
people.rename(mapper={'languages': 'Languages'}, axis=1) # can not apply mapper to both columns and indexes together.

Unnamed: 0,Names,Ages,Languages
pickachu,John,24,HTML
example,Mak,25,JS
2,Chetan,26,PYTHON


### dropna()

In [32]:
people.loc[2,:] = np.nan
people.loc['example','Ages'] = np.nan
people.dropna(how='any',axis=0)

Unnamed: 0,Names,Ages,languages
pickachu,John,24.0,HTML


In [34]:
people.dropna(how='all',axis=0)

Unnamed: 0,Names,Ages,languages
pickachu,John,24.0,HTML
example,Mak,,JS


In [39]:
people.dropna(thresh=2,axis=0)     # thresh = min nonna values in row

Unnamed: 0,Names,Ages,languages
pickachu,John,24.0,HTML
example,Mak,,JS


In [41]:
people.dropna(axis=1)   # by default how='any'

pickachu
example
2


In [42]:
people.dropna(how='all',axis=1)

Unnamed: 0,Names,Ages,languages
pickachu,John,24.0,HTML
example,Mak,,JS
2,,,


### dropna() with subset

In [44]:
people.loc[2,'languages'] = 'PYTHON'
people.dropna(how='any',axis=0,subset=['languages'])

Unnamed: 0,Names,Ages,languages
pickachu,John,24.0,HTML
example,Mak,,JS
2,,,PYTHON


In [45]:
people.dropna(how='any',axis=0, subset=['Names'])

Unnamed: 0,Names,Ages,languages
pickachu,John,24.0,HTML
example,Mak,,JS


In [46]:
people.dropna(how='any',axis=1, subset=['pickachu','example'])

Unnamed: 0,Names,languages
pickachu,John,HTML
example,Mak,JS
2,,PYTHON


In [57]:
# Mergin units with columns
# print(units.head())
units = units.mode()
print(units.head())

for k in units:
#     print(k)
#     print(type(units),type(units[k]))
    print(units[k].at[0])
    
units = units.replace('',np.nan).dropna(how='any',axis=1)
print('-'*50)
for k in units:
    print(units[k].at[0])

  serving_size calories total_fat saturated_fat cholesterol sodium choline  \
0            g                  g             g          mg     mg      mg   

  folate folic_acid niacin  ... fat saturated_fatty_acids  \
0    mcg        mcg     mg  ...   g                     g   

  monounsaturated_fatty_acids polyunsaturated_fatty_acids  \
0                           g                           g   

  fatty_acids_total_trans alcohol ash caffeine theobromine water  
0                      mg       g   g       mg          mg     g  

[1 rows x 75 columns]
g

g
g
mg
mg
mg
mcg
mcg
mg
mg
mg
mg
IU
mcg
mcg
mcg
mcg
mcg

mcg
mg
mg
IU
mg
mg
mcg
mg
mg
mg
mg
mg
mg
mg
mcg
mg
g
g
g
g
g
g
g
g

g
g
g
g
g
g
g
g
g
g
g
g
g
g






g
g
g
g
mg
g
g
mg
mg
g
--------------------------------------------------
g
g
g
mg
mg
mg
mcg
mcg
mg
mg
mg
mg
IU
mcg
mcg
mcg
mcg
mcg
mcg
mg
mg
IU
mg
mg
mcg
mg
mg
mg
mg
mg
mg
mg
mcg
mg
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
g
mg
g
g
mg
mg
g


In [58]:
mapper = {k:k + "_" + units[k].at[0] for k in units}
mapper

{'serving_size': 'serving_size_g',
 'total_fat': 'total_fat_g',
 'saturated_fat': 'saturated_fat_g',
 'cholesterol': 'cholesterol_mg',
 'sodium': 'sodium_mg',
 'choline': 'choline_mg',
 'folate': 'folate_mcg',
 'folic_acid': 'folic_acid_mcg',
 'niacin': 'niacin_mg',
 'pantothenic_acid': 'pantothenic_acid_mg',
 'riboflavin': 'riboflavin_mg',
 'thiamin': 'thiamin_mg',
 'vitamin_a': 'vitamin_a_IU',
 'vitamin_a_rae': 'vitamin_a_rae_mcg',
 'carotene_alpha': 'carotene_alpha_mcg',
 'carotene_beta': 'carotene_beta_mcg',
 'cryptoxanthin_beta': 'cryptoxanthin_beta_mcg',
 'lutein_zeaxanthin': 'lutein_zeaxanthin_mcg',
 'vitamin_b12': 'vitamin_b12_mcg',
 'vitamin_b6': 'vitamin_b6_mg',
 'vitamin_c': 'vitamin_c_mg',
 'vitamin_d': 'vitamin_d_IU',
 'vitamin_e': 'vitamin_e_mg',
 'tocopherol_alpha': 'tocopherol_alpha_mg',
 'vitamin_k': 'vitamin_k_mcg',
 'calcium': 'calcium_mg',
 'copper': 'copper_mg',
 'irom': 'irom_mg',
 'magnesium': 'magnesium_mg',
 'manganese': 'manganese_mg',
 'phosphorous': 'phospho

In [59]:
nutrition_df.rename(columns=mapper, inplace=True)

In [60]:
nutrition_df.replace(r'[a-zA-Z]','',regex=True,inplace=True)
nutrition_df.head()

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cornstarch,100,381,0.1,,0,9.0,0.4,0.0,0.0,0.0,...,0.05,0.009,0.016,0.025,0.0,0.0,0.09,0.0,0.0,8.32
"Nuts, pecans",100,691,72.0,6.2,0,0.0,40.5,22.0,0.0,1.167,...,71.97,6.18,40.801,21.614,0.0,0.0,1.49,0.0,0.0,3.52
"Eggplant, raw",100,25,0.2,,0,2.0,6.9,22.0,0.0,0.649,...,0.18,0.034,0.016,0.076,0.0,0.0,0.66,0.0,0.0,92.3
"Teff, uncooked",100,367,2.4,0.4,0,12.0,13.1,0.0,0.0,3.363,...,2.38,0.449,0.589,1.071,0.0,0.0,2.37,0.0,0.0,8.82
"Sherbet, orange",100,144,2.0,1.2,1,46.0,7.7,4.0,0.0,0.063,...,2.0,1.16,0.53,0.08,1.0,0.0,0.4,0.0,0.0,66.1


In [61]:
nutrition_df.dtypes.value_counts()

object    73
int64      2
dtype: int64

In [62]:
nutrition_df = nutrition_df.astype(float)

In [63]:
nutrition_df.dtypes.value_counts()

float64    75
dtype: int64

In [64]:
nutrition_df.shape

(8789, 75)

In [66]:
nutrition_df.filter(like='octopus',axis=0) 

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Mollusks, raw, common, octopus",100.0,82.0,1.0,0.2,48.0,230.0,65.0,16.0,0.0,2.1,...,1.04,0.227,0.162,0.239,48.0,0.0,1.6,0.0,0.0,80.25
"Mollusks, moist heat, cooked, common, octopus",100.0,164.0,2.1,0.5,96.0,460.0,81.0,24.0,0.0,3.78,...,2.08,0.453,0.324,0.477,96.0,0.0,3.2,0.0,0.0,60.5


In [67]:
nutrition_df.filter(regex='octopus',axis=0)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Mollusks, raw, common, octopus",100.0,82.0,1.0,0.2,48.0,230.0,65.0,16.0,0.0,2.1,...,1.04,0.227,0.162,0.239,48.0,0.0,1.6,0.0,0.0,80.25
"Mollusks, moist heat, cooked, common, octopus",100.0,164.0,2.1,0.5,96.0,460.0,81.0,24.0,0.0,3.78,...,2.08,0.453,0.324,0.477,96.0,0.0,3.2,0.0,0.0,60.5


In [68]:
nutrition_df.filter(regex='[Oo]ctopus', axis=0)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Octopus (Alaska Native),100.0,56.0,0.8,0.2,41.0,0.0,0.0,0.0,0.0,2.0,...,0.8,0.2,0.0,0.2,41.0,0.0,1.5,0.0,0.0,84.0
"Mollusks, raw, common, octopus",100.0,82.0,1.0,0.2,48.0,230.0,65.0,16.0,0.0,2.1,...,1.04,0.227,0.162,0.239,48.0,0.0,1.6,0.0,0.0,80.25
"Mollusks, moist heat, cooked, common, octopus",100.0,164.0,2.1,0.5,96.0,460.0,81.0,24.0,0.0,3.78,...,2.08,0.453,0.324,0.477,96.0,0.0,3.2,0.0,0.0,60.5


In [71]:
# nutrition_df.filter(regex='(?i)octopus',axis=0).loc[:,['serving_size_g','choline_mg','sodium_mg']]
nutrition_df.filter(regex='(?i)octopus',axis=0).filter(items=['serving_size_g','choline_mg','sodium_mg'],axis=1)

Unnamed: 0_level_0,serving_size_g,choline_mg,sodium_mg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Octopus (Alaska Native),100.0,0.0,0.0
"Mollusks, raw, common, octopus",100.0,65.0,230.0
"Mollusks, moist heat, cooked, common, octopus",100.0,81.0,460.0


### sort_values()

In [72]:
nutrition_df.choline_mg.sort_values()

name
Amaranth leaves, with salt, drained, boiled, cooked                   0.0
Cereals, dry, unenriched, regular and quick, yellow, corn grits       0.0
Puddings, prepared with whole milk, instant, dry mix, chocolate       0.0
Alcoholic Beverage, Claret, red, table, wine                          0.0
Beef, raw, sweetbread, imported, New Zealand                          0.0
                                                                    ...  
Egg, pasteurized, sugared, frozen, raw, yolk                        669.3
Egg, pasteurized, salted, frozen, raw, yolk                         705.0
Egg, fresh, raw, yolk                                               820.2
Egg, dried, whole                                                  1266.7
Egg, dried, yolk                                                   2403.3
Name: choline_mg, Length: 8789, dtype: float64

In [75]:
nutrition_df.sort_values(by='calories', ascending=False)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Fat, mutton tallow",100.0,902.0,100.0,47.0,102.0,0.0,79.8,0.0,0.0,0.00,...,100.0,47.300,40.600,7.800,102.0,0.0,0.00,0.0,0.0,0.00
"Fish oil, salmon",100.0,902.0,100.0,20.0,485.0,0.0,0.0,0.0,0.0,0.00,...,100.0,19.872,29.037,40.324,485.0,0.0,0.00,0.0,0.0,0.00
Lard,100.0,902.0,100.0,39.0,95.0,0.0,49.7,0.0,0.0,0.00,...,100.0,39.200,45.100,11.200,95.0,0.0,0.00,0.0,0.0,0.00
"Fat, beef tallow",100.0,902.0,100.0,50.0,109.0,0.0,79.8,0.0,0.0,0.00,...,100.0,49.800,41.800,4.000,109.0,0.0,0.00,0.0,0.0,0.00
"Fish oil, cod liver",100.0,902.0,100.0,23.0,570.0,0.0,0.0,0.0,0.0,0.00,...,100.0,22.608,46.711,22.541,570.0,0.0,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Beverages, decaffeinated, brewed, green, tea",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.000,0.000,0.000,0.0,0.0,0.00,0.0,0.0,99.93
"Beverages, caffeine free, cola, ZEVIA",100.0,0.0,0.0,,0.0,6.0,0.0,0.0,0.0,0.00,...,0.0,0.000,0.000,0.000,0.0,0.0,0.01,0.0,0.0,98.87
"Carbonated beverage, without caffeine, with sodium saccharin, other than cola or pepper, low calorie",100.0,0.0,0.0,,0.0,16.0,0.0,0.0,0.0,0.00,...,0.0,0.000,0.000,0.000,0.0,0.0,0.10,0.0,0.0,99.80
"Beverages, unsweetened, ready to drink, green, tea",100.0,0.0,0.0,,0.0,7.0,0.0,0.0,0.0,0.00,...,0.0,0.000,0.000,0.000,0.0,0.0,0.12,12.0,0.0,99.88


In [76]:
nutrition_df.sort_values(by=['cholesterol_mg','calories'], ascending=[False,False])

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Beef, simmered, cooked, brain, variety meats and by-products",100.0,151.0,11.0,2.4,3100.0,108.0,490.9,5.0,0.0,3.620,...,10.53,2.394,1.882,1.632,3100.0,0.0,1.46,0.0,0.0,74.86
"Veal, braised, cooked, brain, variety meats and by-products",100.0,136.0,9.6,2.2,3100.0,156.0,0.0,3.0,0.0,2.430,...,9.63,2.180,1.740,1.490,3100.0,0.0,1.40,0.0,0.0,76.89
"Beef, raw, brain, variety meats and by-products",100.0,143.0,10.0,2.3,3010.0,126.0,0.0,3.0,0.0,3.550,...,10.30,2.300,1.890,1.586,3010.0,0.0,1.51,0.0,0.0,76.29
"Lamb, soaked and fried, cooked, brains, imported, New Zealand",100.0,154.0,11.0,1.4,2559.0,101.0,0.0,0.0,0.0,2.995,...,10.92,1.365,4.168,0.999,2559.0,0.0,3.39,0.0,0.0,73.11
"Pork, braised, cooked, brain, variety meats and by-products, fresh",100.0,138.0,9.5,2.2,2552.0,91.0,0.0,4.0,0.0,3.330,...,9.51,2.150,1.720,1.470,2552.0,0.0,1.40,0.0,0.0,75.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Beverages, contains caffeine, with aspartame, other than cola or pepper, low calorie, carbonated",100.0,0.0,0.0,,0.0,6.0,0.0,0.0,0.0,0.000,...,0.00,0.000,0.000,0.000,0.0,0.0,0.10,15.0,0.0,99.80
"Beverages,,Gerolsteiner naturally sparkling mineral water, GEROLSTEINER BRUNNEN GmbH & Co. KG",100.0,0.0,0.0,,0.0,13.0,0.0,0.0,0.0,0.000,...,0.00,0.000,0.000,0.000,0.0,0.0,0.05,0.0,0.0,99.95
"Beverages, fortified, Revive Fruit Punch, Glaceau Vitamin Water, The COCA-COLA company",100.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,3.384,...,0.00,0.000,0.000,0.000,0.0,0.0,0.15,0.0,0.0,99.17
"Beverages, mineral bottled water, naturally sparkling, GEROLSTEINER BRUNNEN GmbH & Co. KG (Gerolsteiner)",100.0,0.0,0.0,,0.0,13.0,0.0,0.0,0.0,0.000,...,0.00,0.000,0.000,0.000,0.0,0.0,0.05,0.0,0.0,99.95


In [77]:
nutrition_df.loc['Beef, simmered, cooked, brain, variety meats and by-products'].filter(like='_g',axis=0).sort_values(ascending=False)

serving_size_g                   100.000
water_g                           74.860
protein_g                         11.670
total_fat_g                       11.000
fat_g                             10.530
saturated_fat_g                    2.400
saturated_fatty_acids_g            2.394
monounsaturated_fatty_acids_g      1.882
polyunsaturated_fatty_acids_g      1.632
carbohydrate_g                     1.480
ash_g                              1.460
threonine_g                        0.000
alcohol_g                          0.000
sugars_g                           0.000
fiber_g                            0.000
valine_g                           0.000
tyrosine_g                         0.000
tryptophan_g                       0.000
serine_g                           0.000
phenylalanine_g                    0.000
methionine_g                       0.000
lysine_g                           0.000
leucine_g                          0.000
isoleucine_g                       0.000
histidine_g     

### between()

In [78]:
nutrition_df.calories.between(20,40)

name
Cornstarch                                                                                            False
Nuts, pecans                                                                                          False
Eggplant, raw                                                                                          True
Teff, uncooked                                                                                        False
Sherbet, orange                                                                                       False
                                                                                                      ...  
Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round    False
Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand    False
Beef, raw, all grades, 

In [81]:
nutrition_df[nutrition_df.calories.between(10,20)].sample(3)

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Pumpkin flowers, raw",100.0,15.0,0.1,,0.0,5.0,0.0,59.0,0.0,0.69,...,0.07,0.036,0.009,0.004,0.0,0.0,0.48,0.0,0.0,95.15
"Lettuce, raw, green leaf",100.0,15.0,0.2,,0.0,28.0,13.6,38.0,0.0,0.375,...,0.15,0.02,0.006,0.082,0.0,0.0,0.62,0.0,0.0,94.98
"Peppers, raw, green, sweet",100.0,20.0,0.2,0.1,0.0,3.0,5.5,10.0,0.0,0.48,...,0.17,0.058,0.008,0.062,0.0,0.0,0.43,0.0,0.0,93.89


### min, max

In [85]:
nutrition_df.min(axis=1)  # default axis = 0

name
Cornstarch                                                                                            0.0
Nuts, pecans                                                                                          0.0
Eggplant, raw                                                                                         0.0
Teff, uncooked                                                                                        0.0
Sherbet, orange                                                                                       0.0
                                                                                                     ... 
Beef, raw, all grades, trimmed to 0" fat, separable lean and fat, boneless, top round roast, round    0.0
Lamb, cooked, separable lean only, composite of trimmed retail cuts, frozen, imported, New Zealand    0.0
Lamb, raw, separable lean and fat, composite of trimmed retail cuts, frozen, imported, New Zealand    0.0
Beef, raw, all grades, trimmed to 0" fat,

In [86]:
nutrition_df.max()

serving_size_g      100.0
calories            902.0
total_fat_g         100.0
saturated_fat_g      96.0
cholesterol_mg     3100.0
                    ...  
alcohol_g            42.5
ash_g                99.8
caffeine_mg        5714.0
theobromine_mg     2634.0
water_g             100.0
Length: 75, dtype: float64

In [90]:
# what food has most pottasium
print(nutrition_df.potassium_mg.max())
print(nutrition_df.potassium_mg.idxmax())

16500.0
Leavening agents, cream of tartar


In [91]:
nutrition_df.potassium_mg.sort_values(ascending=False)

name
Leavening agents, cream of tartar                              16500.0
Leavening agents, low-sodium, baking powder                    10100.0
Parsley, freeze-dried                                           6300.0
Beverages, unsweetened, decaffeinated, instant, tea             6040.0
Beverages, powder, unsweetened, instant, tea                    6040.0
                                                                ...   
CAMPBELL'S CHUNKY Soups, Beef with White and Wild Rice Soup        0.0
Alcoholic beverage, Pinot Gris (Grigio), white, table, wine        0.0
Cloudberries, raw (Alaska Native)                                  0.0
Oil, all purpose, soy ( partially hydrogenated), industrial        0.0
Oil, sunflower, mid-oleic, industrial                              0.0
Name: potassium_mg, Length: 8789, dtype: float64

In [93]:
# targeting potassium to sodium of 16
k_to_na = (nutrition_df.potassium_mg.replace(0,1)/nutrition_df.sodium_mg.replace(0,1)).sort_values(ascending=False)
k_to_na.head()

name
Peanut flour, low fat                                         1358.0
Nuts, raw, pistachio nuts                                     1025.0
Beverages, reduced calorie, with whitener, instant, coffee     909.0
Soybeans, raw, mature seeds                                    898.5
Soy meal, raw, defatted                                        830.0
dtype: float64

In [95]:
k_to_na[k_to_na.between(15,17)]

name
Tofu, prepared with calcium sulfate, firm, raw                                                                       16.928571
Nuts, canned (liquid expressed from grated meat and water), coconut milk                                             16.923077
Winged bean tuber, raw                                                                                               16.742857
Beverages, prepared with water, frozen concentrate, with juice and pulp, breakfast type, Orange drink                16.700000
Melons, raw, cantaloupe                                                                                              16.687500
Couscous, dry                                                                                                        16.600000
Babyfood, mango with tapioca, fruit dessert                                                                          16.500000
Vanilla extract                                                                                           

### nlargest() and nsmallest()

In [102]:
nutrition_df.nlargest(5,columns='potassium_mg').potassium_mg

name
Leavening agents, cream of tartar                      16500.0
Leavening agents, low-sodium, baking powder            10100.0
Parsley, freeze-dried                                   6300.0
Beverages, powder, unsweetened, instant, tea            6040.0
Beverages, unsweetened, decaffeinated, instant, tea     6040.0
Name: potassium_mg, dtype: float64

In [99]:
nutrition_df.potassium_mg.nlargest(5)

name
Leavening agents, cream of tartar                      16500.0
Leavening agents, low-sodium, baking powder            10100.0
Parsley, freeze-dried                                   6300.0
Beverages, powder, unsweetened, instant, tea            6040.0
Beverages, unsweetened, decaffeinated, instant, tea     6040.0
Name: potassium_mg, dtype: float64

In [103]:
nutrition_df.nlargest(5,columns=['potassium_mg','sodium_mg'])

Unnamed: 0_level_0,serving_size_g,calories,total_fat_g,saturated_fat_g,cholesterol_mg,sodium_mg,choline_mg,folate_mcg,folic_acid_mcg,niacin_mg,...,fat_g,saturated_fatty_acids_g,monounsaturated_fatty_acids_g,polyunsaturated_fatty_acids_g,fatty_acids_total_trans_mg,alcohol_g,ash_g,caffeine_mg,theobromine_mg,water_g
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Leavening agents, cream of tartar",100.0,258.0,0.0,,0.0,52.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,36.8,0.0,0.0,1.7
"Leavening agents, low-sodium, baking powder",100.0,97.0,0.4,0.1,0.0,90.0,0.0,0.0,0.0,0.0,...,0.4,0.073,0.006,0.121,0.0,0.0,46.4,0.0,0.0,6.2
"Parsley, freeze-dried",100.0,271.0,5.2,,0.0,391.0,0.0,194.0,0.0,10.4,...,5.2,0.0,0.0,0.0,0.0,0.0,19.12,0.0,0.0,2.0
"Beverages, powder, unsweetened, instant, tea",100.0,315.0,0.0,,0.0,72.0,118.3,103.0,0.0,10.8,...,0.0,0.0,0.0,0.0,0.0,0.0,16.04,5714.0,71.0,5.09
"Beverages, unsweetened, decaffeinated, instant, tea",100.0,315.0,0.0,,0.0,72.0,118.3,103.0,0.0,10.8,...,0.0,0.0,0.0,0.0,0.0,0.0,16.04,169.0,11.0,5.09


### Challenge

In [124]:
#1 find the 10 foods that have most vitamin b12. what do they have in common? 
print(nutrition_df.vitamin_b12_mcg.nlargest(10))

#2 isolate the food in dataset that contains/based on eggplant. which one has most sodium in them ? 
print(nutrition_df.filter(regex='(?i)eggplant',axis=0).sodium_mg.sort_values(ascending=False))

#3 select a slice of dataframe that contains 4 random and 2 random columns.
import random
print(nutrition_df.iloc[:,[random.randint(1,75),random.randint(1,75)]].sample(4))
print(nutrition_df.sample(4).sample(2, axis=1))

name
Mollusks, moist heat, cooked, mixed species, clam                                   98.89
Beef, boiled, cooked, variety meats and by-products liver, imported, New Zealand    96.00
Lamb, raw, liver, variety meats and by-products                                     90.05
Lamb, pan-fried, cooked, liver, variety meats and by-products                       85.70
Veal, braised, cooked, liver, variety meats and by-products                         84.60
Beef, raw, liver, variety meats and by-products, imported, New Zealand              84.50
Beef, pan-fried, cooked, liver, variety meats and by-products                       83.13
Lamb, braised, cooked, kidneys, variety meats and by-products                       78.90
Lamb, braised, cooked, liver, variety meats and by-products                         76.50
Veal, pan-fried, cooked, liver, variety meats and by-products                       72.50
Name: vitamin_b12_mcg, dtype: float64
name
Eggplant, pickled                                  1

In [139]:
#1 Remove all the food items that contains at least 1 NaN value with inplace =True, how many items remains? 
nutrition_df.dropna(inplace=True)
print(nutrition_df.shape)

#2 from remaining df, isolate those that have 20 to 40 mg of vitamin c per 100 gm of servings.Of these which is least caloric? 
vitamin_c_df = nutrition_df[nutrition_df.vitamin_c_mg.between(20,40)]
print(vitamin_c_df.vitamin_c_mg)
print(vitamin_c_df.calories.nsmallest(5))

#3 how many items have vitamic c between 2 and 3 std deviations(including) above the mean? 
print(nutrition_df.vitamin_c_mg.describe())
print(nutrition_df[nutrition_df.vitamin_c_mg.between(5.553369+2*46.104385, 5.553369+3*46.104385)].shape)

(7199, 75)
name
Broccoli, raw, chinese                                                                                             29.6
Broccoli raab, cooked                                                                                              37.0
Horseradish, prepared                                                                                              24.9
Spices, white, pepper                                                                                              21.0
Dandelion greens, raw                                                                                              35.0
                                                                                                                   ... 
Beverages, fortified, ready to drink, milk and soy based, chocolate drink                                          25.3
Cereals ready-to-eat, Peanut Butter, Multi Grain CHEERIOS, GENERAL MILLS                                           21.4
Infant formula, with ARA