[Reference](https://towardsdatascience.com/6-lesser-nown-yet-awesome-tricks-in-pandas-32236f3785c8)

In [5]:
url = 'https://en.wikipedia.org/wiki/Table_of_food_nutrients'

In [3]:
import pandas as pd

In [6]:
dairy_table = pd.read_html(url, match='Fortified milk')

dairy_table = dairy_table[0]
print(dairy_table.head())

          Dairy products                           ...                         
                    Food Measure   Grams Calories  ... Carb. Fiber Fat Sat. fat
0      Cows' milk, whole   1 qt.   976.0    660.0  ...    48   0.0  40       36
1                   skim   1 qt.   984.0    360.0  ...    52   0.0   t        t
2   Buttermilk, cultured   1 cup   246.0    127.0  ...    13   0.0   5        4
3  Evaporated, undiluted   1 cup   252.0    345.0  ...    24   0.0  20       18
4         Fortified milk  6 cups  1419.0   1373.0  ...   119   1.4  42       23

[5 rows x 9 columns]


In [7]:
##==== 2. Startup options ====##
## To set startup file in the Spyder IDE: Perference->Ipython console->Startup->Run a file
import pandas as pd
def start_config():
    options = {
      ## Check out the link above for more options besides 'display'
        'display': {
            'max_columns': None,    ### Max # of columns
            'max_colwidth': 1000,   ### Max width of columns
            'max_rows': 1000,       ### Max # of rows 
            'precision': 3          ### Float number precision
        }
    }
    for display, optionVals in options.items():
        for setting, userVal in optionVals.items():
            pd.set_option(f'{display}.{setting}', userVal)  

if __name__ == '__main__':
    start_config()

In [8]:
## Here, we take a subset of the entire table for demonstration purpose
dairy_table_raw = dairy_table.iloc[1:23, ]
dairy_table_raw.columns = dairy_table_raw[:1].iloc[0]
dairy_table_raw = dairy_table_raw[1:].reset_index(drop=True)
dairy_table = dairy_table_raw.iloc[:, :4]

In [9]:
dairy_table

1,skim,1 qt.,984.0,360.0
0,"Buttermilk, cultured",1 cup,246.0,127.0
1,"Evaporated, undiluted",1 cup,252.0,345.0
2,Fortified milk,6 cups,1419.0,1373.0
3,"Powdered milk, whole",1 cup,103.0,515.0
4,"skim, instant",1 1/3 cups,85.0,290.0
5,"skim, non-instant",2/3 cup,85.0,290.0
6,"Goats' milk, fresh",1 cup,244.0,165.0
7,Malted milk,,,
8,(1/2 cup ice cream),2 cups,540.0,690.0
9,Cocoa,1 cup,252.0,235.0


In [10]:
### df.itertuples()
for row in dairy_table.itertuples():
    if row[0] == 0:
        print(f'{row}')
        break

Pandas(Index=0, skim='Buttermilk, cultured', _2='1 cup', _3=246.0, _4=127.0)


In [16]:
##==== 3. Use itertuples() to loop through rows ====##
## Initialize variables to store the string and missing row index
cur_str = ''    
missing_value_row = 0

for row in dairy_table.itertuples():    
    idx = row[0]
    
    ## rows with NAs: record the Food value and index
    if str(row.Measure)=='nan':
        cur_str += f'{row.Food} '
        missing_value_row = idx  
    
    ## next rows, where idx = (row with NAs)+1
    if cur_str and idx == missing_value_row+1:
        cur_str += row.Food
        dairy_table.iloc[idx, 0] = cur_str
        ## reassign cur_str back to '' for the next loop
        cur_str = ''
        
## Drop the rows with NAs
dairy_table = dairy_table.dropna(how='any') 

In [12]:
dairy_table = dairy_table.fillna(method='bfill') 

In [15]:
##==== 5. cumsum() to work with booleans ====##
### 1) Compare two consecutive rows on all columns excluding the 'Food' (i.e., first column)
keys = (dairy_table != dairy_table.shift(1)).iloc[:, 1:].astype(int).cumsum()

### 2) Define the column names; string in python: vectorized operation 
keys.columns = keys.columns + '_'

### 3) Merge back to the original table 
dairy_table = pd.concat([dairy_table, keys], axis=1)

In [14]:
#==== 6. Groupby ====## 
new_food_col = dairy_table.groupby(['Measure_', 'Grams_', 'Calories_'], as_index=False)['Food']
                    .apply(' '.join).reset_index(drop=True) 

### Drop the old Food column and add the new 
dairy_table = dairy_table.drop(columns='Food').drop_duplicates().reset_index(drop=True)
dairy_table.insert(0, column='Food', value=new_food_col)
dairy_table = dairy_table[original_cols]