In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
df=pd.read_csv('Dataset1- Raw_Cocktail_Data.csv',index_col=0)
df=df.reset_index(drop=True)
df.head(8)


Unnamed: 0,cocktail_name,calories_per_serving,sugar,description
0,Sex on the beach cocktail,92,8g,ice50ml vodka Vodka vod-kaOriginally associate...
1,Mojito recipe,158,4.3g,"juice of 1 lime Lime ly-mThe same shape, but s..."
2,Caipirinha,245,30g,"2 limes, chopped into wedges, plus extra lime ..."
3,Cuba Libre,159,10g,"½ lime Lime ly-mThe same shape, but smaller th..."
4,Long Island iced tea,212,16g,50ml vanilla vodka (we used Absolut Vanilia) V...
5,Bloody Mary recipe,160,8g,large handful of ice100ml vodka Vodka vod-kaOr...
6,Mai tai,284,16.6g,2 tbsp white rum Rum rhumRum is a type of spir...
7,Piña colada,314,13.6g,120ml pineapple juice Pineapple pine-ap-pelWit...


## Data Cleaning and Feature Engineering

### Agenda: 

##### column 'cocktail name':

    1)remove 'cocktail', 'recipe'
    2)write cocktail name in capital letters
    
##### column 'sugar':
    1)rename column: 'sugar in g'
    2)remove 'g'
    
##### column 'description' (select 18 ingredients):
    -pineapple     -cranberry                                                                           
    -rum           -cachaça                         
    -coconut       -cola                      
    -mint          -gin                  
    -lime          -tequila                         
    -lemon         -tomato                       
    -vodka         -pepper                         
    -peach         -grenadine                            
    -orange        -cherry                          

In [29]:
#remove 'cocktail', 'recipe'
def modify_name(x):
    if 'cocktail' in x:
        return x.strip('cocktail')
    if 'recipe' in x:
        return x.strip('recipe')
    else:
        return x

In [31]:
#apply modify_name function
df['cocktail_name']=df['cocktail_name'].apply(modify_name)

In [34]:
#cocktail names in capital letters
df['cocktail_name']=[x.upper() for x in df['cocktail_name']]

In [35]:
#check the dataframe
df.head()

Unnamed: 0,cocktail_name,calories_per_serving,sugar,description
0,SEX ON THE BEACH,92,8g,ice50ml vodka Vodka vod-kaOriginally associate...
1,MOJITO,158,4.3g,"juice of 1 lime Lime ly-mThe same shape, but s..."
2,CAIPIRINHA,245,30g,"2 limes, chopped into wedges, plus extra lime ..."
3,CUBA LIBRE,159,10g,"½ lime Lime ly-mThe same shape, but smaller th..."
4,LONG ISLAND ICED TEA,212,16g,50ml vanilla vodka (we used Absolut Vanilia) V...


In [36]:
#rename 'sugar' column
df['sugar in g']=df['sugar']

In [39]:
#remove 'g' in 'sugar in g' column
df['sugar in g']=[x.strip('g') for x in df['sugar in g']]

In [42]:
#drop the old 'sugar' column
df.drop('sugar', axis=1, inplace=True)

In [43]:
#check the dataframe
df.head()

Unnamed: 0,cocktail_name,calories_per_serving,description,sugar in g
0,SEX ON THE BEACH,92,ice50ml vodka Vodka vod-kaOriginally associate...,8.0
1,MOJITO,158,"juice of 1 lime Lime ly-mThe same shape, but s...",4.3
2,CAIPIRINHA,245,"2 limes, chopped into wedges, plus extra lime ...",30.0
3,CUBA LIBRE,159,"½ lime Lime ly-mThe same shape, but smaller th...",10.0
4,LONG ISLAND ICED TEA,212,50ml vanilla vodka (we used Absolut Vanilia) V...,16.0


In [46]:
# Create a column for each mentioned ingredient above

df['Pineapple']=df['description'].apply(lambda x: 1 if 'pineapple' in x.lower() else 0)
df['Rum']=df['description'].apply(lambda x: 1 if 'rum' in x.lower() else 0)
df['Coconut']=df['description'].apply(lambda x: 1 if 'coconut' in x.lower() else 0)
df['Mint']=df['description'].apply(lambda x: 1 if 'mint' in x.lower() else 0)
df['Lime']=df['description'].apply(lambda x: 1 if 'lime' in x.lower() else 0)
df['Lemon']=df['description'].apply(lambda x: 1 if 'lemon' in x.lower() else 0)
df['Vodka']=df['description'].apply(lambda x: 1 if 'vodka' in x.lower() else 0)
df['Peach']=df['description'].apply(lambda x: 1 if 'peach' in x.lower() else 0)
df['Orange']=df['description'].apply(lambda x: 1 if 'orange' in x.lower() else 0)
df['Cachaça']=df['description'].apply(lambda x: 1 if 'cachaça' in x.lower() or 'cachaca' in x.lower() else 0)
df['Cola']=df['description'].apply(lambda x: 1 if 'cola' in x.lower() else 0)
df['Gin']=df['description'].apply(lambda x: 1 if 'gin' in x.lower() else 0)
df['Tequila']=df['description'].apply(lambda x: 1 if 'tequila' in x.lower() else 0)
df['Tomato']=df['description'].apply(lambda x: 1 if 'tomato' in x.lower() else 0)
df['Pepper']=df['description'].apply(lambda x: 1 if 'pepper' in x.lower() else 0)
df['Grenadine']=df['description'].apply(lambda x: 1 if 'grenadine' in x.lower() else 0)
df['Cherry']=df['description'].apply(lambda x: 1 if 'cherry' in x.lower() else 0)

In [62]:
df['Cherry'].value_counts()

0    7
1    1
Name: Cherry, dtype: int64

In [63]:
#dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   cocktail_name         8 non-null      object
 1   calories_per_serving  8 non-null      int64 
 2   description           8 non-null      object
 3   sugar in g            8 non-null      object
 4   Pineapple             8 non-null      int64 
 5   Rum                   8 non-null      int64 
 6   Coconut               8 non-null      int64 
 7   Mint                  8 non-null      int64 
 8   Lime                  8 non-null      int64 
 9   Lemon                 8 non-null      int64 
 10  Vodka                 8 non-null      int64 
 11  Peach                 8 non-null      int64 
 12  Orange                8 non-null      int64 
 13  Cachaça               8 non-null      int64 
 14  Cola                  8 non-null      int64 
 15  Gin                   8 non-null      int64 

In [64]:
#check data frame
df.head()

Unnamed: 0,cocktail_name,calories_per_serving,description,sugar in g,Pineapple,Rum,Coconut,Mint,Lime,Lemon,...,Peach,Orange,Cachaça,Cola,Gin,Tequila,Tomato,Pepper,Grenadine,Cherry
0,SEX ON THE BEACH,92,ice50ml vodka Vodka vod-kaOriginally associate...,8.0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
1,MOJITO,158,"juice of 1 lime Lime ly-mThe same shape, but s...",4.3,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,CAIPIRINHA,245,"2 limes, chopped into wedges, plus extra lime ...",30.0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,CUBA LIBRE,159,"½ lime Lime ly-mThe same shape, but smaller th...",10.0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,LONG ISLAND ICED TEA,212,50ml vanilla vodka (we used Absolut Vanilia) V...,16.0,0,1,0,0,1,0,...,0,0,0,1,1,1,0,0,0,0


In [69]:
#save the dataframe in a csv file
df.to_csv('Dataset2- Cleaned_Cocktail_Data.csv')