In [None]:
###########################################################################

In [None]:
# Introduction

# This Jupyter notebook is part of your learning experience in the study of central tendency
# You will work with a simple data set that contains guest details for a buffet

# In this exercise, you will perform the following tasks:
# 1 - Load and study the data
# 2 - View the distributions of the various features in the data set and calculate their central tendencies
# 3 - Create a new Pandas Series that contains the details of the representative meal for the buffet

In [None]:
###########################################################################

In [None]:
# Task 1 - Load and study the data

# Load the data and study its features such as:
# The number of guests
# The number of features
# The types of features

In [None]:
# Load "numpy" and "pandas" for manipulating numbers and data frames
# Load "matplotlib.pyplot" and "seaborn" for data visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the guest records from "Buffet_Details.csv" into a Pandas Data Frame named "df"
# The first column of the file is used as the row index of the data frame
# Note: Keep this notebook and the CSV file in the same folder, or adjust the path below
df = pd.read_csv(
    'Buffet_Details.csv',
    index_col=0,
)

In [None]:
# Take a brief look at the data using ".head()"
########## CODE HERE ##########

In [None]:
# Read the plain-text description of the features and print it
# Note: Keep this notebook and the description file in the same folder, or adjust the path below
with open('Buffet_Details_Feature_Description.txt', mode='r') as description_file:
    # Read the whole file while it is still open; it is closed automatically on leaving the "with" block
    description_text = description_file.read()
print(description_text)

In [None]:
# Get the dimensions of the data frame using ".shape"
########## CODE HERE ##########

In [None]:
# Get the row names of the data frame using ".index"
########## CODE HERE ##########

In [None]:
# Get the column names of the data frame using ".columns"
########## CODE HERE ##########

In [None]:
# Look at basic information about the data frame using ".info()"
########## CODE HERE ##########

In [None]:
# Observations

# There are 32 rows and 4 columns in the data
# Each row contains the details of the meal previously selected by a guest

# The features in the data set are:
# The names of the guests
# Their respective ages
# The amount of money they spent on their previous meal
# The type of cuisine of their previous meal

In [None]:
###########################################################################

In [None]:
# Task 2 - View the distributions of the various features in the data set and calculate their central tendencies

# We will now look at the distributions of the various features in the data set
# We will also calculate appropriate measures of central tendency for these features

In [None]:
# Plot the distribution of the "Age" feature as a histogram with 5 bins
fig, ax = plt.subplots(figsize=(6, 3), dpi=100)
sns.histplot(
    data=df,
    x='Age',
    bins=5,
    color='orange',
    edgecolor='linen',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Histogram of Age of Guests')
ax.set_xlabel('Age in years')
ax.set_ylabel('Count');
# We observe that most of the data is concentrated on the left side of the histogram (the younger ages)
# This indicates that most of the guests are not too old

In [None]:
# Calculate the mean of the "Age" feature using ".mean()"
########## CODE HERE ##########

In [None]:
# Calculate the median of the "Age" feature using ".median()"
########## CODE HERE ##########

In [None]:
# Create a histogram of the "Age" feature and mark its mean and median with vertical lines
plt.figure(figsize = (6, 3), dpi= 100)
sns.histplot(data = df, x = 'Age', color = 'orange', edgecolor = 'linen', alpha = 0.5, bins = 5)
plt.title('Histogram of Age of Guests')
plt.xlabel('Age in years')
plt.ylabel('Count')
# "axvline" draws a line across the full height of the axes, so no hard-coded y-limits are needed
# and the y-axis stays scaled to the histogram itself
plt.axvline(x = df['Age'].mean(), color = 'blue', label = 'Mean')
plt.axvline(x = df['Age'].median(), color = 'red', label = 'Median')
plt.legend();
# Since there are a few guests who are quite old, the mean is pushed more towards the right
# Therefore, the median seems to be a better indicator of the representative age of the group
# In fact, very few guests have age equal to the mean in this data set

In [None]:
# Plot the distribution of the "Expenditure" feature as a histogram with 5 bins
fig, ax = plt.subplots(figsize=(6, 3), dpi=100)
sns.histplot(
    data=df,
    x='Expenditure',
    bins=5,
    color='orange',
    edgecolor='linen',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Histogram of Expenditure of Guests')
ax.set_xlabel('Expenditure in rupees')
ax.set_ylabel('Count');
# We observe that this histogram is fairly evenly spread, with the data leaning only a little towards the left

In [None]:
# Calculate the mean of the "Expenditure" feature using ".mean()"
########## CODE HERE ##########

In [None]:
# Calculate the median of the "Expenditure" feature using ".median()"
########## CODE HERE ##########

In [None]:
# Create a histogram of the "Expenditure" feature and mark its mean and median with vertical lines
plt.figure(figsize = (6, 3), dpi= 100)
sns.histplot(data = df, x = 'Expenditure', color = 'orange', edgecolor = 'linen', alpha = 0.5, bins = 5)
plt.title('Histogram of Expenditure of Guests')
plt.xlabel('Expenditure in rupees')
plt.ylabel('Count')
# "axvline" draws a line across the full height of the axes, so no hard-coded y-limits are needed
# and the y-axis stays scaled to the histogram itself
plt.axvline(x = df['Expenditure'].mean(), color = 'blue', label = 'Mean')
plt.axvline(x = df['Expenditure'].median(), color = 'red', label = 'Median')
plt.legend();
# The mean and the median are quite close and the difference between them is negligible
# We can safely choose the mean as the measure of the central tendency here

In [None]:
# Draw a count plot showing how many guests chose each category of the "Cuisine" feature
fig, ax = plt.subplots(figsize=(6, 3), dpi=100)
sns.countplot(x='Cuisine', data=df, ax=ax)
ax.set_title('Count Plot of Cuisine Preferences of Guests')
ax.set_xlabel('Cuisine')
ax.set_ylabel('Count');
# The count plot makes it quite clear that the Indian cuisine is the most popular cuisine for this group of guests

In [None]:
# Count the number of occurrences of different categories of the "Cuisine" feature using ".value_counts()"
########## CODE HERE ##########

In [None]:
# Calculate the mode of the "Cuisine" feature using ".value_counts()"
# Note: Grab the first index of the series obtained by using ".value_counts()" and ".index[0]"
########## CODE HERE ##########

In [None]:
# Observations

# We saw the distributions of the various features in the data set using appropriate plots
# We decided on different central tendency measures for each of these features
# The median should be chosen instead of the mean for the "Age" feature as the mean is pushed to a higher value because of a few older guests
# The mean and the median for the "Expenditure" feature were similar and we can choose the mean in this case
# The mode of the "Cuisine" feature can be chosen as a representative value

In [None]:
###########################################################################

In [None]:
# Task 3 - Create a new Pandas Series that contains the details of the representative meal for the buffet

# We will now create a Pandas Series that contains the representative values for each of the features

In [None]:
# Create a new Pandas Series called "rep_meal" that contains the details of the representative meal for the buffet
# Note: The "index" parameter of the series needs to be a list of the relevant feature names
# Note: The "data" parameter of the series needs to be a list of the relevant values
########## CODE HERE ##########

In [None]:
# Print the "rep_meal" series
########## CODE HERE ##########

In [None]:
# Observations

# The representative meal for the buffet is as follows:
# The median age of the group of guests would be 27.5
# The mean price per meal would be around 1700 rupees
# The type of cuisine would be Indian

In [None]:
###########################################################################

In [None]:
# Conclusions

# From the given data, we can use simple visualisations to get a sense of how data are distributed
# We can use various measures of central tendency such as mean, median and mode to represent a group of observations
# The type of central tendency measure to use depends on the type and the distribution of the data

In [None]:
###########################################################################