# Load the dataset
- Load the Tips dataset using pandas.


In [None]:
import pandas as pd
# Read the Tips CSV into a DataFrame, then echo it as the cell's output.
tips_csv_path = "/content/tips.csv"
df = pd.read_csv(tips_csv_path)
df

# Basic data exploration
- Display the first 5 rows of the dataset.
- Display the summary statistics of the dataset.
- Check for missing values in the dataset.

In [None]:
# Preview the first five rows (five is head()'s default row count).
df.head()

In [None]:
# Summary statistics of the dataset, as produced by describe().
summary_stats = df.describe()
summary_stats

In [None]:
# Count the missing values in each column. A bare df.isnull() only yields a
# row-by-row boolean mask, which does not reveal whether anything is missing.
df.isnull().sum()

# Data selection
- Select the columns 'total_bill', 'tip', and 'sex'.
- Select the rows where the 'total_bill' is greater than $20.
- Select the rows where 'day' is 'Sun' and 'time' is 'Dinner'.


In [None]:
# Select the columns 'total_bill', 'tip', and 'sex'.
# Name the columns explicitly rather than slicing 'total_bill':'sex', which
# only returns these three while they happen to sit next to each other.
df[["total_bill", "tip", "sex"]]

In [None]:
# Keep only the rows whose 'total_bill' exceeds 20.
over_twenty = df["total_bill"].gt(20)
df.loc[over_twenty]

In [None]:
# Select the rows where 'day' is 'Sun' and 'time' is 'Dinner'.
is_sunday = df["day"].eq("Sun")
is_dinner = df["time"].eq("Dinner")
df.loc[is_sunday & is_dinner]

# Grouping and aggregation:

- Calculate the average total bill for each day.
- Calculate the sum of tips for each day.
- Find the maximum total bill for each combination of day and time.

In [None]:
# Average total bill for each day; the per-day grouping is kept in `gk`.
gk = df.groupby(by="day")
avg_bill_per_day = gk["total_bill"].mean()
avg_bill_per_day

In [None]:
# Total of the tips for each day, using the per-day grouping `gk`.
tips_per_day = gk["tip"].sum()
tips_per_day

In [None]:
# Largest total bill for every (day, time) combination.
ggk = df.groupby(by=["day", "time"])
max_bill_per_day_time = ggk["total_bill"].max()
max_bill_per_day_time

# Data transformation:

- Add a new column 'tip_percentage', which is the tip divided by the total bill, multiplied by 100 (i.e. tip / total_bill * 100).
- Create a new column 'bill_per_person' which is the total bill divided by the number of people ('size').

In [None]:
# Add a new column 'tip_percentage': the tip as a percentage of the total bill.
df["tip_percentage"] = (df["tip"] / df["total_bill"]) * 100
# Create a new column 'bill_per_person': the total bill divided by the number
# of people ('size'). The column must be read as df["size"]; the attribute
# df.size is the DataFrame's element count (rows * columns), not the column.
# Plain assignment appends the column and, unlike df.insert(), can be re-run
# without raising because the column already exists.
df["bill_per_person"] = df["total_bill"] / df["size"]
df

# Filtering:

- Filter the rows where 'tip_percentage' is greater than 15%.
- Filter the rows where 'total_bill' is among the 10 highest bills.

In [None]:
# Filter the rows where 'tip_percentage' is greater than 15%.
filtered = df[df["tip_percentage"] > 15]
# A cell only echoes its last expression, so print this result explicitly;
# previously it was computed and never shown.
print(filtered)
# Filter the rows where 'total_bill' is among the 10 highest bills.
total_bill_sorted = df.sort_values(by="total_bill", ascending=False).head(10)
total_bill_sorted

# Sorting:

- Sort the dataset by 'total_bill' in descending order.

In [None]:
# Order every row from the highest 'total_bill' to the lowest.
total_bill_sorted_df = df.sort_values("total_bill", ascending=False)
total_bill_sorted_df

# Handling missing data:

- Replace any missing values in the 'tip' column with the mean of the 'tip' column.
- Drop any rows where the 'size' is missing.

In [None]:
# Replace any missing values in the 'tip' column with the mean of the 'tip' column.
mean = df["tip"].mean()
df["tip"] = df["tip"].fillna(value=mean)
# Drop any rows where the 'size' is missing. dropna() returns a new DataFrame
# rather than modifying df, so the result has to be assigned back.
df = df.dropna(subset=["size"])
df