# Question 1

In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
# Load the tips dataset (a CSV expected in the working directory) into the
# notebook-wide frame `df` that every later cell reads from.
# NOTE(review): the 'Unnamed: 0' column reported by df.info() looks like a row
# index written out when the CSV was saved; consider index_col=0 - confirm.
tips_csv = 'tips.csv'
df = pd.read_csv(tips_csv)

In [4]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare: wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  244 non-null    int64  
 1   total_bill  205 non-null    float64
 2   tip         244 non-null    float64
 3   sex         244 non-null    object 
 4   smoker      244 non-null    object 
 5   day         244 non-null    object 
 6   time        244 non-null    object 
 7   size        244 non-null    int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 15.4+ KB
None


In [5]:
# Count the missing entries in every column before any imputation is applied.
missing_per_column = df.isna().sum()
print(missing_per_column)

Unnamed: 0     0
total_bill    39
tip            0
sex            0
smoker         0
day            0
time           0
size           0
dtype: int64


In [6]:
# Numerical columns: 'total_bill' and 'tip'.
# Missing values are imputed with the mean of the respective column.
numeric_cols = ['total_bill', 'tip']

# Categorical columns: 'sex', 'smoker', 'day', 'time'.
# Missing values are imputed with the mode (most frequent value) of each column.
categorical_cols = ['sex', 'smoker', 'day', 'time']

# Build a single {column: fill value} mapping and apply it with one
# DataFrame-level fillna. Avoid `df[col].fillna(..., inplace=True)`: it mutates
# an intermediate Series, which pandas >= 2.1 warns about and which leaves `df`
# unchanged once Copy-on-Write is enabled.
fill_values = {col: df[col].mean() for col in numeric_cols}
fill_values.update({col: df[col].mode().iloc[0] for col in categorical_cols})
df = df.fillna(value=fill_values)

In [7]:
# Re-count missing entries per column to confirm the imputation left none behind.
print("\nMissing values after handling:", df.isna().sum(), sep="\n")


Missing values after handling:
Unnamed: 0    0
total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64


# Question 2

In [8]:
# Categorical columns: 'sex', 'smoker', 'day', 'time'
# NOTE: this cell consumes those columns (they are dropped at the end), so it
# can only be re-run after the cells that load and impute `df` are re-run.

# Label Encoding: a freshly fitted encoder per column, kept in `label_encoders`
# so each column's fitted classes_ remain available (e.g. for inverse_transform).
label_encoders = {}
for col in ['sex', 'smoker']:
    label_encoder = LabelEncoder()
    df[f'{col}_label_encoded'] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# One-Hot Encoding: dtype=int keeps the indicator columns as 0/1 integers;
# pandas >= 2.0 would otherwise emit booleans.
one_hot_encoded = pd.get_dummies(df[['day', 'time']], dtype=int)
df = pd.concat([df, one_hot_encoded], axis=1)

# Ordinal Encoding
ordinal_mapping = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3}
df['day_ordinal_encoded'] = df['day'].map(ordinal_mapping)

# .map() yields NaN for any label missing from the mapping; fail loudly rather
# than let silent NaNs flow into the later cells.
unmapped_days = df.loc[df['day_ordinal_encoded'].isna(), 'day'].unique()
assert len(unmapped_days) == 0, f"Unmapped 'day' values: {unmapped_days.tolist()}"

# Dropping original categorical columns after encoding
df = df.drop(columns=['sex', 'smoker', 'day', 'time'])

print(df)

     Unnamed: 0  total_bill   tip  size  sex_label_encoded  \
0             0   16.990000  1.01     2                  0   
1             1   21.360195  1.66     3                  1   
2             2   21.010000  3.50     3                  1   
3             3   23.680000  3.31     2                  1   
4             4   24.590000  3.61     4                  0   
..          ...         ...   ...   ...                ...   
239         239   29.030000  5.92     3                  1   
240         240   27.180000  2.00     2                  0   
241         241   22.670000  2.00     2                  1   
242         242   17.820000  1.75     2                  1   
243         243   18.780000  3.00     2                  0   

     smoker_label_encoded  day_Fri  day_Sat  day_Sun  day_Thur  time_Dinner  \
0                       0        0        0        1         0            1   
1                       0        0        0        1         0            1   
2                 

# Question 3

In [33]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Numeric features to rescale; defined once so every scaler sees the same columns.
scale_cols = ['total_bill', 'tip', 'size']
features = df[scale_cols]

# Min-Max Normalization
# index=df.index keeps the scaled rows labelled like `df`, so the axis=1 concat
# below aligns row-for-row instead of relying on `df` having a default index.
min_max_scaler = MinMaxScaler()
tips_min_max_scaled = pd.DataFrame(
    min_max_scaler.fit_transform(features),
    columns=[f'{col}_mm' for col in scale_cols],
    index=df.index,
)

# Standardization (Z-score) via scikit-learn, which divides by the population
# standard deviation (ddof=0).
standard_scaler = StandardScaler()
tips_standard_scaled = pd.DataFrame(
    standard_scaler.fit_transform(features),
    columns=[f'{col}_std' for col in scale_cols],
    index=df.index,
)

# Z-score scaling computed by hand. pandas' .std() defaults to the sample
# standard deviation (ddof=1), so these values differ from the *_std columns
# by a constant factor of sqrt((n - 1) / n).
z_score_scaled = (features - features.mean()) / features.std()
z_score_scaled.columns = [f'{col}_z' for col in scale_cols]

# Concatenating the scaled features to the original DataFrame
tips_scaled = pd.concat(
    [df, tips_min_max_scaled, tips_standard_scaled, z_score_scaled],
    axis=1,
)

print(tips_scaled)

     Unnamed: 0  total_bill   tip  size  sex_label_encoded  \
0             0   16.990000  1.01     2                  0   
1             1   21.360195  1.66     3                  1   
2             2   21.010000  3.50     3                  1   
3             3   23.680000  3.31     2                  1   
4             4   24.590000  3.61     4                  0   
..          ...         ...   ...   ...                ...   
239         239   29.030000  5.92     3                  1   
240         240   27.180000  2.00     2                  0   
241         241   22.670000  2.00     2                  1   
242         242   17.820000  1.75     2                  1   
243         243   18.780000  3.00     2                  0   

     smoker_label_encoded  day_Fri  day_Sat  day_Sun  day_Thur  ...  \
0                       0        0        0        1         0  ...   
1                       0        0        0        1         0  ...   
2                       0        0        

# Question 4

In [35]:
# Per-row tip expressed as a percentage of that row's total bill.
df['tip_percentage'] = df['tip'].div(df['total_bill']).mul(100)

# Mean tip percentage for each dining party size, as a two-column lookup table
# ('size', 'avg_tip_percentage').
average_tip_percentage_by_size = (
    df.groupby('size')['tip_percentage']
    .mean()
    .rename('avg_tip_percentage')
    .reset_index()
)

# Attach each row's party-size average via a left join on 'size', which keeps
# every row of `df`.
tips_with_avg_tip_percentage = df.merge(
    average_tip_percentage_by_size,
    how='left',
    on='size',
)

print(tips_with_avg_tip_percentage)

     Unnamed: 0  total_bill   tip  size  sex_label_encoded  \
0             0   16.990000  1.01     2                  0   
1             1   21.360195  1.66     3                  1   
2             2   21.010000  3.50     3                  1   
3             3   23.680000  3.31     2                  1   
4             4   24.590000  3.61     4                  0   
..          ...         ...   ...   ...                ...   
239         239   29.030000  5.92     3                  1   
240         240   27.180000  2.00     2                  0   
241         241   22.670000  2.00     2                  1   
242         242   17.820000  1.75     2                  1   
243         243   18.780000  3.00     2                  0   

     smoker_label_encoded  day_Fri  day_Sat  day_Sun  day_Thur  time_Dinner  \
0                       0        0        0        1         0            1   
1                       0        0        0        1         0            1   
2                 

# Question 5

In [36]:
# Rows with a bill above 10 and a tip above 3 are labelled
# 'Highest-bills-with-tips'; every other row is labelled 'Normal-bills'.
is_high_bill_with_tip = df['total_bill'].gt(10) & df['tip'].gt(3)
df['bill_type'] = is_high_bill_with_tip.map(
    {True: 'Highest-bills-with-tips', False: 'Normal-bills'}
)

print(df)

     Unnamed: 0  total_bill   tip  size  sex_label_encoded  \
0             0   16.990000  1.01     2                  0   
1             1   21.360195  1.66     3                  1   
2             2   21.010000  3.50     3                  1   
3             3   23.680000  3.31     2                  1   
4             4   24.590000  3.61     4                  0   
..          ...         ...   ...   ...                ...   
239         239   29.030000  5.92     3                  1   
240         240   27.180000  2.00     2                  0   
241         241   22.670000  2.00     2                  1   
242         242   17.820000  1.75     2                  1   
243         243   18.780000  3.00     2                  0   

     smoker_label_encoded  day_Fri  day_Sat  day_Sun  day_Thur  time_Dinner  \
0                       0        0        0        1         0            1   
1                       0        0        0        1         0            1   
2                 