In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


data = pd.read_csv("/content/tips.csv")

print(data.head())

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [7]:
#Cleaning
# Check for missing values
print("Missing Values:\n", data.isnull().sum())

# Remove outliers using IQR (Interquartile Range)
Q1 = data["tip"].quantile(0.25)
Q3 = data["tip"].quantile(0.75)
IQR = Q3 - Q1

# Define outlier range
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print("\nOutlier:\n")
# Filter out outliers
tips_cleaned = data[(data["tip"] >= lower_bound) & (data["tip"] <= upper_bound)]
print(f"Original Data: {len(data)} rows, Cleaned Data: {len(tips_cleaned)} rows")

Missing Values:
 total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

Outlier:

Original Data: 244 rows, Cleaned Data: 235 rows


In [9]:
# One-hot encode categorical variables
data_encoded = pd.get_dummies(tips_cleaned, columns=["sex", "smoker", "day", "time"], drop_first=True)

print("One-hot Encoded Format:\n")
print(data_encoded.head())

One-hot Encoded Format:

   total_bill   tip  size  sex_Male  smoker_Yes  day_Sat  day_Sun  day_Thur  \
0       16.99  1.01     2     False       False    False     True     False   
1       10.34  1.66     3      True       False    False     True     False   
2       21.01  3.50     3      True       False    False     True     False   
3       23.68  3.31     2      True       False    False     True     False   
4       24.59  3.61     4     False       False    False     True     False   

   time_Lunch  
0       False  
1       False  
2       False  
3       False  
4       False  


In [14]:

# Define numerical columns to scale
num_features = ["total_bill", "size"]

# Apply standard scaling
scaler = StandardScaler()
data_encoded[num_features] = scaler.fit_transform(data_encoded[num_features])
print("\n Scaled Data:\n")
print(data_encoded.head())



 Scaled Data:

   total_bill   tip      size  sex_Male  smoker_Yes  day_Sat  day_Sun  \
0   -0.261408  1.01 -0.567497     False       False    False     True   
1   -1.076452  1.66  0.525633      True       False    False     True   
2    0.231294  3.50  0.525633      True       False    False     True   
3    0.558537  3.31 -0.567497      True       False    False     True   
4    0.670070  3.61  1.618762     False       False    False     True   

   day_Thur  time_Lunch  
0     False       False  
1     False       False  
2     False       False  
3     False       False  
4     False       False  


In [12]:
# Define features (X) and target variable (y)
X = data_encoded.drop("tip", axis=1)
y = data_encoded["tip"]

#First split → 70% training, 30% temporary (validation + test).
#Second split → The temporary 30% is split into 15% validation and 15% testing.

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print("\n Splitting Data:\n")
print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}, Validation Set: {X_val.shape}")


 Splitting Data:

Training Set: (164, 8), Testing Set: (36, 8), Validation Set: (35, 8)
