Feature Engineering Notebook

In [211]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [212]:
# Location of the tips CSV. NOTE(review): this is an absolute path (looks like
# a Colab working directory) — adjust it when running the notebook elsewhere.
TIPS_CSV_PATH = "/content/tips.csv"
df = pd.read_csv(TIPS_CSV_PATH)

In [213]:
# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [214]:
# Preview the last five rows (checks the file was read through to the end).
df.tail(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [215]:
# Total number of cells in the frame (rows x columns).
df.size

1708

In [216]:
# (rows, columns) of the loaded frame.
df.shape

(244, 7)

In [217]:
# Column dtypes and non-null counts; shows which columns are numeric and which
# are object-typed (the latter are encoded later in the notebook).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [218]:
# Count missing values per column.
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [219]:
# Exploratory plots of the raw (unscaled, unencoded) tips data.
# `px` is plotly.express, imported in the notebook's import cell.

# Scatter plot: Total Bill vs Tip, colored by Time
fig = px.scatter(df, x='total_bill', y='tip', color='time',
                 labels={
                     'total_bill': 'Total Bill',
                     'tip': 'Tip',
                     'time': 'Time'
                 },
                 title='Scatter Plot of Total Bill vs Tip Colored by Time')
fig.show()

# Bar chart: Average Total Bill for Each Day
# reset_index() turns the grouped Series back into a two-column frame so that
# 'day' can be referenced by name as the x axis.
avg_total_bill = df.groupby('day')['total_bill'].mean().reset_index()

fig = px.bar(avg_total_bill, x='day', y='total_bill',
             labels={
                 'total_bill': 'Average Total Bill',
                 'day': 'Day'
             },
             title='Average Total Bill for Each Day')
fig.show()

# Box plot: Distribution of Total Bill by Day
fig = px.box(df, x='day', y='total_bill',
             labels={
                 'total_bill': 'Total Bill',
                 'day': 'Day'
             },
             title='Box Plot of Total Bill by Day')
fig.show()

# Histogram: Distribution of Tip
# (the title previously read "Distribution of df", which did not name the
# plotted column)
fig = px.histogram(df, x='tip',
                   labels={
                       'tip': 'Tip'
                   },
                   title='Distribution of Tip')
fig.show()


In [220]:
# Partition the columns by dtype: numeric ones are scaled, string-like ones
# are integer-encoded.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Z-score every numeric column. This includes 'tip' and 'size', not only
# 'total_bill'.
# NOTE(review): df is overwritten in place, so the raw values shown by earlier
# cells are no longer available afterwards, and re-running this cell on its own
# would likely pick up the already-encoded columns as numeric and scale them
# too. Re-run from the load cell instead.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

# One LabelEncoder per categorical column, kept in a dict keyed by column name
# so the integer codes can be mapped back to the original labels.
# NOTE(review): label encoding assigns arbitrary integer codes; for a
# multi-valued nominal column such as 'day' the implied ordering is not
# meaningful — confirm this is acceptable for the downstream analysis.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Show the transformed frame
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,-0.314711,-1.439947,0,0,2,0,-0.600193
1,-1.063235,-0.969205,1,0,2,0,0.453383
2,0.13778,0.363356,1,0,2,0,0.453383
3,0.438315,0.225754,1,0,2,0,-0.600193
4,0.540745,0.44302,0,0,2,0,1.506958


In [221]:
# Minimum absolute correlation with 'tip' for a column to appear in the heatmap.
# 0.1 is a loose cutoff: it only drops columns that are essentially unrelated.
TIP_CORR_THRESHOLD = 0.1

# Pairwise correlations across all columns of df (every column is numeric at
# this point because the categorical ones were label-encoded above).
correlation_matrix = df.corr()

# Keep the columns whose |correlation with 'tip'| exceeds the threshold.
# 'tip' itself is retained (self-correlation is 1.0), so it appears on the axes.
tip_corr = correlation_matrix['tip']
selected_features = tip_corr[tip_corr.abs() > TIP_CORR_THRESHOLD].index.tolist()

# Create a DataFrame with selected features
selected_data = df[selected_features]

# Compute the correlation matrix of the selected columns once and reuse it for
# both the heatmap and its annotations (it used to be recomputed for every
# annotated cell).
selected_corr = selected_data.corr()

# Generate the heatmap using Plotly
fig = px.imshow(selected_corr,
                labels=dict(color="Correlation"),
                x=selected_corr.columns,
                y=selected_corr.columns,
                title='Correlation Matrix of Selected Features')

# One text annotation per heatmap cell, showing the value to two decimals.
annotations = [
    dict(x=col, y=row, text=f"{selected_corr.at[row, col]:.2f}",
         showarrow=False, font=dict(size=12))
    for row in selected_corr.index
    for col in selected_corr.columns
]

fig.update_layout(
    annotations=annotations,
    xaxis_title="Features",
    yaxis_title="Features",
    width=800,
    height=600,
)

fig.show()
