### Data cleaning and preprocessing with Pandas


In [2]:
import pandas as pd

In [3]:
cereal_data = pd.read_csv('cereal.csv')

In [4]:
cereal_data.head(3)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505


In [16]:
basic_stats = cereal_data.describe()
basic_stats

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.597403,6.922078,96.077922,28.246753,2.207792,1.02961,0.821039,42.665705
std,19.484119,1.09479,1.006473,83.832295,2.383364,4.278956,4.444885,71.286813,22.342523,0.832524,0.150477,0.232716,14.047289
min,50.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,1.0,0.5,0.25,18.042851
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,40.0,25.0,1.0,1.0,0.67,33.174094
50%,110.0,3.0,1.0,180.0,2.0,14.0,7.0,90.0,25.0,2.0,1.0,0.75,40.400208
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0,25.0,3.0,1.0,1.0,50.828392
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0,3.0,1.5,1.5,93.704912


#### Identify missing values in the DataFrame

In [17]:
missing_values = cereal_data.isnull().sum()
missing_values

name        0
mfr         0
type        0
calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       0
sugars      0
potass      0
vitamins    0
shelf       0
weight      0
cups        0
rating      0
dtype: int64

#### Drop rows and columns with any missing values

In [22]:
dropped_rows_df = cereal_data.dropna()
dropped_columns_df = cereal_data.dropna(axis=1)
# all data nothing is missing 
dropped_rows_df 
dropped_columns_df

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193


#### Fill missing values

In [27]:
filled_values_df = cereal_data.fillna(0) #with 0
# Forward and Backward fill method
ffill_df = cereal_data.fillna(method='ffill')
bfill_df = cereal_data.fillna(method='bfill')


In [29]:
# Interpolate missing values
interpolated_df = cereal_data.interpolate()
interpolated_df

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193


In [31]:
# Convert the 'calories' column to float
cereal_data['calories'] = cereal_data['calories'].astype(float)
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70.0,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120.0,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679
2,All-Bran,K,C,70.0,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50.0,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912
4,Almond Delight,R,C,110.0,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110.0,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110.0,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301
74,Wheat Chex,R,C,100.0,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100.0,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193


In [5]:
# Apply a lambda function to transform the values of the 'calories' column I multiplied by 2
transformed_df = cereal_data.copy()
transformed_df['calories101'] = transformed_df['calories'].apply(lambda x: x * 2)

In [6]:
# Min-Max scaling for the 'calories' column
cereal_data['calories_minmax'] = (cereal_data['calories'] - cereal_data['calories'].min()) / (cereal_data['calories'].max() - cereal_data['calories'].min())
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,calories_minmax
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,0.181818
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679,0.636364
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,0.181818
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912,0.000000
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,0.545455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174,0.545455
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301,0.545455
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445,0.454545
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193,0.454545


In [7]:
# Z-score normalization for the calories column
cereal_data['calories_zscore'] = (cereal_data['calories'] - cereal_data['calories'].mean()) / cereal_data['calories'].std()
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,0.181818,-1.892984
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679,0.636364,0.673209
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,0.181818,-1.892984
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912,0.000000,-2.919461
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,0.545455,0.159970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174,0.545455,0.159970
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301,0.545455,0.159970
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445,0.454545,-0.353268
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193,0.454545,-0.353268


In [9]:
# Identify duplicate rows
duplicate_rows = cereal_data[cereal_data.duplicated()]
duplicate_rows # no duplicates

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore


In [12]:
# Drop duplicate rows
dropped_duplicates_df = cereal_data.drop_duplicates()
dropped_duplicates_df

# Drop duplicate rows of 'name' column
dropped_duplicates_name_df = cereal_data.drop_duplicates(subset=['name'])
dropped_duplicates_name_df

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,0.181818,-1.892984
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679,0.636364,0.673209
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,0.181818,-1.892984
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912,0.000000,-2.919461
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,0.545455,0.159970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174,0.545455,0.159970
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301,0.545455,0.159970
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445,0.454545,-0.353268
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193,0.454545,-0.353268


In [14]:
# Convert all string values in the 'name' column to lowercase
cereal_data['name'] = cereal_data['name'].str.lower()
# Remove leading and trailing spaces from string values in the 'name' column
cereal_data['name'] = cereal_data['name'].str.strip()
# Replace 'bran' with 'fiber' in the 'name' column
cereal_data['name'] = cereal_data['name'].str.replace('bran', 'fiber', case=False)
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore
0,100% fiber,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,0.181818,-1.892984
1,100% natural fiber,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679,0.636364,0.673209
2,all-fiber,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,0.181818,-1.892984
3,all-fiber with extra fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912,0.000000,-2.919461
4,almond delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,0.545455,0.159970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174,0.545455,0.159970
73,trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301,0.545455,0.159970
74,wheat chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445,0.454545,-0.353268
75,wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193,0.454545,-0.353268


In [16]:
# Extract the first 3 characters from the 'name' column
cereal_data['name_prefix'] = cereal_data['name'].str[:3]
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore,name_prefix
0,100% fiber,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,0.181818,-1.892984,100
1,100% natural fiber,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00,33.983679,0.636364,0.673209,100
2,all-fiber,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,0.181818,-1.892984,all
3,all-fiber with extra fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50,93.704912,0.000000,-2.919461,all
4,almond delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,0.545455,0.159970,alm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174,0.545455,0.159970,tri
73,trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00,27.753301,0.545455,0.159970,tri
74,wheat chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445,0.454545,-0.353268,whe
75,wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00,51.592193,0.454545,-0.353268,whe


In [18]:
# Creating a new column 'calories_per_gram' based on 'calories' and 'weight'
cereal_data['calories_per_gram'] = cereal_data['calories'] / (cereal_data['weight'] * 28.3495)  # 1 ounce = 28.3495 grams
# Discretizing the 'calories' column into 3 bins
cereal_data['calories_binned'] = pd.cut(cereal_data['calories'], bins=3, labels=['low', 'medium', 'high'])
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,...,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore,name_prefix,calories_per_gram,calories_binned
0,100% fiber,N,C,70,4,1,130,10.0,5.0,6,...,25,3,1.0,0.33,68.402973,0.181818,-1.892984,100,2.469179,low
1,100% natural fiber,Q,C,120,3,5,15,2.0,8.0,8,...,0,3,1.0,1.00,33.983679,0.636364,0.673209,100,4.232879,medium
2,all-fiber,K,C,70,4,1,260,9.0,7.0,5,...,25,3,1.0,0.33,59.425505,0.181818,-1.892984,all,2.469179,low
3,all-fiber with extra fiber,K,C,50,4,0,140,14.0,8.0,0,...,25,3,1.0,0.50,93.704912,0.000000,-2.919461,all,1.763700,low
4,almond delight,R,C,110,2,2,200,1.0,14.0,8,...,25,3,1.0,0.75,34.384843,0.545455,0.159970,alm,3.880139,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,triples,G,C,110,2,1,250,0.0,21.0,3,...,25,3,1.0,0.75,39.106174,0.545455,0.159970,tri,3.880139,medium
73,trix,G,C,110,1,1,140,0.0,13.0,12,...,25,2,1.0,1.00,27.753301,0.545455,0.159970,tri,3.880139,medium
74,wheat chex,R,C,100,3,1,230,3.0,17.0,3,...,25,1,1.0,0.67,49.787445,0.454545,-0.353268,whe,3.527399,medium
75,wheaties,G,C,100,3,1,200,3.0,17.0,3,...,25,1,1.0,1.00,51.592193,0.454545,-0.353268,whe,3.527399,medium


In [20]:
# DataFrame concatenation
df2  = pd.read_csv('Dataset salary 2024.csv')
concatenated_df = pd.concat([cereal_data, df2], axis=0)
concatenated_df


Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,...,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,100% fiber,N,C,70.0,4.0,1.0,130.0,10.0,5.0,6.0,...,,,,,,,,,,
1,100% natural fiber,Q,C,120.0,3.0,5.0,15.0,2.0,8.0,8.0,...,,,,,,,,,,
2,all-fiber,K,C,70.0,4.0,1.0,260.0,9.0,7.0,5.0,...,,,,,,,,,,
3,all-fiber with extra fiber,K,C,50.0,4.0,0.0,140.0,14.0,8.0,0.0,...,,,,,,,,,,
4,almond delight,R,C,110.0,2.0,2.0,200.0,1.0,14.0,8.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16529,,,,,,,,,,,...,SE,FT,Data Scientist,412000.0,USD,412000.0,US,100.0,US,L
16530,,,,,,,,,,,...,MI,FT,Principal Data Scientist,151000.0,USD,151000.0,US,100.0,US,L
16531,,,,,,,,,,,...,EN,FT,Data Scientist,105000.0,USD,105000.0,US,100.0,US,S
16532,,,,,,,,,,,...,EN,CT,Business Data Analyst,100000.0,USD,100000.0,US,100.0,US,L


In [21]:
concatenated_horizontal_df = pd.concat([cereal_data, df2], axis=1)
concatenated_horizontal_df

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,...,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,100% fiber,N,C,70.0,4.0,1.0,130.0,10.0,5.0,6.0,...,SE,FT,AI Engineer,202730,USD,202730,US,0,US,M
1,100% natural fiber,Q,C,120.0,3.0,5.0,15.0,2.0,8.0,8.0,...,SE,FT,AI Engineer,92118,USD,92118,US,0,US,M
2,all-fiber,K,C,70.0,4.0,1.0,260.0,9.0,7.0,5.0,...,SE,FT,Data Engineer,130500,USD,130500,US,0,US,M
3,all-fiber with extra fiber,K,C,50.0,4.0,0.0,140.0,14.0,8.0,0.0,...,SE,FT,Data Engineer,96000,USD,96000,US,0,US,M
4,almond delight,R,C,110.0,2.0,2.0,200.0,1.0,14.0,8.0,...,SE,FT,Machine Learning Engineer,190000,USD,190000,US,0,US,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16529,,,,,,,,,,,...,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
16530,,,,,,,,,,,...,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
16531,,,,,,,,,,,...,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
16532,,,,,,,,,,,...,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [22]:
# Creating a new column 'calories_per_gram' based on 'calories' and 'weight'
cereal_data['calories_per_gram'] = cereal_data['calories'] / (cereal_data['weight'] * 28.3495)  # 1 ounce = 28.3495 grams
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,...,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore,name_prefix,calories_per_gram,calories_binned
0,100% fiber,N,C,70,4,1,130,10.0,5.0,6,...,25,3,1.0,0.33,68.402973,0.181818,-1.892984,100,2.469179,low
1,100% natural fiber,Q,C,120,3,5,15,2.0,8.0,8,...,0,3,1.0,1.00,33.983679,0.636364,0.673209,100,4.232879,medium
2,all-fiber,K,C,70,4,1,260,9.0,7.0,5,...,25,3,1.0,0.33,59.425505,0.181818,-1.892984,all,2.469179,low
3,all-fiber with extra fiber,K,C,50,4,0,140,14.0,8.0,0,...,25,3,1.0,0.50,93.704912,0.000000,-2.919461,all,1.763700,low
4,almond delight,R,C,110,2,2,200,1.0,14.0,8,...,25,3,1.0,0.75,34.384843,0.545455,0.159970,alm,3.880139,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,triples,G,C,110,2,1,250,0.0,21.0,3,...,25,3,1.0,0.75,39.106174,0.545455,0.159970,tri,3.880139,medium
73,trix,G,C,110,1,1,140,0.0,13.0,12,...,25,2,1.0,1.00,27.753301,0.545455,0.159970,tri,3.880139,medium
74,wheat chex,R,C,100,3,1,230,3.0,17.0,3,...,25,1,1.0,0.67,49.787445,0.454545,-0.353268,whe,3.527399,medium
75,wheaties,G,C,100,3,1,200,3.0,17.0,3,...,25,1,1.0,1.00,51.592193,0.454545,-0.353268,whe,3.527399,medium


In [23]:
# Discretizing the 'calories' column into 3 bins
cereal_data['calories_binned'] = pd.cut(cereal_data['calories'], bins=3, labels=['low', 'medium', 'high'])
cereal_data

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,...,vitamins,shelf,weight,cups,rating,calories_minmax,calories_zscore,name_prefix,calories_per_gram,calories_binned
0,100% fiber,N,C,70,4,1,130,10.0,5.0,6,...,25,3,1.0,0.33,68.402973,0.181818,-1.892984,100,2.469179,low
1,100% natural fiber,Q,C,120,3,5,15,2.0,8.0,8,...,0,3,1.0,1.00,33.983679,0.636364,0.673209,100,4.232879,medium
2,all-fiber,K,C,70,4,1,260,9.0,7.0,5,...,25,3,1.0,0.33,59.425505,0.181818,-1.892984,all,2.469179,low
3,all-fiber with extra fiber,K,C,50,4,0,140,14.0,8.0,0,...,25,3,1.0,0.50,93.704912,0.000000,-2.919461,all,1.763700,low
4,almond delight,R,C,110,2,2,200,1.0,14.0,8,...,25,3,1.0,0.75,34.384843,0.545455,0.159970,alm,3.880139,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,triples,G,C,110,2,1,250,0.0,21.0,3,...,25,3,1.0,0.75,39.106174,0.545455,0.159970,tri,3.880139,medium
73,trix,G,C,110,1,1,140,0.0,13.0,12,...,25,2,1.0,1.00,27.753301,0.545455,0.159970,tri,3.880139,medium
74,wheat chex,R,C,100,3,1,230,3.0,17.0,3,...,25,1,1.0,0.67,49.787445,0.454545,-0.353268,whe,3.527399,medium
75,wheaties,G,C,100,3,1,200,3.0,17.0,3,...,25,1,1.0,1.00,51.592193,0.454545,-0.353268,whe,3.527399,medium


In [24]:
from sklearn.preprocessing import PolynomialFeatures

# Creating polynomial features from 'calories' and 'protein' columns
poly = PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(cereal_data[['calories', 'protein']])
poly_features_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(['calories', 'protein']))

print(poly_features_df.head())

     1  calories  protein  calories^2  calories protein  protein^2
0  1.0      70.0      4.0      4900.0             280.0       16.0
1  1.0     120.0      3.0     14400.0             360.0        9.0
2  1.0      70.0      4.0      4900.0             280.0       16.0
3  1.0      50.0      4.0      2500.0             200.0       16.0
4  1.0     110.0      2.0     12100.0             220.0        4.0
