In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [3]:
df = pd.read_csv("data/cerealKeren.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973
1,1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679
2,2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505
3,3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912
4,4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843


In [4]:
df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843


In [6]:
df[(df["rating"] > 30) & (df["mfr"].isin(["N", "K"]))].head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912
6,Apple Jacks,K,C,110,2,0,125,1.0,11.0,14.0,30.0,FDA_25,2,1.0,1.0,33.174094
16,Corn Flakes,K,C,100,2,0,290,1.0,21.0,2.0,35.0,FDA_25,1,1.0,1.0,45.863324


In [9]:
df[(df["rating"] > 30) & (df["mfr"].isin(["N", "K"]))].mean()

  df[(df["rating"] > 30) & (df["mfr"].isin(["N", "K"]))].mean()


calories    103.571429
protein       2.714286
fat           0.500000
sodium      144.821429
fiber         3.107143
shelf         2.214286
weight        1.057857
cups          0.796786
rating       49.670420
dtype: float64

In [14]:
keren = df[(df["rating"] > 30) & (df["mfr"].isin(["N", "K"]))].copy()
keren.sort_values(by="rating", ascending=False, inplace=True)
keren.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912
64,Shredded Wheat 'n'Bran,N,C,90,3,0,0,4.0,19.0,0.0,140.0,FDA_0,1,1.0,0.67,74.472949
65,Shredded Wheat spoon size,N,C,90,3,0,0,3.0,20.0,0.0,120.0,FDA_0,1,1.0,0.67,72.801787
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973
63,Shredded Wheat,N,C,80,2,0,0,3.0,16.0,0.0,95.0,FDA_0,1,0.83,1.0,68.235885


In [15]:
keren.groupby("mfr").mean()

Unnamed: 0_level_0,calories,protein,fat,sodium,fiber,shelf,weight,cups,rating
mfr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
K,108.181818,2.681818,0.590909,174.090909,2.863636,2.363636,1.081364,0.801818,44.680016
N,86.666667,2.833333,0.166667,37.5,4.0,1.666667,0.971667,0.778333,67.968567


In [26]:
df.groupby(["mfr", "shelf"]).rating.agg(["mean", "std", "count"])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,count
mfr,shelf,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2,54.850917,,1
G,1,39.872079,9.533802,6
G,2,26.069929,6.396781,7
G,3,37.440751,5.390828,9
K,1,42.747695,9.133588,4
K,2,37.131878,9.857668,7
K,3,48.497559,17.018441,12
N,1,71.836874,3.22855,3
N,2,61.948905,3.655617,2
N,3,68.402973,,1


In [27]:
df.pivot_table(index="mfr", columns="shelf", values="rating", aggfunc=["mean", "std", "count"])

Unnamed: 0_level_0,mean,mean,mean,std,std,std,count,count,count
shelf,1,2,3,1,2,3,1,2,3
mfr,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,,54.850917,,,,,,1.0,
G,39.872079,26.069929,37.440751,9.533802,6.396781,5.390828,6.0,7.0,9.0
K,42.747695,37.131878,48.497559,9.133588,9.857668,17.018441,4.0,7.0,12.0
N,71.836874,61.948905,68.402973,3.22855,3.655617,,3.0,2.0,1.0
P,31.997429,28.025765,47.221846,4.603286,,6.756369,2.0,1.0,6.0
Q,50.828392,28.414072,51.814327,,14.772502,13.271956,1.0,3.0,4.0
R,45.587913,,37.498082,4.478015,,4.753519,4.0,,4.0


In [28]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843


In [29]:
df.shape

(77, 16)

In [30]:
len(df)

77

In [32]:
df["kolomKeren"] = np.random.rand(len(df))

In [33]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,kolomKeren
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973,0.383753
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679,0.99215
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505,0.280479
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912,0.893086
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843,0.620341


In [34]:
df.drop(["kolomKeren"], axis=1, inplace=True)

In [35]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843


In [36]:
df["shelf_name"] = df.shelf.map({1: "bawah", 2: "tengah", 3: "atas"})

In [37]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,shelf_name
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973,atas
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679,atas
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505,atas
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912,atas
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843,atas


In [42]:
def convertGtoMg(x):
  return x*1000

def convertToLower(x):
  return x.lower()

In [40]:
df["fatMg"] = df["fat"].apply(convertGtoMg)

In [41]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,shelf_name,fatMg
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973,atas,1000
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679,atas,5000
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505,atas,1000
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912,atas,0
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843,atas,2000


In [43]:
df["mfrToLower"] = df["mfr"].apply(convertToLower)

In [44]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,shelf_name,fatMg,mfrToLower
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973,atas,1000,n
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679,atas,5000,q
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505,atas,1000,k
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912,atas,0,k
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843,atas,2000,r


In [45]:
df["UseLambdafatMg"] = df["fat"].apply(lambda x: x*1000)

In [46]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,shelf_name,fatMg,mfrToLower,UseLambdafatMg
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,280.0,FDA_25,3,1.0,0.33,68.402973,atas,1000,n,1000
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,135.0,FDA_0,3,1.0,1.0,33.983679,atas,5000,q,5000
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,320.0,FDA_25,3,1.0,0.33,59.425505,atas,1000,k,1000
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,330.0,FDA_25,3,1.0,0.5,93.704912,atas,0,k,0
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,ready,FDA_25,3,1.0,0.75,34.384843,atas,2000,r,2000


In [47]:
df["scaledRating"] = df["rating"].transform(lambda x: (x-x.min())/(x.max()-x.min()))

In [48]:
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,...,vitamins,shelf,weight,cups,rating,shelf_name,fatMg,mfrToLower,UseLambdafatMg,scaledRating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6.0,...,FDA_25,3,1.0,0.33,68.402973,atas,1000,n,1000,0.665593
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8.0,...,FDA_0,3,1.0,1.0,33.983679,atas,5000,q,5000,0.210685
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,...,FDA_25,3,1.0,0.33,59.425505,atas,1000,k,1000,0.546941
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0.0,...,FDA_25,3,1.0,0.5,93.704912,atas,0,k,0,1.0
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8.0,...,FDA_25,3,1.0,0.75,34.384843,atas,2000,r,2000,0.215987


In [49]:
df["scaledRating"] = df["rating"].apply(lambda x: (x-x.min())/(x.max()-x.min()))

AttributeError: 'float' object has no attribute 'min'