# Cleaning Data

In [1]:
# Import all

import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Load Data: read the cookie dataset from a CSV file given by a relative path.

cookies = pd.read_csv(filepath_or_buffer="cookies.csv")

In [3]:
# Explore Data
#
# A notebook cell only renders its LAST expression automatically, so the
# preview must be handed to display() explicitly; as a bare statement in the
# middle of the cell its result would be silently discarded.

display(cookies.head())  # first rows, rendered as a table
cookies.info()           # prints column dtypes and non-null counts
cookies.isna().sum()     # missing values per column (rendered as the cell result)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5198 entries, 0 to 5197
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sugar to flour ratio  5198 non-null   float64
 1   sugar index           5193 non-null   float64
 2   bake temp             5198 non-null   int64  
 3   chill time            5198 non-null   float64
 4   calories              5198 non-null   float64
 5   density               5198 non-null   float64
 6   pH                    5198 non-null   float64
 7   grams baking soda     5198 non-null   float64
 8   bake time             5188 non-null   float64
 9   quality               5198 non-null   int64  
 10  butter type           5198 non-null   object 
 11  weight                5198 non-null   float64
 12  diameter              5198 non-null   int64  
 13  mixins                5196 non-null   object 
 14  crunch factor         5198 non-null   float64
 15  aesthetic appeal     

sugar to flour ratio     0
sugar index              5
bake temp                0
chill time               0
calories                 0
density                  0
pH                       0
grams baking soda        0
bake time               10
quality                  0
butter type              0
weight                   0
diameter                 0
mixins                   2
crunch factor            0
aesthetic appeal         0
dtype: int64

In [4]:
# Remove every row that has a missing value, then discard the columns that
# are not carried forward into the rest of the notebook.

cookies = cookies.dropna().drop(
    columns=["crunch factor", "density", "diameter", "pH", "aesthetic appeal"]
)



# Dummy Values

In [5]:
# One-hot encode the two text ("object") columns and replace them with the
# resulting indicator columns (mixins indicators first, butter type after).

one = pd.get_dummies(cookies["mixins"])
two = pd.get_dummies(cookies["butter type"])

cookies = pd.concat(
    [cookies.drop(columns=["butter type", "mixins"]), one, two],
    axis=1,
)



# Outliers

In [6]:
# Identify outliers
#
# A pyod KNN detector is fitted on every column of the cleaned frame and then
# asked to label those same rows; rows labelled 1 are collected as outliers.
# NOTE(review): at this point the frame is unscaled and still contains the
# target column "quality" — confirm that fitting the detector on it is intended.

from pyod.models.knn import KNN

X = pd.DataFrame(cookies)
clf = KNN(contamination=0.01)
clf.fit(X)

y_pred = clf.predict(X)
is_outlier = y_pred == 1
cookies_outliners = X.loc[is_outlier]

cookies_outliners  # This DataFrame contains all outliers

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,grams baking soda,bake time,quality,weight,chocolate,...,"nuts, oats","nuts, oats, chocolate","nuts,raisins",oats,peanut butter,"peanut butter, raisins",raisins,"raisins, oats",cubed,melted
22,0.28,14.7,510,29.0,-99.0,0.39,9.0,9,14.0,0,...,0,0,0,0,0,0,0,0,0,1
70,0.58,1.4,2040,15.0,97.0,0.43,10.0,8,12.6,1,...,0,0,0,0,0,0,0,0,0,1
200,0.37,1.0,380,6.0,-99.0,0.34,11.4,7,11.2,0,...,0,0,0,0,0,0,1,0,0,1
302,0.49,2.0,2700,23.0,110.0,1.02,9.3,5,17.8,0,...,0,1,0,0,0,0,0,0,1,0
327,0.81,1.2,2170,14.0,74.0,0.53,9.5,7,13.0,0,...,0,0,0,0,0,0,1,0,0,1
387,0.24,2.5,2260,5.0,15.0,0.6,9.0,6,17.4,0,...,1,0,0,0,0,0,0,0,1,0
435,0.5,2.0,3370,27.0,81.0,1.61,9.5,6,17.8,0,...,0,0,1,0,0,0,0,0,1,0
743,0.44,2.4,2140,5.0,12.0,0.63,9.5,6,20.6,0,...,0,0,0,0,0,0,0,0,1,0
761,0.49,2.1,2000,5.0,16.0,0.63,8.4,3,20.8,0,...,0,1,0,0,0,0,0,0,1,0
800,0.17,2.4,2410,6.0,18.0,0.59,10.3,6,13.8,0,...,0,1,0,0,0,0,0,0,1,0


In [7]:
# Drop outliers
#
# Keep only the rows whose index label is NOT among the flagged outliers.
# A single boolean mask removes all of them in one vectorised step and keeps
# the cell re-runnable: labels that are already gone are simply not matched,
# whereas DataFrame.drop() raises KeyError for labels it cannot find.

cookies = cookies.loc[~cookies.index.isin(cookies_outliners.index)]

cookies

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,grams baking soda,bake time,quality,weight,chocolate,...,"nuts, oats","nuts, oats, chocolate","nuts,raisins",oats,peanut butter,"peanut butter, raisins",raisins,"raisins, oats",cubed,melted
0,0.25,9.5,300,15.0,136.0,0.44,12.1,8,15.2,0,...,0,0,0,0,0,0,1,0,0,1
1,0.23,3.3,520,34.0,113.0,0.48,8.4,7,12.4,0,...,0,0,0,0,0,0,1,0,0,1
2,0.18,1.9,360,33.0,106.0,0.83,14.0,9,9.4,0,...,0,0,0,0,0,0,0,0,0,1
3,0.18,10.5,490,41.0,124.0,0.35,10.5,7,12.2,1,...,0,0,0,0,0,0,0,0,0,1
4,0.24,2.4,770,6.0,33.0,0.57,9.4,5,19.8,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5193,0.66,2.0,390,15.0,88.0,0.60,11.5,6,11.6,0,...,0,0,0,0,1,0,0,0,1,0
5194,0.16,1.5,490,10.0,97.0,0.44,10.7,8,10.0,0,...,0,0,0,0,0,0,1,0,0,1
5195,0.25,14.3,480,79.0,149.0,0.37,9.7,7,13.4,0,...,0,0,0,0,0,0,0,0,0,1
5196,0.33,1.4,560,35.0,136.0,0.78,10.3,8,13.8,1,...,0,0,0,0,0,0,0,0,0,1


# Split Data

In [8]:
# Split Data: separate the target ("quality") from the features, then hold
# out 10 % of the rows as a test set with a fixed seed.

from sklearn.model_selection import train_test_split

y = cookies["quality"]
X = cookies.drop(columns=["quality"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=8,
)


In [10]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Modeling

In [21]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 250 trees and a fixed seed.
# max_features=1.0 considers all features at every split, which is what
# "auto" meant for regressors. "auto" was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it is rejected as an invalid parameter value.
rdf = RandomForestRegressor(n_estimators=250, max_features=1.0, random_state=8)
rdf.fit(X_train, y_train)
y_pred = rdf.predict(X_test)  # predictions for the held-out test rows