In [38]:
# use panda to read the dataset
import pandas
data = pandas.read_csv('datasets/data.csv')
data

Unnamed: 0,Flour,Milk,Sugar,Butter,Egg,Baking Powder,Vanilla,Salt,Type
0,55,28,3,7,5,2,0,0,Muffin
1,47,24,12,6,9,1,0,0,Muffin
2,47,23,18,6,4,1,0,0,Muffin
3,45,11,17,17,8,1,0,0,Muffin
4,50,25,12,6,5,2,1,0,Muffin
5,55,27,3,7,5,2,1,0,Muffin
6,54,27,7,5,5,2,0,0,Muffin
7,47,26,10,10,4,1,0,0,Muffin
8,50,17,17,8,6,1,0,0,Muffin
9,50,17,17,11,4,1,0,0,Muffin


In [39]:
# Check for empty slots
data.isnull().sum()

Flour            0
Milk             0
Sugar            0
Butter           0
Egg              0
Baking Powder    0
Vanilla          0
Salt             0
Type             0
dtype: int64

In [40]:
# Decribe the dataset
# Only works with numerical datasets
data.describe()

Unnamed: 0,Flour,Milk,Sugar,Butter,Egg,Baking Powder,Vanilla,Salt
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,39.34,15.92,23.08,9.74,6.94,2.68,0.96,0.96
std,11.213348,8.090912,14.037617,4.898188,3.644453,3.152194,2.732719,1.989359
min,19.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0
25%,34.0,11.0,16.0,6.0,5.0,1.0,0.0,0.0
50%,39.0,17.0,20.0,8.0,6.0,1.0,0.0,0.0
75%,47.75,23.0,26.0,12.0,9.0,2.0,1.0,0.0
max,55.0,28.0,52.0,20.0,14.0,11.0,14.0,6.0


In [41]:
# Splitting the data into x and y values
array = data.values #This converts the entire DataFrame into a NumPy array. Think of it like transforming the table into a simpler format where you only have the raw data without any of the labels (like row and column names).
X = array[:, 0:8] #Here, you're selecting all rows (:) but only the columns from index 0 up to (but not including) 8. In other words, you’re selecting the first 8 columns of the data.
Y = array[:, 8] # This selects all rows but only the 9th column (index 8, because counting starts from 0 in Python). Typically, this column is the target variable, which is what you're trying to predict.
Y

array(['Muffin', 'Muffin', 'Muffin', 'Muffin', 'Muffin', 'Muffin',
       'Muffin', 'Muffin', 'Muffin', 'Muffin', 'Cupcake', 'Cupcake',
       'Cupcake', 'Cupcake', 'Cupcake', 'Cupcake', 'Cupcake', 'Cupcake',
       'Cupcake', 'Cupcake', 'Scone', 'Scone', 'Scone', 'Scone', 'Scone',
       'Muffin', 'Muffin', 'Muffin', 'Muffin', 'Muffin', 'Muffin',
       'Muffin', 'Muffin', 'Muffin', 'Muffin', 'Cupcake', 'Cupcake',
       'Cupcake', 'Cupcake', 'Cupcake', 'Cupcake', 'Cupcake', 'Cupcake',
       'Cupcake', 'Cupcake', 'Scone', 'Scone', 'Scone', 'Scone', 'Scone'],
      dtype=object)

In [42]:
# subdivide the dataset further by creating the x_train, x_test, y_train, y_test
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=42)

In [43]:
# Choose one of the models to work with
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

In [44]:
x_test

array([[39, 13, 17, 19, 10, 1, 1, 0],
       [38, 15, 23, 15, 8, 0, 1, 0],
       [55, 27, 3, 7, 5, 2, 1, 0],
       [29, 3, 46, 6, 0, 11, 0, 6],
       [38, 15, 31, 8, 6, 1, 1, 0],
       [19, 6, 50, 5, 6, 10, 0, 3],
       [47, 24, 12, 6, 9, 1, 0, 0],
       [55, 28, 3, 7, 5, 2, 0, 0],
       [48, 26, 10, 10, 4, 1, 0, 0],
       [34, 17, 23, 11, 13, 0, 1, 0],
       [34, 17, 20, 20, 5, 2, 1, 0],
       [50, 25, 12, 6, 5, 2, 1, 0],
       [34, 17, 20, 20, 5, 2, 1, 0],
       [50, 17, 17, 8, 6, 1, 0, 0],
       [45, 11, 17, 17, 8, 1, 0, 0]], dtype=object)

In [45]:
# lets see what the model predicted against what was the correct output
predicted = model.predict(x_test)
print('The model predicted: ', predicted)
print('We expected: ', y_test)

The model predicted:  ['Cupcake' 'Cupcake' 'Muffin' 'Scone' 'Cupcake' 'Scone' 'Muffin' 'Muffin'
 'Muffin' 'Cupcake' 'Cupcake' 'Muffin' 'Cupcake' 'Muffin' 'Muffin']
We expected:  ['Cupcake' 'Cupcake' 'Muffin' 'Scone' 'Cupcake' 'Scone' 'Muffin' 'Muffin'
 'Muffin' 'Cupcake' 'Cupcake' 'Muffin' 'Cupcake' 'Muffin' 'Muffin']


In [46]:
# we are going to check what is the accuracy score of our model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predicted, y_test) * 100
print(f'The model was {accuracy}% correct')

The model was 100.0% correct


In [47]:
# Now lets test the model based by feeding it with unseen values which represents the 8 columns
ingredients = [[3,65,7,77,2,12,3,4]]
cake = model.predict(ingredients)
print(f'You might be prepairing a: {cake}')

You might be prepairing a: ['Scone']


In [49]:
# Cross validation with other models
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC

# create an empty list that we shall append all our models giving them an alias name.
models = []

# append all
models.append(('Gaussian', GaussianNB()))
models.append(('KNC', KNeighborsClassifier()))
models.append(('Logistic', LogisticRegression()))
models.append(('Random Forest', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('Linear Disc', LinearDiscriminantAnalysis()))
models.append(('Support Machines', SVC(gamma='auto')))

# import cross validation and kfold
from sklearn.model_selection import cross_val_score, KFold

# create a loop to test all models individually
for name, model in models:
    Kfold = KFold(n_splits=10, random_state=42, shuffle=True)

    # we the results for each model
    CV_results = cross_val_score(model, x_train, y_train, cv=Kfold, scoring='accuracy')

    # get the average of all folds
    print('Model Name:', name, 'Results:', CV_results.mean()*100, '%accurate')

Model Name: Gaussian Results: 95.0 %accurate
Model Name: KNC Results: 92.5 %accurate


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model Name: Logistic Results: 97.5 %accurate
Model Name: Random Forest Results: 100.0 %accurate
Model Name: GB Results: 100.0 %accurate
Model Name: Linear Disc Results: 97.5 %accurate
Model Name: Support Machines Results: 69.16666666666667 %accurate
