In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
data = pd.read_csv('cereal.csv')

In [3]:
data.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [4]:
data.tail()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
72,Triples,G,C,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75,39.106174
73,Trix,G,C,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.0,27.753301
74,Wheat Chex,R,C,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67,49.787445
75,Wheaties,G,C,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.0,51.592193
76,Wheaties Honey Gold,G,C,110,2,1,200,1.0,16.0,8,60,25,1,1.0,0.75,36.187559


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      77 non-null     object 
 1   mfr       77 non-null     object 
 2   type      77 non-null     object 
 3   calories  77 non-null     int64  
 4   protein   77 non-null     int64  
 5   fat       77 non-null     int64  
 6   sodium    77 non-null     int64  
 7   fiber     77 non-null     float64
 8   carbo     77 non-null     float64
 9   sugars    77 non-null     int64  
 10  potass    77 non-null     int64  
 11  vitamins  77 non-null     int64  
 12  shelf     77 non-null     int64  
 13  weight    77 non-null     float64
 14  cups      77 non-null     float64
 15  rating    77 non-null     float64
dtypes: float64(5), int64(8), object(3)
memory usage: 9.8+ KB


In [6]:
data.describe()

Unnamed: 0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,106.883117,2.545455,1.012987,159.675325,2.151948,14.597403,6.922078,96.077922,28.246753,2.207792,1.02961,0.821039,42.665705
std,19.484119,1.09479,1.006473,83.832295,2.383364,4.278956,4.444885,71.286813,22.342523,0.832524,0.150477,0.232716,14.047289
min,50.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0.0,1.0,0.5,0.25,18.042851
25%,100.0,2.0,0.0,130.0,1.0,12.0,3.0,40.0,25.0,1.0,1.0,0.67,33.174094
50%,110.0,3.0,1.0,180.0,2.0,14.0,7.0,90.0,25.0,2.0,1.0,0.75,40.400208
75%,110.0,3.0,2.0,210.0,3.0,17.0,11.0,120.0,25.0,3.0,1.0,1.0,50.828392
max,160.0,6.0,5.0,320.0,14.0,23.0,15.0,330.0,100.0,3.0,1.5,1.5,93.704912


# Data Preprocessing

In [7]:
data.isnull().sum()

name        0
mfr         0
type        0
calories    0
protein     0
fat         0
sodium      0
fiber       0
carbo       0
sugars      0
potass      0
vitamins    0
shelf       0
weight      0
cups        0
rating      0
dtype: int64

In [8]:
data = data.drop('name',axis=1)

In [9]:
data.head()

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [10]:
data['mfr'].value_counts()

K    23
G    22
P     9
Q     8
R     8
N     6
A     1
Name: mfr, dtype: int64

# Label Encoding

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le = LabelEncoder()
data['mfr'] = le.fit_transform(data['mfr'])
data.head()

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,3,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,5,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,2,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,2,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,6,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [13]:
data['type'].value_counts()

C    74
H     3
Name: type, dtype: int64

In [14]:
data['type'] = le.fit_transform(data['type'])
data.head()

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,3,0,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,5,0,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,2,0,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,2,0,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,6,0,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


# Splitting data for training and testing

In [15]:
x = data.drop('rating',axis=1)
y = data.loc[:,'rating']

In [16]:
print(x)

    mfr  type  calories  protein  fat  sodium  fiber  carbo  sugars  potass  \
0     3     0        70        4    1     130   10.0    5.0       6     280   
1     5     0       120        3    5      15    2.0    8.0       8     135   
2     2     0        70        4    1     260    9.0    7.0       5     320   
3     2     0        50        4    0     140   14.0    8.0       0     330   
4     6     0       110        2    2     200    1.0   14.0       8      -1   
..  ...   ...       ...      ...  ...     ...    ...    ...     ...     ...   
72    1     0       110        2    1     250    0.0   21.0       3      60   
73    1     0       110        1    1     140    0.0   13.0      12      25   
74    6     0       100        3    1     230    3.0   17.0       3     115   
75    1     0       100        3    1     200    3.0   17.0       3     110   
76    1     0       110        2    1     200    1.0   16.0       8      60   

    vitamins  shelf  weight  cups  
0         25   

In [17]:
print(y)

0     68.402973
1     33.983679
2     59.425505
3     93.704912
4     34.384843
        ...    
72    39.106174
73    27.753301
74    49.787445
75    51.592193
76    36.187559
Name: rating, Length: 77, dtype: float64


In [18]:
scaler = StandardScaler()

In [19]:
scaler.fit_transform(x)

array([[ 0.15874508, -0.20134682, -1.90539669, ...,  0.9578133 ,
        -0.19806746, -2.12387042],
       [ 1.32287566, -0.20134682,  0.67762347, ...,  0.9578133 ,
        -0.19806746,  0.77405275],
       [-0.42332021, -0.20134682, -1.90539669, ...,  0.9578133 ,
        -0.19806746, -2.12387042],
       ...,
       [ 1.90494094, -0.20134682, -0.35558459, ..., -1.46027273,
        -0.19806746, -0.65328254],
       [-1.0053855 , -0.20134682, -0.35558459, ..., -1.46027273,
        -0.19806746,  0.77405275],
       [-1.0053855 , -0.20134682,  0.16101944, ..., -1.46027273,
        -0.19806746, -0.30726187]])

In [20]:
x

Unnamed: 0,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups
0,3,0,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33
1,5,0,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.00
2,2,0,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33
3,2,0,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.50
4,6,0,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1,0,110,2,1,250,0.0,21.0,3,60,25,3,1.0,0.75
73,1,0,110,1,1,140,0.0,13.0,12,25,25,2,1.0,1.00
74,6,0,100,3,1,230,3.0,17.0,3,115,25,1,1.0,0.67
75,1,0,100,3,1,200,3.0,17.0,3,110,25,1,1.0,1.00


In [21]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.25,random_state=42)

In [25]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso

In [28]:
lr = LinearRegression()
ri = Ridge(alpha=0.8)
la = Lasso(alpha=0.8)

In [30]:
lr.fit(xtrain,ytrain)
ri.fit(xtrain,ytrain)
la.fit(xtrain,ytrain)

Lasso(alpha=0.8)

In [31]:
lr_score = lr.score(xtest,ytest)
ri_score = ri.score(xtest,ytest)
la_score = la.score(xtest,ytest)

In [32]:
print(lr_score)

0.9999999999999994


In [33]:
print(ri_score)

0.9999451662134531


In [34]:
print(la_score)

0.9840180909859948
