## *`Introduciton to Machine Learning`*

In [87]:
import numpy as np
import pandas as pd

pd.__version__, np.__version__

('2.2.3', '2.1.3')

In [88]:
import wget
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
file = wget.download(url)

print(f"Downloaded file: {file}")

Downloaded file: car_fuel_efficiency.csv


In [89]:
df = pd.read_csv('car_fuel_efficiency.csv')

df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [90]:
# How many Records in the dataset?

print(f"Record Count: {len(df)}") # df.shape[0]

Record Count: 9704


In [91]:
# How many fuel types are presented in the dataset?

print(f"Fuel Types : {df['fuel_type'].nunique()}")

df['fuel_type'].value_counts()

Fuel Types : 2


fuel_type
Gasoline    4898
Diesel      4806
Name: count, dtype: int64

In [92]:
# How many columns in the dataset have missing values?
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [93]:
counter = 0
for i in df.isna().sum():
    if i > 0:
        counter += 1

print(f"Columns with missing values: {counter}")

Columns with missing values: 4


In [94]:
# What's the maximum fuel efficiency of cars from Asia?
df['origin'].value_counts()

origin
Europe    3254
Asia      3247
USA       3203
Name: count, dtype: int64

In [95]:
round(df.groupby('origin')['fuel_efficiency_mpg'].max(), 2)

origin
Asia      23.76
Europe    25.97
USA       24.97
Name: fuel_efficiency_mpg, dtype: float64

1. Find the median value of horsepower column in the dataset.
2. Next, calculate the most frequent value of the same horsepower column.
3. Use fillna method to fill the missing values in horsepower column with the most frequent value from the previous step.
4. Now, calculate the median value of horsepower once again.

In [96]:
median1 = df['horsepower'].median() # Before filling missing values

most_frequent = df['horsepower'].mode()[0] # Use this to fill the missing values
df['horsepower'] = df['horsepower'].fillna(most_frequent)

median2 = df['horsepower'].median() # After filling missing values

median1, median2

(149.0, 152.0)

1. Select all the cars from Asia
2. Select only columns vehicle_weight and model_year
3. Select the first 7 values
4. Get the underlying NumPy array. Let's call it X.
5. Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
6. Invert XTX.
7. Create an array y with values [1100, 1300, 800, 900, 1000, 1100, 1200].
8. Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
9. What's the sum of all the elements of the result?

In [97]:
df = df.query("origin == 'Asia'").loc[:, ['vehicle_weight' ,'model_year']].head(7)

df

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [98]:
X = df.values

# Transpose of X
XT = X.T

# Matrix Multiplication of X transpose and X
XTX = X@XT

X.shape, X

((7, 2),
 array([[2714.21930965, 2016.        ],
        [2783.86897424, 2010.        ],
        [3582.68736772, 2007.        ],
        [2231.8081416 , 2011.        ],
        [2659.43145076, 2016.        ],
        [2844.22753389, 2014.        ],
        [3761.99403819, 2019.        ]]))

In [99]:
XT.shape, XT

((2, 7),
 array([[2714.21930965, 2783.86897424, 3582.68736772, 2231.8081416 ,
         2659.43145076, 2844.22753389, 3761.99403819],
        [2016.        , 2010.        , 2007.        , 2011.        ,
         2016.        , 2014.        , 2019.        ]]))

In [100]:
XTX.shape, XTX

((7, 7),
 array([[11431242.46085133, 11608190.92541486, 13770311.23387465,
         10111792.75336224, 11282536.19632532, 11780081.29349753,
         14281180.86123177],
        [11608190.92541486, 11790026.46575708, 14007802.20739725,
         10255171.44187306, 11455668.70489329, 11966096.78727446,
         14531088.48421337],
        [13770311.23387465, 14007802.20739725, 16863697.77478558,
         12031947.83608462, 13574023.463935  , 14232076.05656009,
         17530181.51805091],
        [10111792.75336224, 10255171.44187306, 12031947.83608462,
          9025088.58092514,  9989516.7638366 , 10397924.16669749,
         12456257.9230991 ],
        [11282536.19632532, 11455668.70489329, 13574023.463935  ,
          9989516.7638366 , 11136831.64128077, 11624252.15672766,
         14075069.26273247],
        [11780081.29349753, 11966096.78727446, 14232076.05656009,
         10397924.16669749, 11624252.15672766, 12145826.2645139 ,
         14766233.02574041],
        [14281180.8612317

In [101]:
X.dot(XT).shape, X.dot(XT)

((7, 7),
 array([[11431242.46085133, 11608190.92541486, 13770311.23387465,
         10111792.75336224, 11282536.19632532, 11780081.29349753,
         14281180.86123177],
        [11608190.92541486, 11790026.46575708, 14007802.20739725,
         10255171.44187306, 11455668.70489329, 11966096.78727446,
         14531088.48421337],
        [13770311.23387465, 14007802.20739725, 16863697.77478558,
         12031947.83608462, 13574023.463935  , 14232076.05656009,
         17530181.51805091],
        [10111792.75336224, 10255171.44187306, 12031947.83608462,
          9025088.58092514,  9989516.7638366 , 10397924.16669749,
         12456257.9230991 ],
        [11282536.19632532, 11455668.70489329, 13574023.463935  ,
          9989516.7638366 , 11136831.64128077, 11624252.15672766,
         14075069.26273247],
        [11780081.29349753, 11966096.78727446, 14232076.05656009,
         10397924.16669749, 11624252.15672766, 12145826.2645139 ,
         14766233.02574041],
        [14281180.8612317

In [102]:
# Inverse of Matrix

det = np.linalg.det(XTX)

if det != 0:
    XTX_inv = np.linalg.inv(XTX)
    print("Matrix is invertible")
    print(f"Determinant: {det}")
    print(f"Inverse of Matrix:\n{XTX_inv}")
    print(f"Shape of Inverse Matrix: {XTX_inv.shape}")
else:
    print("Matrix is not invertible")

Matrix is invertible
Determinant: 3.1533446453343684e-31
Inverse of Matrix:
[[ 6.27784782e+08  4.52348582e+08 -5.52589667e+08 -5.04518657e+08
  -1.36497506e+08 -9.70745174e+07  2.07769569e+08]
 [ 6.52427761e+08  1.18496810e+09 -8.15933475e+08 -7.54720098e+08
  -7.08834488e+08  2.53306835e+08  1.86771033e+08]
 [-4.82034184e+08 -5.85397198e+08  3.79915957e+08  6.66310906e+08
   1.18051009e+08 -1.47095872e+08  5.16329750e+07]
 [-4.86725764e+08 -5.86851309e+08  7.06448581e+08  6.82137781e+08
   2.06347119e+08 -3.29606193e+08 -1.88697388e+08]
 [-2.37576087e+08 -7.37561522e+08  2.26902495e+08  2.89910408e+08
   1.50145296e+08  3.48995157e+08 -4.08718420e+07]
 [-2.28618838e+08  1.97697960e+08 -2.05812130e+07 -2.25519129e+08
   3.60824331e+08 -2.92220029e+06 -8.08263491e+07]
 [ 1.52870693e+08  7.43387198e+07  7.67225188e+07 -1.51179749e+08
   9.23761238e+06 -2.62872552e+07 -1.35338415e+08]]
Shape of Inverse Matrix: (7, 7)


In [103]:
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])

y.shape, y

((7,), array([1100, 1300,  800,  900, 1000, 1100, 1200]))

In [104]:
XTX_inv.shape , XT.shape

((7, 7), (2, 7))

In [105]:
w = XT.dot(XTX_inv)
w = w @ y

w.shape, w

((2,), array([ 1.10015869, -0.09307861]))

In [106]:
w.sum()

np.float64(1.007080078125)