In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

### Q1. Pandas version

What's the version of Pandas that you installed?

You can get the version information using the `__version__` field:

In [2]:
pd.__version__

'2.1.4'

### Q2. Records count

How many records are in the dataset?

In [3]:
# Reading the document first

df = pd.read_csv(r'car_fuel.csv')

df.shape

(9704, 11)

### Q3. Fuel types

How many fuel types are presented in the dataset?

In [4]:
# Performing EDA to see structure of dataset

df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [5]:
df['fuel_type'].unique()

array(['Gasoline', 'Diesel'], dtype=object)

### Q4. Missing values

How many columns in the dataset have missing values?

In [6]:
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

### Q5. Max fuel efficiency

What's the maximum fuel efficiency of cars from Asia?

In [7]:
df[df['origin'] == 'Asia']['fuel_efficiency_mpg'].max()

23.759122836520497

### Q6. Median value of horsepower



1. Find the median value of `horsepower` column in the dataset.
2. Next, calculate the most frequent value of the same `horsepower` column.
3. Use `fillna` method to fill the missing values in `horsepower` column with the most frequent value from the previous step.
4. Now, calculate the median value of `horsepower` once again.

Has it changed?

In [8]:
# Median horsepower

df['horsepower'].median()

149.0

In [9]:
# Most frequent horsepower value

mode = df['horsepower'].mode()
mode

0    152.0
Name: horsepower, dtype: float64

In [10]:
# Fill missing horsepower values with mode value

df['horsepower'] = df['horsepower'].fillna(mode)

In [11]:
# Calculate new median horsepower

df['horsepower'].median()

149.0

### Q7. Sum of weights

1. Select all the cars from Asia
2. Select only columns `vehicle_weight` and `model_year`
3. Select the first 7 values
4. Get the underlying NumPy array. Let's call it `X`.
5. Compute matrix-matrix multiplication between the transpose of `X` and `X`. To get the transpose, use `X.T`. Let's call the result `XTX`.
6. Invert `XTX`.
7. Create an array `y` with values `[1100, 1300, 800, 900, 1000, 1100, 1200]`.
8. Multiply the inverse of `XTX` with the transpose of `X`, and then multiply the result by `y`. Call the result `w`.
9. What's the sum of all the elements of the result?

> **Note**: You just implemented linear regression. We'll talk about it in the next lesson.

In [12]:
# Select all cars from Asia

df2 = df[df['origin']=='Asia']

In [13]:
# Select only 'vehicle_weight' and 'model_year columns

df2 = df2[['vehicle_weight', 'model_year']]

In [14]:
# Select first 7 values

df2.iloc[:7]

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [15]:
# Get underlying Numpy array

X = df2.iloc[:7].values

In [16]:
# Compute matrix-matrix multiplication between the transpose of X and X. 
# To get the transpose, use X.T 
# Let's call the result XTX.

XTX = np.dot(X.T, X)

In [17]:
# Invert `XTX`.
XTX_inv = np.linalg.inv(XTX)

In [19]:
# Create an array `y` with values `[1100, 1300, 800, 900, 1000, 1100, 1200]`.
# Multiply the inverse of `XTX` with the transpose of `X`, 
# and then multiply the result by `y`. Call the result `w`.
# What's the sum of all the elements of the result?

y = [1100, 1300, 800, 900, 1000, 1100, 1200]

# Multiply the inverse of `XTX` with the transpose of `X`, 
a = np.dot(XTX_inv, X.T)

# and then multiply the result by `y`. Call the result `w`.
w = np.dot(a, y)

w.sum()

0.5187709081074007