In [62]:
import pandas as pd
import numpy as np

### Q1 Pandas version
What's the version of Pandas that you installed?

In [2]:
pd.__version__

'2.3.2'

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")

In [4]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


### Q2. Records count
How many records are in the dataset?

In [5]:
df.shape

(9704, 11)

### Q3. Fuel types
How many fuel types are presented in the dataset?

In [7]:
df.fuel_type.nunique()

2

### Q4. Missing values
How many columns in the dataset have missing values?

In [12]:
df.isnull().any().sum()

np.int64(4)

In [13]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

### Q5. Max fuel efficiency
What's the maximum fuel efficiency of cars from Asia?

In [25]:
df[df.origin == "Asia"].fuel_efficiency_mpg.max().round(3)

np.float64(23.759)

In [23]:
df[df.origin == "Asia"].sort_values("fuel_efficiency_mpg", ascending=False).head(1)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
9387,330,3.0,136.0,1223.298226,,2001,Asia,Gasoline,Front-wheel drive,1.0,23.759123


### Q6. Median value of horsepower



1. Find the median value of `horsepower` column in the dataset.
2. Next, calculate the most frequent value of the same `horsepower` column.
3. Use `fillna` method to fill the missing values in `horsepower` column with the most frequent value from the previous step.
4. Now, calculate the median value of `horsepower` once again.

Has it changed?

- Yes, it increased
- Yes, it decreased
- **No**

In [36]:
# 1
df.horsepower.median()

np.float64(149.0)

In [37]:
# 2
df.horsepower.mode()

0    152.0
Name: horsepower, dtype: float64

In [42]:
df.horsepower.fillna(df.horsepower.mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.horsepower.fillna(df.horsepower.mode()[0], inplace=True)


In [43]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower               0
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [44]:
df.horsepower.median()

np.float64(152.0)

### Q7. Sum of weights

1. Select all the cars from Asia
2. Select only columns `vehicle_weight` and `model_year`
3. Select the first 7 values
4. Get the underlying NumPy array. Let's call it `X`.
5. Compute matrix-matrix multiplication between the transpose of `X` and `X`. To get the transpose, use `X.T`. Let's call the result `XTX`.
6. Invert `XTX`.
7. Create an array `y` with values `[1100, 1300, 800, 900, 1000, 1100, 1200]`.
8. Multiply the inverse of `XTX` with the transpose of `X`, and then multiply the result by `y`. Call the result `w`.
9. What's the sum of all the elements of the result?

> **Note**: You just implemented linear regression. We'll talk about it in the next lesson.

- 0.051
- **0.51**
- 5.1
- 51


In [47]:
df_asia = df[df.origin == "Asia"]

In [56]:
X = df_asia.loc[ :, ["vehicle_weight","model_year"]].head(7).values

In [60]:
XTX = X.T.dot(X)

In [61]:
y = [1100, 1300, 800, 900, 1000, 1100, 1200]

In [63]:
w = np.linalg.inv(XTX).dot(X.T).dot(y)

In [65]:
w.sum().round(3)

np.float64(0.519)