# Imports & Dataset Setup

In [54]:
import numpy as np
import pandas as pd

In [3]:
# Download the dataset
!curl https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv -o car_fuel_efficiency.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  853k  100  853k    0     0  2289k      0 --:--:-- --:--:-- --:--:-- 2301k


# Q1. Pandas version
What's the version of Pandas that you installed?

In [9]:
# Report the installed pandas version, read from the package's __version__ attribute.
print(f"Pandas version: {pd.__version__}")

Pandas version: 2.3.2


# Q2. Records count
How many records are in the dataset?

In [10]:
# Load the downloaded CSV into a DataFrame; the row count is the number of records.
df = pd.read_csv("car_fuel_efficiency.csv")
print("Records count:", df.shape[0])

Records count: 9704


# Q3. Fuel types
How many fuel types are present in the dataset?

In [12]:
# Preview the first rows (head() shows 5 by default) to see which columns
# the CSV provides before answering the question.
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [13]:
# Number of distinct fuel types: unique values of the column, ignoring missing entries.
types_of_fuel = len(df["fuel_type"].dropna().unique())
print(f"Types of fuel: {types_of_fuel}")

Types of fuel: 2


# Q4. Missing values
How many columns in the dataset have missing values?

In [26]:
# Per-column counts of the two kinds of "missing" entries:
# NaN/None values and empty strings.
nan_missing = df.isna().sum()
empty_missing = df.eq("").sum()
total_missing = nan_missing.add(empty_missing)

# A column is incomplete when it holds at least one missing entry;
# print how many such columns there are.
columns_with_missing = total_missing.gt(0)
print(columns_with_missing.sum())

4


# Q5. Max fuel efficiency
What's the maximum fuel efficiency of cars from Asia?

In [36]:
# Keep only the rows whose origin is Asia.
asian_cars = df.loc[df["origin"].eq("Asia")]

# Highest fuel efficiency among those rows.
max_mpg_asia = asian_cars.loc[:, "fuel_efficiency_mpg"].max()
print(f"Max fuel efficiency for cars from Asia: {max_mpg_asia:.4f} mpg.")

Max fuel efficiency for cars from Asia: 23.7591 mpg.


# Q6. Median value of horsepower
1. Find the median value of `horsepower` column in the dataset.
2. Next, calculate the most frequent value of the same `horsepower` column.
3. Use `fillna` method to fill the missing values in `horsepower` column with the most frequent value from the previous step.
4. Now, calculate the median value of `horsepower` once again.

In [43]:
# Step 1: median of the raw horsepower column (missing values still present).
df_hp = df.loc[:, "horsepower"]
median_val = df_hp.median()
print(f"Initial median value: {median_val} hp.")

# Step 2: mode() returns a Series (several entries on a tie); take the first one.
most_freq_val = df_hp.mode().iloc[0]
print(f"Most frequent value: {most_freq_val} hp.")

# Step 3: replace the missing entries with the most frequent value.
df_hp = df_hp.fillna(value=most_freq_val)

# Step 4: median of the filled column.
median_val = df_hp.median()
print(f"Final median value: {median_val} hp.")

Initial median value: 149.0 hp.
Most frequent value: 152.0 hp.
Final median value: 152.0 hp.


# Q7. Sum of weights
1. Select all the cars from Asia
2. Select only columns `vehicle_weight` and `model_year`
3. Select the first 7 values
4. Get the underlying NumPy array. Let's call it `X`.
5. Compute matrix-matrix multiplication between the transpose of `X` and `X`. To get the transpose, use `X.T`. Let's call the result `XTX`.
6. Invert `XTX`.
7. Create an array `y` with values `[1100, 1300, 800, 900, 1000, 1100, 1200]`.
8. Multiply the inverse of `XTX` with the transpose of `X`, and then multiply the result by `y`. Call the result `w`.
9. What's the sum of all the elements of the result?

> **Note**: You just implemented linear regression. We'll talk about it in the next lesson.

In [55]:
# Step 1: rows whose origin is Asia.
df_asian_cars = df.loc[df["origin"].eq("Asia")]
print(f"Number of Asian cars: {len(df_asian_cars)}.")

# Step 2: keep just the two feature columns.
df_asian_cars = df_asian_cars.loc[:, ["vehicle_weight", "model_year"]]
print("Selected columns:")
print(df_asian_cars.head())

# Step 3: first seven rows only.
df_asian_cars = df_asian_cars.iloc[:7]
print(df_asian_cars)

# Step 4: underlying NumPy array, one row per car.
X = df_asian_cars.to_numpy()
print(X)

# Step 5: Gram matrix X^T X.
XTX = np.matmul(X.T, X)

# Step 6: its inverse.
XTX_inv = np.linalg.inv(XTX)

# Step 7: target values, one per selected row.
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])

# Step 8: w = (X^T X)^-1 X^T y, multiplied left to right.
w = np.matmul(np.matmul(XTX_inv, X.T), y)

# Step 9: sum of the elements of w.
print(f"Sum of elements: {w.sum()}.")

Number of Asian cars: 3247.
Selected columns:
    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019
[[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]
Sum of elements: 0.5187709081074007.
