## Q1. Pandas version

In [2]:
import pandas as pd

# Q1: report which pandas release is installed in this kernel.
installed_version = pd.__version__
print(installed_version)

2.3.1


## Getting the data

In [3]:
# Remote CSV that holds the car fuel-efficiency records used throughout.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
data = pd.read_csv(DATA_URL)

In [30]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
5,190,3.0,,2484.883986,14.7,2008,Europe,Gasoline,All-wheel drive,-1.0,17.271818
6,240,7.0,127.0,3006.542287,22.2,2012,USA,Gasoline,Front-wheel drive,1.0,13.210412
7,150,4.0,239.0,3638.65778,17.3,2020,USA,Diesel,All-wheel drive,1.0,12.848884
8,250,1.0,174.0,2714.21931,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
9,150,4.0,123.0,3509.036569,10.2,2005,USA,Gasoline,Front-wheel drive,-1.0,12.298355


## Q2. Records count

In [5]:
# Q2: size of the frame as (rows, columns).
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(9704, 11)


In [6]:
# List the column labels of the frame.
column_labels = data.columns
print(column_labels)

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')


## Q3. Fuel types

In [7]:
# Q3: distinct values found in the fuel_type column.
fuel_types = data['fuel_type'].unique()
print(fuel_types)

['Gasoline' 'Diesel']


In [8]:
# Q3: how many distinct fuel types there are.
fuel_type_count = data['fuel_type'].nunique()
print(fuel_type_count)

2


## Q4. Missing values

In [21]:
# Missing-value mask for the first 10 rows (True marks a missing cell).
data.head(10).isna()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
5,False,False,True,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False


In [16]:
# Count the missing entries in every column (isna is an alias of isnull).
missing_values_per_column = data.isna().sum()
print(missing_values_per_column)

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


In [18]:
# display() is often preferred over print() in Colab because it renders
# pandas Series and DataFrames with richer formatting.
# The result did look better in Colab.
display(missing_values_per_column)

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [23]:
# Q4: how many columns have at least one missing entry?
has_missing = missing_values_per_column.gt(0)
columns_with_missing_values = has_missing.sum()

print(f"Number of columns with missing values: {columns_with_missing_values}")

Number of columns with missing values: 4


## Q5. Max fuel efficiency

In [24]:
# Q5: keep only the rows whose origin is 'Asia', then take the best mpg among them.
is_asia = data['origin'] == 'Asia'
asia_cars = data.loc[is_asia]
max_fuel_efficiency_asia = asia_cars['fuel_efficiency_mpg'].max()
print(max_fuel_efficiency_asia)

23.759122836520497


## Q6. Median value of horsepower

In [29]:
# Q6, step 1: median of 'horsepower' as currently stored in `data`.
median_horsepower_before = data.loc[:, 'horsepower'].median()
print(f"Median horsepower before filling missing values: {median_horsepower_before}")

Median horsepower before filling missing values: 149.0


In [27]:
# Q6, step 2: most frequent 'horsepower' value.
# Series.mode() always returns a Series (with several entries when values tie),
# so [0] is needed to pull out the first entry as a scalar.
horsepower_modes = data['horsepower'].mode()
most_frequent_horsepower = horsepower_modes[0]
print(f"Most frequent horsepower: {most_frequent_horsepower}")

Most frequent horsepower: 152.0


In [31]:
# Q6, step 3: replace the missing 'horsepower' entries with the most frequent value.
# This overwrites the column inside `data`, so any cell that reads
# data['horsepower'] and is run after this one sees the filled values.
data['horsepower'] = data['horsepower'].fillna(value=most_frequent_horsepower)

In [38]:
# Q6, step 4: median of 'horsepower' again, now that the gaps are filled.
median_horsepower_after = data.loc[:, 'horsepower'].median()
print(f"Median horsepower after filling missing values: {median_horsepower_after}")
print("\nYes, it increased")

Median horsepower after filling missing values: 152.0

Yes, it increased


## Q7. Sum of weights

In [39]:
# Q7, step 1: rows of `data` whose origin is 'Asia'.
asia_cars_df = data.loc[data['origin'].eq('Asia')]

In [51]:
# Select only columns vehicle_weight and model_year
relevant_column = asia_cars_df[['vehicle_weight', 'model_year']]
# Show the frame that was just built. `X` does not exist yet at this point
# in the notebook, so displaying it here fails on a fresh kernel.
display(relevant_column)

Unnamed: 0,vehicle_weight,model_year
8,2714.219310,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
...,...,...
9688,3948.404625,2018
9692,3680.341381,2016
9693,2545.070139,2012
9698,3107.427820,2005


In [58]:
# Q7, step 3: keep the first 7 rows by position.
relevant_column_7 = relevant_column.iloc[:7]
print(relevant_column_7)
type(relevant_column_7)

    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019


pandas.core.frame.DataFrame

In [47]:
import numpy as np

In [59]:
# Get the underlying NumPy array. Let's call it X.
# The printed result shows model_year as floats (e.g. 2016.), i.e. both
# columns end up together in one floating-point array.
X = relevant_column_7.to_numpy()
print(X)
type(X)

[[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]


numpy.ndarray

In [60]:
# Q7, step 5: matrix product of X's transpose with X, stored as XTX.
# XT is kept as its own name because a later cell reuses it.
XT = X.transpose()
XTX = np.matmul(XT, X)

In [61]:
# Invert XTX (the 2x2 product of X's transpose with X, since X has 2 columns).
from numpy.linalg import inv
XTX_inv = inv(XTX)

In [62]:
# Q7, step 7: target vector y with 7 entries, one per row of X.
target_values = [1100, 1300, 800, 900, 1000, 1100, 1200]
y = np.array(target_values)
print(y)

[1100 1300  800  900 1000 1100 1200]


In [63]:
# Q7, step 8: w = (inverse of XTX) times XT times y,
# evaluated left to right in the same order as before.
XTX_inv_XT = np.matmul(XTX_inv, XT)
w = np.matmul(XTX_inv_XT, y)
print(w)

[0.01386421 0.5049067 ]


In [64]:
# Q7 answer: add up every element of w.
sum_w = w.sum()

print(sum_w)

0.5187709081074016
