In [None]:
import pandas as pd
from sklearn import linear_model
import numpy as np

# Load the cars dataset. pandas is imported under the alias `pd`, so the bare
# name `pandas` would raise a NameError on a fresh kernel.
df = pd.read_csv("Unit03 cars.csv")

# Check the actual column names in the DataFrame
print(df.columns)

# Columns used by this model: two predictors and one target.
feature_cols = ['weight', 'horsepower']
target_col = 'mpg'
model_cols = feature_cols + [target_col]

# Missing values are stored as '?' strings, which would prevent a numeric fit.
# Replace '?' with NaN (returns a new frame instead of mutating in place).
df = df.replace('?', np.nan)

# Convert the model columns to numeric, coercing anything unparseable to NaN.
for col in model_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows that are missing any value the model needs.
df = df.dropna(subset=model_cols)

# The Weight, Volume and CO2 columns used by the original tutorial code do not
# exist in this dataset, so weight and horsepower are used for X and mpg for y.
X = df[feature_cols]
y = df[target_col]

regr = linear_model.LinearRegression()
regr.fit(X, y)

# Predict the mpg of a car with weight 2300 and 100 horsepower.
# NOTE(review): the original comment gave the weight in kg; the unit of the
# 'weight' column is not visible here -- confirm against the dataset docs.
# The query is passed as a DataFrame with the same column names used in fit(),
# so scikit-learn does not warn about missing feature names.
predictedmpg = regr.predict(pd.DataFrame([[2300, 100]], columns=feature_cols))

print(predictedmpg)

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')
[27.58336259]




# Coefficient

The coefficient is a factor that describes the relationship with an unknown variable.
In this case, we can ask for the coefficient value of weight against mpg, and of horsepower against mpg. The answers we get tell us what would happen if we increase, or decrease, one of the independent values.

In [None]:
# Show the fitted slope of each predictor of the first model (weight, horsepower).
weight_hp_coefs = regr.coef_
print(weight_hp_coefs)

[-0.00579416 -0.04730286]


The result array represents the coefficient values of weight and horsepower.

Weight: -0.00579416
Horsepower: -0.04730286

These values tell us that if the weight increases by 1 unit, the predicted mpg decreases by 0.00579416 (holding horsepower constant).

And if the horsepower increases by 1, the predicted mpg decreases by 0.04730286 (holding weight constant).

I think that is a fair guess, but let's test it!

We have already predicted that a car with a weight of 2300 and 100 horsepower gets approximately 27.58 mpg.

What if we increase the weight by 400 (from 2300 to 2700) and the horsepower by 30 (from 100 to 130)? What will the mpg be?

Ans: 27.58336259 + (400 * -0.00579416) + (30 * -0.04730286) ≈ 23.84661

In [None]:
# Predict mpg for a car with weight 2700 and 130 horsepower using the first
# model. The query is a DataFrame whose columns match the ones used in fit(),
# which avoids scikit-learn's "X does not have valid feature names" warning.
predictedmpg = regr.predict(
    pd.DataFrame([[2700, 130]], columns=['weight', 'horsepower'])
)

print(predictedmpg)

[23.84661375]




## Here are my experiments with the underlying data sets and how this changes the results:

**I also had to experiment with the underlying code to fix the cell above so that it runs as it should (see my comments in the code). I now additionally fit a second multivariate linear regression:**

In [None]:
import pandas as pd
from sklearn import linear_model
import numpy as np

# Reload the cars dataset so this experiment starts from the raw file.
# pandas is imported under the alias `pd`, so the bare name `pandas` would
# raise a NameError on a fresh kernel.
df = pd.read_csv("Unit03 cars.csv")

# Check the actual column names in the DataFrame
print(df.columns)

# I now analyse the influence that the number of cylinders and the
# displacement have on miles per gallon.
feature_cols = ['cylinders', 'displacement']
target_col = 'mpg'
model_cols = feature_cols + [target_col]

# Missing values are stored as '?' strings, which would prevent a numeric fit.
# Replace '?' with NaN (returns a new frame instead of mutating in place).
df = df.replace('?', np.nan)

# Convert the model columns to numeric, coercing anything unparseable to NaN.
for col in model_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows that are missing any value the model needs.
df = df.dropna(subset=model_cols)

# The Weight, Volume and CO2 columns used by the original tutorial code do not
# exist in this dataset, so cylinders and displacement are used for X and mpg
# for y.
X = df[feature_cols]
y = df[target_col]

regr2 = linear_model.LinearRegression()
regr2.fit(X, y)

# Predict the mpg of a car with 4 cylinders and a displacement of 6.
# NOTE(review): the original comment gave the displacement in liters; the unit
# of the 'displacement' column is not visible here -- confirm.
# The prediction must come from regr2 (fitted on cylinders/displacement), not
# from the earlier weight/horsepower model `regr`. The query is a DataFrame
# with matching column names so scikit-learn does not warn about feature names.
predictedmpg = regr2.predict(pd.DataFrame([[4, 6]], columns=feature_cols))

print(predictedmpg)

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')
[34.06178958]




**Next, I again print the regression coefficients:**

In [None]:
# Show the fitted slope of each predictor of the second model (cylinders, displacement).
cyl_disp_coefs = regr2.coef_
print(cyl_disp_coefs)

[-0.51735734 -0.05225841]


**And finally I again create a prediction:**

In [None]:
# Prediction of the mpg for a car with 5 cylinders and a displacement of 10.
# This uses regr2, the model fitted on cylinders and displacement; calling
# `regr` here would feed these values to the weight/horsepower model instead.
# The query is a DataFrame with matching column names so scikit-learn does not
# warn about missing feature names.
predictedmpg = regr2.predict(
    pd.DataFrame([[5, 10]], columns=['cylinders', 'displacement'])
)

print(predictedmpg)

[33.33539859]


