In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
import pingouin as pg
%matplotlib inline
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

# Global seaborn styling for every figure in the notebook: white grid
# background, doubled font size, and a wide 20x7 default figure.
sns.set(
    style="whitegrid",
    font_scale=2,
    rc={"figure.figsize": (20, 7)},
)

# Load the data set; the path is relative to the notebook's working directory.
cars = pd.read_csv("cars.csv")

## Task 1

In [3]:
# Task 1: number of missing values in each column.
cars.isna().sum()
# answer - 0

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

## Task 2

In [4]:
# Task 2: the company is the first space-separated token of CarName.
# NOTE: this cell rebinds `cars` and drops CarName, so running it a second
# time fails (CarName no longer exists) -- re-run from the load cell instead.
company_from_name = cars.CarName.str.split(" ").str[0]

cars = (
    cars
    .assign(company=company_from_name)
    .drop(columns=["CarName", "car_ID"])
)

# Number of distinct company spellings before any clean-up.
cars.company.nunique()
# answer 28

28

## Task 3

In [5]:
def change_company_names(car_names):
    """Lower-case company names and correct the known misspellings.

    Parameters
    ----------
    car_names : pandas.Series of str
        Raw company names.

    Returns
    -------
    pandas.Series
        A new Series with every value lower-cased and every occurrence of a
        known misspelling replaced by the canonical spelling.
    """
    # Applied in this order; each entry is replaced wherever it occurs
    # inside a value, not only when it equals the whole value.
    corrections = {
        'porcshce': 'porsche',
        'maxda': 'mazda',
        'toyouta': 'toyota',
        'vokswagen': 'volkswagen',
        'vw': 'volkswagen',
    }

    fixed = car_names.str.lower()
    for misspelled, canonical in corrections.items():
        fixed = fixed.str.replace(misspelled, canonical)
    return fixed

# Task 3: normalise the company spellings, then count the distinct companies.
cars = cars.assign(company=change_company_names(cars.company))

cars["company"].nunique()
# answer = 22

22

## Task 4

In [6]:
# Task 4: Pearson correlation of each numeric predictor with price.
# The selection below mixes numeric and string columns. DataFrame.corr() only
# dropped the string columns implicitly (with a deprecation warning in
# pandas 1.5) and raises on them from pandas 2.0 on, so the numeric columns
# are selected explicitly before correlating. select_dtypes works on every
# pandas version, unlike the numeric_only= argument added in 1.5.
(
    cars
    [['price', 'company', 'fueltype', 'aspiration','carbody', 'drivewheel', 'wheelbase', 'carlength',
      'carwidth', 'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio','horsepower']]
    .select_dtypes(include='number')
    .corr()
    .price
)

# answer - 0.81

  cars


price         1.000000
wheelbase     0.577816
carlength     0.682920
carwidth      0.759325
curbweight    0.835305
enginesize    0.874145
boreratio     0.553173
horsepower    0.808139
Name: price, dtype: float64

## Task 5

In [7]:
# Task 5: one-hot encode the categorical predictors, dropping the first level
# of each so the dummies are not collinear with the intercept.
model_columns = [
    'company', 'fueltype', 'aspiration', 'carbody', 'drivewheel', 'wheelbase',
    'carlength', 'carwidth', 'curbweight', 'enginetype', 'cylindernumber',
    'enginesize', 'boreratio', 'horsepower', 'price',
]

df_dummy = pd.get_dummies(
    data=cars[model_columns],
    drop_first=True,
    # pandas >= 2.0 emits bool dummies by default (uint8 before that). The
    # formula interface treats bool columns as categorical, which changes the
    # coefficient labels, so the dtype is pinned to keep 0/1 numeric columns
    # on every pandas version.
    dtype=int,
)
df_dummy.shape
# answer - 49


(205, 49)

## Task 6

In [8]:
# Task 6: single-predictor baseline -- price explained by horsepower alone.
simple_model = smf.ols(formula='price ~ horsepower', data=df_dummy)
simple_lm = simple_model.fit()
print(simple_lm.summary())
# answer - 65%

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.653
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     382.2
Date:                Wed, 28 Dec 2022   Prob (F-statistic):           1.48e-48
Time:                        14:09:53   Log-Likelihood:                -2024.0
No. Observations:                 205   AIC:                             4052.
Df Residuals:                     203   BIC:                             4059.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -3721.7615    929.849     -4.003      0.0

## Task 7

In [10]:
# Task 7a: regress price on every other column of the dummy-encoded frame.
predictor_names = [column for column in df_dummy.columns if column != 'price']
independent_variables = '+'.join(predictor_names)

full_lm = smf.ols(formula=f'price ~ {independent_variables}', data=df_dummy).fit()
print(full_lm.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     81.09
Date:                Wed, 28 Dec 2022   Prob (F-statistic):           4.86e-89
Time:                        14:10:05   Log-Likelihood:                -1804.2
No. Observations:                 205   AIC:                             3702.
Df Residuals:                     158   BIC:                             3858.
Df Model:                          46                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept             -3.472e+

In [205]:
# Task 7b: the same regression without the company dummies.
# Columns whose name contains 'company' are excluded; `no_company` still
# includes 'price', which is removed separately when building the formula.
is_company_column = df_dummy.columns.str.contains('company')
no_company = df_dummy.columns[~is_company_column]

predictors_no_company = [column for column in no_company if column != 'price']
independent_variables_no_company = '+'.join(predictors_no_company)

no_company_lm = smf.ols(
    formula=f'price ~ {independent_variables_no_company}',
    data=df_dummy,
).fit()
print(no_company_lm.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.901
Method:                 Least Squares   F-statistic:                     72.32
Date:                Fri, 23 Dec 2022   Prob (F-statistic):           9.86e-81
Time:                        19:15:59   Log-Likelihood:                -1881.6
No. Observations:                 205   AIC:                             3817.
Df Residuals:                     178   BIC:                             3907.
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept               -1.7e+

In [200]:
# answer:
# - Most of the coefficients associated with the car makes are statistically insignificant
# - Judging purely by diagnostic metrics (such as R2), the model with all predictors is the best

## Task 8

In [207]:
# answer:
# The selected model explains about 90% of the variance (rounded to a whole number).
# Among the predictors, 10 of 27 turned out to be non-significant (p > 0.05).
# Interpretation example: for a one-unit change in horsepower, the price INCREASES by 86.8164 (unrounded).