In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the CSV file into a DataFrame
# NOTE(review): the path looks like a Google Drive mount inside Colab —
# confirm the drive is mounted before this cell runs.
df = pd.read_csv('/content/drive/MyDrive/MD/aulas/regressao_linear/car_price_2.csv')

# Identificando outliers

In [None]:
# Identify outliers with the IQR (interquartile range) rule
def identificar_outliers_iqr(df, fator=1.5):
    """Return the outlying values of every numeric column of ``df``.

    A value is flagged when it lies strictly below ``Q1 - fator * IQR`` or
    strictly above ``Q3 + fator * IQR``, where ``Q1`` and ``Q3`` are the
    column's 0.25 and 0.75 quantiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only columns selected by
        ``select_dtypes(include=[np.number])`` are analysed.
    fator : float, default 1.5
        Multiplier applied to the IQR when building the lower and upper
        fences. The default reproduces the previously hard-coded value.

    Returns
    -------
    dict
        Maps each numeric column name to a Series with the flagged values
        (original index labels preserved). The Series is empty when no
        value falls outside the fences.
    """
    outliers = {}
    for col in df.select_dtypes(include=[np.number]).columns:
        serie = df[col]  # look the column up once instead of on every use
        q1 = serie.quantile(0.25)
        q3 = serie.quantile(0.75)
        iqr = q3 - q1
        lim_inf = q1 - fator * iqr
        lim_sup = q3 + fator * iqr
        outliers[col] = serie[(serie < lim_inf) | (serie > lim_sup)]
    return outliers

# Run the IQR detection over the loaded DataFrame
outliers_iqr = identificar_outliers_iqr(df)

# Report, column by column, which values were flagged
for col, outliers in outliers_iqr.items():
    if outliers.empty:
        # Nothing flagged for this column: one-line notice, then move on
        print(f'Nenhum outlier encontrado na coluna {col}')
        continue
    print(f'Outliers na coluna {col}:')
    print(outliers)
    print()

Nenhum outlier encontrado na coluna car_ID
Outliers na coluna wheelbase:
69    115.6
70    115.6
72    120.9
Name: wheelbase, dtype: float64

Nenhum outlier encontrado na coluna carlength
Outliers na coluna carwidth:
6      71.4
7      71.4
8      71.4
16     70.9
48     70.6
69     71.7
70     71.7
72     71.7
73     72.0
128    72.3
Name: carwidth, dtype: float64

Nenhum outlier encontrado na coluna carheight
Nenhum outlier encontrado na coluna curbweight
Outliers na coluna enginesize:
46    258
47    258
48    326
70    234
71    234
72    308
73    304
Name: enginesize, dtype: int64

Nenhum outlier encontrado na coluna boreratio
Outliers na coluna stroke:
28     3.90
46     4.17
47     4.17
110    2.19
112    2.19
129    3.90
130    3.90
133    2.07
137    2.36
138    2.64
139    2.64
140    2.64
141    2.64
142    2.64
143    2.64
144    2.64
145    2.64
146    2.64
147    2.64
148    2.64
Name: stroke, dtype: float64

Outliers na coluna compressionratio:
28      7.0
48     11.5
6

In [None]:
# Show the DataFrame as this cell's rich output
df

Unnamed: 0,car_ID,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,201,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,188.8,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
200,202,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
201,203,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,188.8,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
202,204,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [None]:
# Inspect the first rows of the DataFrame.
# A bare last expression uses the notebook's rich table display; print() falls
# back to a plain-text dump that wraps wide frames across several chunks.
df.head()

   car_ID                   CarName fueltype aspiration doornumber  \
0       1        alfa-romero giulia      gas        std        two   
1       2       alfa-romero stelvio      gas        std        two   
2       3  alfa-romero Quadrifoglio      gas        std        two   
3       4               audi 100 ls      gas        std       four   
4       5                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  carlength  ...  \
0  convertible        rwd          front       88.6      168.8  ...   
1  convertible        rwd          front       88.6      168.8  ...   
2    hatchback        rwd          front       94.5      171.2  ...   
3        sedan        fwd          front       99.8      176.6  ...   
4        sedan        4wd          front       99.4      176.6  ...   

   enginesize  fuelsystem  boreratio stroke compressionratio  horsepower  \
0         130        mpfi       3.47   2.68              9.0         111   


In [None]:
# Split the frame into the target (y) and the predictor columns (X)
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Split the data into training and test sets: 20% of the rows are held out
# for testing, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Column names handed to each branch of the preprocessing step.
# NOTE(review): 'CarName' looks close to unique per row, so one-hot encoding it
# may add many near-empty columns — confirm it is meant to be a feature.
categorical_features = [
    'CarName',
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
]
numeric_features = [
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
    'peakrpm',
    'citympg',
    'highwaympg',
]

In [None]:
# Preprocessing for the categorical features: one-hot encoding.
# handle_unknown='ignore' makes transform() tolerate categories that were not
# present during fit instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Preprocessing for the numeric features: a StandardScaler with default settings
numeric_transformer = StandardScaler()

In [None]:
# Route each group of columns to its transformer: the one-hot encoder for the
# categorical features and the scaler for the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)
    ],
    # Disabled option: remainder='passthrough' would forward the columns not
    # listed above unchanged; as written, `remainder` keeps its default.
    # NOTE(review): force_int_remainder_cols is only accepted by some
    # scikit-learn releases — confirm against the installed version.
    force_int_remainder_cols=False
)

In [None]:
# Criar um pipeline com pré-processamento e modelo de regressão linear
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# Train the model: fit the whole pipeline on the training split
model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Compute the mean squared error (MSE) between the true and predicted test
# prices and print it with two decimal places
mse = mean_squared_error(y_test, y_pred)
print(f'Erro Quadrático Médio: {mse:.2f}')

Erro Quadrático Médio: 30646077.62


In [None]:
# Compute the root mean squared error (RMSE) on the same predictions and
# print it with two decimal places
rmse = root_mean_squared_error(y_test, y_pred)
print(f'Raiz do Erro Quadrático Médio: {rmse:.2f}')

Raiz do Erro Quadrático Médio: 5535.89


In [None]:
# Estimate the price of a new car
def estimar_valor(novo_automovel, modelo=None):
    """Predict the price of a single car described by a mapping of features.

    Parameters
    ----------
    novo_automovel : dict
        Maps feature (column) names to the values of the car to be priced.
    modelo : object with a ``predict`` method, optional
        Estimator used for the prediction. When omitted, the notebook-level
        ``model`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    The first (and only) element of the array returned by ``predict``.
    """
    if modelo is None:
        # Fall back to the pipeline defined at notebook level
        modelo = model
    # Wrapping the mapping in a list makes pandas build a one-row DataFrame
    df_novo = pd.DataFrame([novo_automovel])
    valor_estimado = modelo.predict(df_novo)
    return valor_estimado[0]

In [None]:
# Example input: one car described by the feature columns used by the model
novo_automovel = {
    'CarName' : 'audi 100 ls',
    'fueltype' : 'gas',
    'aspiration' : 'turbo',
    'doornumber' : 'four',
    'carbody' : 'sedan',
    # Fixed: the previous value '4fwd' matches none of the drivewheel values
    # shown in the data ('rwd', 'fwd', '4wd'); because the encoder ignores
    # unknown categories, it was silently treated as "no category".
    # NOTE(review): '4wd' is assumed to be the intended value — confirm
    # (the typo could also have meant 'fwd').
    'drivewheel' : '4wd',
    'enginelocation' : 'front',
    'wheelbase' : 101.2,
    'carlength' : 152,
    'carwidth' : 55.1,
    'carheight': 48.8,
    'curbweight': 3000,
    'enginetype': 'ohc',
    'cylindernumber': 'four',
    'enginesize': 120,
    'fuelsystem': 'mpfi',
    'boreratio': 3.20,
    'stroke': 2.50,
    'compressionratio': 8,
    'horsepower': 111,
    'peakrpm': 5400,
    'citympg': 21,
    'highwaympg' : 28
}

In [None]:
# Estimate the example car's price and print it with two decimal places
valor_estimado = estimar_valor(novo_automovel)
print(f'Valor estimado: R$ {valor_estimado:.2f}')

Valor estimado: R$ 32480.84
