In [219]:
%config IPCompleter.greedy=True

In [220]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

#### Carregando os dados

In [221]:
def convert_prices(df): 
    for x in range(100): # loop de ,00 ,01 ,02...até 0,99 centavos
        if x >= 10:
            str_replace = ',' + str(x)
        else:    
            str_replace = ',0' + str(x)
        df['preco'] = df.preco.replace({str_replace:''}, regex=True)
    
    for i in range(len(df)) : 
        df['preco'][i] = df['preco'][i].replace('.', '')
    
    df['preco'] = df['preco'].astype(float)
    
    return df

In [222]:
def load_df():    
    df = pd.read_csv('df_final.csv', sep='\t')
    df.drop(df.columns[[0, 1, 2]], axis = 1, inplace = True) 
    df = convert_prices(df)
    return df

In [223]:
df = load_df()
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preco'][i] = df['preco'][i].replace('.', '')


Unnamed: 0,nome,ano,km,cor,cambio,portas,preco
0,Chevrolet Onix 1.0 LT (Flex) 2020,2020 / 2020,0.0,Laranja,manual,3.0,59890.0
1,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2019 / 2019,41.07,Branco,manual,2.0,45900.0
2,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,48.654,Prata,manual,4.0,45900.0
3,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,19.0,Prata,manual,2.0,45990.0
4,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,24.715,Branco,manual,2.0,45990.0


#### Verificando features com dados faltantes

In [225]:
[col for col in df.columns if df[col].isnull().any()]

['cambio', 'portas']

In [226]:
cars_features = ['km', 'cambio', 'portas']
X = df[cars_features]

In [227]:
print('Cambio valores distintos:', X.cambio.unique())
print('Valores faltantes: ', X.cambio.isna().sum())
print('\nCambio totais por tipo:')
print(X.cambio.value_counts())

Cambio valores distintos: ['manual' 'automático' nan]
Valores faltantes:  11

Cambio totais por tipo:
manual        1334
automático     366
Name: cambio, dtype: int64


In [228]:
print('Portas valores distintos:', X.portas.unique())
print('Valores faltantes: ', X.portas.isna().sum())
print('\nPortas totais por tipo:')
print(X.portas.value_counts())

Portas valores distintos: [ 3.  2.  4. nan  5.]
Valores faltantes:  225

Portas totais por tipo:
4.0    932
5.0    346
2.0    163
3.0     45
Name: portas, dtype: int64


In [229]:
X['cambio'] = X['cambio'].fillna(method='ffill')
X['portas'] = X['portas'].fillna(method='ffill')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cambio'] = X['cambio'].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['portas'] = X['portas'].fillna(method='ffill')


In [230]:
print(X.cambio.value_counts())
print(X.portas.value_counts())

manual        1338
automático     373
Name: cambio, dtype: int64
4.0    1057
5.0     428
2.0     176
3.0      50
Name: portas, dtype: int64


#### Verificando se ficou alguma feature com dados faltantes 

In [231]:
[col for col in X.columns if X[col].isnull().any()]

[]

### Aplicando técnica de LabelEncoder

In [233]:
label_encoder = LabelEncoder()

In [242]:
s = (X.dtypes == 'object')
object_cols = list(s[s].index)
object_cols

['cambio']

In [244]:
y = df.preco

In [245]:
X_train, X_test, y_train, y_test = train_test_split(label_X, y, test_size=0.3, random_state=0)

In [246]:
label_X_train = X_train.copy()
label_X_test = X_test.copy()

In [247]:
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_test[col] = label_encoder.transform(X_test[col])

#### RandomForestRegressor

In [248]:
rf = RandomForestRegressor(n_estimators=20, random_state=0)
rf.fit(label_X_train, y_train)
rf_pred = rf.predict(label_X_test)

print('RF - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred))
print('RF - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, rf_pred))
print('RF - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))

RF - MAE - Mean Absolute Error: 8158.186043340672
RF - MSE - Mean Squared Error: 134516356.12695876
RF - RMSE - Root Mean Squared Error: 11598.118646011462


#### LogisticRegression

In [249]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(label_X_train, y_train)
lr_pred = lr.predict(label_X_test)

print('LR - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, lr_pred))
print('LR - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, lr_pred))
print('LR - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, lr_pred)))

LR - MAE - Mean Absolute Error: 10073.496108949417
LR - MSE - Mean Squared Error: 255274051.00583658
LR - RMSE - Root Mean Squared Error: 15977.297988265618


#### DecisionTreeRegressor

In [250]:
dt = DecisionTreeRegressor(max_leaf_nodes=10, random_state=1)
dt.fit(label_X_train, y_train)
dt_pred = dt.predict(label_X_test)

print('DT - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, dt_pred))
print('DT - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, dt_pred))
print('DT - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, dt_pred)))

DT - MAE - Mean Absolute Error: 7967.64794020087
DT - MSE - Mean Squared Error: 132848193.70879717
DT - RMSE - Root Mean Squared Error: 11525.979078099925


### Aplicando técnica de OneHotEncoder

In [251]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [252]:
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]))

In [253]:
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

In [254]:
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

In [255]:
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

#### RandomForestRegressor

In [256]:
rf = RandomForestRegressor(n_estimators=20, random_state=0)
rf.fit(OH_X_train, y_train)
rf_pred = rf.predict(OH_X_test)

print('RF - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred))
print('RF - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, rf_pred))
print('RF - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))

RF - MAE - Mean Absolute Error: 8170.9200859801
RF - MSE - Mean Squared Error: 134697550.2691223
RF - RMSE - Root Mean Squared Error: 11605.927376522839


#### LogisticRegression

In [257]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(OH_X_train, y_train)
lr_pred = lr.predict(OH_X_test)

print('LR - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, lr_pred))
print('LR - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, lr_pred))
print('LR - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, lr_pred)))

LR - MAE - Mean Absolute Error: 8705.184824902724
LR - MSE - Mean Squared Error: 164088620.64007783
LR - RMSE - Root Mean Squared Error: 12809.708062250202


#### DecisionTreeRegressor

In [258]:
dt = DecisionTreeRegressor(max_leaf_nodes=10, random_state=1)
dt.fit(OH_X_train, y_train)
dt_pred = dt.predict(OH_X_test)

print('DT - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, dt_pred))
print('DT - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, dt_pred))
print('DT - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, dt_pred)))

DT - MAE - Mean Absolute Error: 7967.64794020087
DT - MSE - Mean Squared Error: 132848193.70879717
DT - RMSE - Root Mean Squared Error: 11525.979078099925


### Comparação

In [None]:
'''
Modelo1 - com duas features (km e portas)       modelo2 - com três features (km, cambio portas) 
Sem nenhum tratamento ------------------------  LabelEncoder  ---------------------------------------  OneHotEncoder
'''

In [None]:
'''
RandomForestRegressor
MAE  : 12130.13714621856 ---------------------  MAE  : 8158.186043340672 ----------------------------  MAE  : 8170.9200859801
MSE  : 289786666.3864076 ---------------------  MSE  : 134516356.12695876 ---------------------------  MSE  : 134697550.2691223
RMSE : 17023.121522987716 --------------------  RMSE : 11598.118646011462 ---------------------------  RMSE : 11605.927376522839

LogisticRegression
MAE  : 13086.900874635568 --------------------  MAE  : 10073.496108949417 ---------------------------  MAE  : 8705.184824902724
MSE  : 384705427.8746356 ---------------------  MSE  : 255274051.00583658 ---------------------------  MSE  : 164088620.64007783
RMSE : 19613.90904115331 ---------------------  RMSE : 15977.297988265618 ---------------------------  RMSE : 12809.708062250202

DecisionTreeRegressor
MAE  : 11862.036106689564 --------------------  MAE  : 7967.64794020087 -----------------------------  MAE  : 7967.64794020087
MSE  : 276756377.7213516 ---------------------  MSE  : 132848193.70879717 ---------------------------  MSE  : 132848193.70879717
RMSE : 16635.996445099154 --------------------  RMSE : 11525.979078099925 ---------------------------  RMSE : 11525.979078099925
'''

In [None]:
'''
DecisionTreeRegressor apresentou melhores resultados em geral
LogisticRegression apresentou melhora significativa aplicando OneHotEncoder
RandomForestRegressor em comaração a DT ficou pior, talvez alterando o n_estimators ou algum outro parâmetro possa melhorar em relação a DT
'''