In [221]:
%config IPCompleter.greedy=True

In [222]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

#### Carregando os dados

In [223]:
def convert_prices(df): 
    for x in range(100): # loop de ,00 ,01 ,02...até 0,99 centavos
        if x >= 10:
            str_replace = ',' + str(x)
        else:    
            str_replace = ',0' + str(x)
        df['preco'] = df.preco.replace({str_replace:''}, regex=True)
    
    for i in range(len(df)) : 
        df['preco'][i] = df['preco'][i].replace('.', '')
    
    df['preco'] = df['preco'].astype(float)
    
    return df

In [224]:
def load_df():    
    df = pd.read_csv('df_final.csv', sep='\t')
    df.drop(df.columns[[0, 1, 2]], axis = 1, inplace = True) 
    df = convert_prices(df)
    return df

In [225]:
df = load_df()
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preco'][i] = df['preco'][i].replace('.', '')


Unnamed: 0,nome,ano,km,cor,cambio,portas,preco
0,Chevrolet Onix 1.0 LT (Flex) 2020,2020 / 2020,0.0,Laranja,manual,3.0,59890.0
1,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2019 / 2019,41.07,Branco,manual,2.0,45900.0
2,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,48.654,Prata,manual,4.0,45900.0
3,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,19.0,Prata,manual,2.0,45990.0
4,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,24.715,Branco,manual,2.0,45990.0


#### Verificando features com dados faltantes

In [226]:
[col for col in df.columns if df[col].isnull().any()]

['cambio', 'portas']

In [227]:
cars_features = ['km', 'cambio', 'portas', 'cor']
X = df[cars_features]

In [228]:
print('Cambio valores distintos:', X.cambio.unique())
print('Valores faltantes: ', X.cambio.isna().sum())
print('\nCambio totais por tipo:')
print(X.cambio.value_counts())

Cambio valores distintos: ['manual' 'automático' nan]
Valores faltantes:  11

Cambio totais por tipo:
manual        1334
automático     366
Name: cambio, dtype: int64


In [229]:
print('Portas valores distintos:', X.portas.unique())
print('Valores faltantes: ', X.portas.isna().sum())
print('\nPortas totais por tipo:')
print(X.portas.value_counts())

Portas valores distintos: [ 3.  2.  4. nan  5.]
Valores faltantes:  225

Portas totais por tipo:
4.0    932
5.0    346
2.0    163
3.0     45
Name: portas, dtype: int64


In [230]:
print('Portas valores distintos:', X.cor.unique())
print('Valores faltantes: ', X.cor.isna().sum())
print('\Cores totais por tipo:')
print(X.cor.value_counts())

Portas valores distintos: ['Laranja' 'Branco' 'Prata' 'Azul' 'Vermelho' 'Preto' 'Cinza' 'Marrom'
 'Várias cores' 'Não informada.' 'manual' 'Vinho' 'Dourado' 'automático'
 'Verde' 'Bege' 'Bronze' 'Amarelo']
Valores faltantes:  0
\Cores totais por tipo:
Branco            710
Prata             262
Preto             257
Vermelho          144
Cinza             112
Várias cores      112
Não informada.     39
Laranja            21
Azul               16
Marrom             10
automático          6
Verde               6
manual              5
Vinho               3
Bege                3
Dourado             3
Amarelo             1
Bronze              1
Name: cor, dtype: int64


In [231]:
X["cor"].replace({"Várias cores": "Prata"}, inplace=True)
X["cor"].replace({"Não informada.": "Preto"}, inplace=True)
X["cor"].replace({"automático": "Preto"}, inplace=True)
X["cor"].replace({"manual": "Preto"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [232]:
X['cambio'] = X['cambio'].fillna(method='ffill')
X['portas'] = X['portas'].fillna(method='ffill')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cambio'] = X['cambio'].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['portas'] = X['portas'].fillna(method='ffill')


In [233]:
print(X.cambio.value_counts())

manual        1338
automático     373
Name: cambio, dtype: int64


In [234]:
print(X.portas.value_counts())

4.0    1057
5.0     428
2.0     176
3.0      50
Name: portas, dtype: int64


In [235]:
print(X.cor.value_counts())

Branco      710
Prata       374
Preto       307
Vermelho    144
Cinza       112
Laranja      21
Azul         16
Marrom       10
Verde         6
Vinho         3
Bege          3
Dourado       3
Amarelo       1
Bronze        1
Name: cor, dtype: int64


#### Verificando se ficou alguma feature com dados faltantes 

In [236]:
[col for col in X.columns if X[col].isnull().any()]

[]

### Aplicando técnica de LabelEncoder

In [248]:
label_encoder = LabelEncoder()

In [249]:
s = (X.dtypes == 'object')
object_cols = list(s[s].index)
object_cols

['cambio', 'cor']

In [250]:
y = df.preco

In [251]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [252]:
label_X_train = X_train.copy()
label_X_test = X_test.copy()

In [253]:
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_test[col] = label_encoder.transform(X_test[col])

#### RandomForestRegressor

In [254]:
rf = RandomForestRegressor(n_estimators=20, random_state=0)
rf.fit(label_X_train, y_train)
rf_pred = rf.predict(label_X_test)

print('RF - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred))
print('RF - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, rf_pred))
print('RF - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))

RF - MAE - Mean Absolute Error: 8148.619942051471
RF - MSE - Mean Squared Error: 135113973.35791183
RF - RMSE - Root Mean Squared Error: 11623.853636290842


#### LogisticRegression

In [255]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(label_X_train, y_train)
lr_pred = lr.predict(label_X_test)

print('LR - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, lr_pred))
print('LR - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, lr_pred))
print('LR - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, lr_pred)))

LR - MAE - Mean Absolute Error: 10972.243190661478
LR - MSE - Mean Squared Error: 273957396.040856
LR - RMSE - Root Mean Squared Error: 16551.658407569194


#### DecisionTreeRegressor

In [256]:
dt = DecisionTreeRegressor(max_leaf_nodes=10, random_state=1)
dt.fit(label_X_train, y_train)
dt_pred = dt.predict(label_X_test)

print('DT - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, dt_pred))
print('DT - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, dt_pred))
print('DT - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, dt_pred)))

DT - MAE - Mean Absolute Error: 8258.985343189413
DT - MSE - Mean Squared Error: 135468786.18621707
DT - RMSE - Root Mean Squared Error: 11639.105901495057


In [257]:
dt = DecisionTreeRegressor(max_leaf_nodes=10, random_state=1)
dt.fit(label_X_train, y_train)
dt_pred = dt.predict(label_X_test)

print('DT - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, dt_pred))
print('DT - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, dt_pred))
print('DT - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, dt_pred)))

DT - MAE - Mean Absolute Error: 8258.985343189413
DT - MSE - Mean Squared Error: 135468786.18621707
DT - RMSE - Root Mean Squared Error: 11639.105901495057


### Aplicando técnica de OneHotEncoder

In [258]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [259]:
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols]))

In [260]:
OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

In [261]:
num_X_train = X_train.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)

In [262]:
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

In [263]:
OH_X_train

Unnamed: 0,km,portas,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
574,0.000,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1079,27.774,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1322,40.580,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
654,48.720,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581,0.000,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,42.900,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1216,0.000,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1653,0.000,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
559,798.000,4.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### RandomForestRegressor

In [85]:
rf = RandomForestRegressor(n_estimators=20, random_state=0)
rf.fit(OH_X_train, y_train)
rf_pred = rf.predict(OH_X_test)

print('RF - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred))
print('RF - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, rf_pred))
print('RF - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))

RF - MAE - Mean Absolute Error: 8165.19350382653
RF - MSE - Mean Squared Error: 134726483.19218746
RF - RMSE - Root Mean Squared Error: 11607.173781424463


#### LogisticRegression

In [265]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(OH_X_train, y_train)
lr_pred = lr.predict(OH_X_test)

print('LR - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, lr_pred))
print('LR - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, lr_pred))
print('LR - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, lr_pred)))

LR - MAE - Mean Absolute Error: 9203.231517509728
LR - MSE - Mean Squared Error: 207006929.48054475
LR - RMSE - Root Mean Squared Error: 14387.735384018735


#### DecisionTreeRegressor

In [220]:
dt = DecisionTreeRegressor(max_leaf_nodes=10, random_state=1)
dt.fit(OH_X_train, y_train)
dt_pred = dt.predict(OH_X_test)

print('DT - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, dt_pred))
print('DT - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, dt_pred))
print('DT - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, dt_pred)))

DT - MAE - Mean Absolute Error: 7876.225626974772
DT - MSE - Mean Squared Error: 130107112.37949692
DT - RMSE - Root Mean Squared Error: 11406.45047240801


### Comparação

In [None]:
'''
DecisionTreeRegressor com OneHotEncoder e com a feature COR foi o único que teve melhor resultado, comparando 
com o modelo 5 que possui três features KM, PORTAS e CAMBIO    
'''