In [8]:
%config IPCompleter.greedy=True

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

#### Carregando os dados

In [10]:
def convert_prices(df): 
    for x in range(100): # loop de ,00 ,01 ,02...até 0,99 centavos
        if x >= 10:
            str_replace = ',' + str(x)
        else:    
            str_replace = ',0' + str(x)
        df['preco'] = df.preco.replace({str_replace:''}, regex=True)
    
    for i in range(len(df)) : 
        df['preco'][i] = df['preco'][i].replace('.', '')
    
    df['preco'] = df['preco'].astype(float)
    
    return df

In [11]:
def load_df():    
    df = pd.read_csv('df_final.csv', sep='\t')
    df.drop(df.columns[[0, 1, 2]], axis = 1, inplace = True) 
    df = convert_prices(df)
    return df

In [12]:
df = load_df()
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preco'][i] = df['preco'][i].replace('.', '')


Unnamed: 0,nome,ano,km,cor,cambio,portas,preco
0,Chevrolet Onix 1.0 LT (Flex) 2020,2020 / 2020,0.0,Laranja,manual,3.0,59890.0
1,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2019 / 2019,41.07,Branco,manual,2.0,45900.0
2,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,48.654,Prata,manual,4.0,45900.0
3,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,19.0,Prata,manual,2.0,45990.0
4,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...,2018 / 2019,24.715,Branco,manual,2.0,45990.0


#### Verificando features com dados faltantes

In [13]:
[col for col in df.columns if df[col].isnull().any()]

['cambio', 'portas']

#### Selecionando as features

In [14]:
cars_features = ['km', 'cambio', 'portas', 'cor', 'ano', 'nome']
X = df[cars_features]

#### Analisando os dados da feature Cambio

In [15]:
print('Cambio valores distintos:', X.cambio.unique())
print('Valores faltantes: ', X.cambio.isna().sum())
print('\nCambio totais por tipo:')
print(X.cambio.value_counts())

Cambio valores distintos: ['manual' 'automático' nan]
Valores faltantes:  11

Cambio totais por tipo:
manual        1334
automático     366
Name: cambio, dtype: int64


#### Analisando os dados da feature Portas

In [16]:
print('Portas valores distintos:', X.portas.unique())
print('Valores faltantes: ', X.portas.isna().sum())
print('\nPortas totais por tipo:')
print(X.portas.value_counts())

Portas valores distintos: [ 3.  2.  4. nan  5.]
Valores faltantes:  225

Portas totais por tipo:
4.0    932
5.0    346
2.0    163
3.0     45
Name: portas, dtype: int64


### Inputando dados nas features com dados faltantes

In [17]:
X['cambio'] = X['cambio'].fillna(method='ffill')
X['portas'] = X['portas'].fillna(method='ffill')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cambio'] = X['cambio'].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['portas'] = X['portas'].fillna(method='ffill')


#### Verificando se ficou alguma feature com dados faltantes 

In [18]:
[col for col in X.columns if X[col].isnull().any()]

[]

### Preprocessando os dados nas features Cor e Ano 

#### Analisando os dados da feature Cor

In [19]:
print('Cores valores distintos:', X.cor.unique())
print('Valores faltantes: ', X.cor.isna().sum())
print('\Cores totais por tipo:')
print(X.cor.value_counts())

Cores valores distintos: ['Laranja' 'Branco' 'Prata' 'Azul' 'Vermelho' 'Preto' 'Cinza' 'Marrom'
 'Várias cores' 'Não informada.' 'manual' 'Vinho' 'Dourado' 'automático'
 'Verde' 'Bege' 'Bronze' 'Amarelo']
Valores faltantes:  0
\Cores totais por tipo:
Branco            710
Prata             262
Preto             257
Vermelho          144
Várias cores      112
Cinza             112
Não informada.     39
Laranja            21
Azul               16
Marrom             10
Verde               6
automático          6
manual              5
Dourado             3
Vinho               3
Bege                3
Amarelo             1
Bronze              1
Name: cor, dtype: int64


In [20]:
X["cor"].replace({"Várias cores": "Prata"}, inplace=True)
X["cor"].replace({"Não informada.": "Preto"}, inplace=True)
X["cor"].replace({"automático": "Preto"}, inplace=True)
X["cor"].replace({"manual": "Preto"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


#### Analisando os dados da feature Ano

In [21]:
print('Ano valores distintos:', X.ano.unique())
print('Valores faltantes: ', X.ano.isna().sum())
print('Anos totais por tipo:')
print(X.ano.value_counts())

Ano valores distintos: ['2020 / 2020' '2019 / 2019' '2018 / 2019' '2017 / 2018' '2018 / 2018'
 '2015 / 2016' '2016 / 2016' '2015 / 2015' '2014 / 2015' '2013 / 2014'
 '2013 / 2013' '2012 / 2013' '2012 / 2012' '2011 / 2011' '2020 / 2021'
 '2019 / 2020' '2016 / 2017' '2017 / 2017' '2011 / 2012' '2009 / 2010'
 '2009 / 2009' '2014 / 2014']
Valores faltantes:  0
Anos totais por tipo:
2019 / 2020    607
2020 / 2020    402
2020 / 2021    226
2018 / 2019    154
2019 / 2019     94
2017 / 2018     45
2018 / 2018     36
2016 / 2017     25
2015 / 2015     21
2014 / 2015     18
2016 / 2016     17
2015 / 2016     17
2013 / 2013     11
2017 / 2017     10
2013 / 2014      8
2014 / 2014      8
2012 / 2013      5
2009 / 2010      2
2011 / 2011      2
2011 / 2012      1
2012 / 2012      1
2009 / 2009      1
Name: ano, dtype: int64


In [22]:
X["ano"].replace({"2021 / 2021": "2021"}, inplace=True)
X["ano"].replace({"2020 / 2020": "2020"}, inplace=True)
X["ano"].replace({"2019 / 2019": "2019"}, inplace=True)
X["ano"].replace({"2018 / 2018": "2018"}, inplace=True)
X["ano"].replace({"2017 / 2017": "2017"}, inplace=True)
X["ano"].replace({"2016 / 2016": "2016"}, inplace=True)
X["ano"].replace({"2015 / 2015": "2015"}, inplace=True)
X["ano"].replace({"2014 / 2014": "2014"}, inplace=True)
X["ano"].replace({"2013 / 2013": "2013"}, inplace=True)
X["ano"].replace({"2012 / 2012": "2012"}, inplace=True)
X["ano"].replace({"2011 / 2011": "2011"}, inplace=True)
X["ano"].replace({"2010 / 2010": "2010"}, inplace=True)
X["ano"].replace({"2009 / 2009": "2009"}, inplace=True)

In [23]:
X["ano"].replace({"2020 / 2021": "2021"}, inplace=True)
X["ano"].replace({"2019 / 2020": "2020"}, inplace=True)
X["ano"].replace({"2018 / 2019": "2019"}, inplace=True)
X["ano"].replace({"2017 / 2018": "2018"}, inplace=True)
X["ano"].replace({"2016 / 2017": "2017"}, inplace=True)
X["ano"].replace({"2015 / 2016": "2016"}, inplace=True)
X["ano"].replace({"2014 / 2015": "2015"}, inplace=True)
X["ano"].replace({"2013 / 2014": "2014"}, inplace=True)
X["ano"].replace({"2012 / 2013": "2013"}, inplace=True)
X["ano"].replace({"2011 / 2012": "2012"}, inplace=True)
X["ano"].replace({"2010 / 2011": "2011"}, inplace=True)
X["ano"].replace({"2009 / 2010": "2010"}, inplace=True)

#### Analisando os dados após os tratamentos

In [24]:
print(X.cambio.value_counts())

manual        1338
automático     373
Name: cambio, dtype: int64


In [25]:
print(X.portas.value_counts())

4.0    1057
5.0     428
2.0     176
3.0      50
Name: portas, dtype: int64


In [26]:
print(X.cor.value_counts())

Branco      710
Prata       374
Preto       307
Vermelho    144
Cinza       112
Laranja      21
Azul         16
Marrom       10
Verde         6
Vinho         3
Bege          3
Dourado       3
Bronze        1
Amarelo       1
Name: cor, dtype: int64


In [27]:
print(X.ano.value_counts())

2020    1009
2019     248
2021     226
2018      81
2015      39
2017      35
2016      34
2014      16
2013      16
2010       2
2011       2
2012       2
2009       1
Name: ano, dtype: int64


### Aplicando técnica de LabelEncoder

In [28]:
X

Unnamed: 0,km,cambio,portas,cor,ano,nome
0,0.000,manual,3.0,Laranja,2020,Chevrolet Onix 1.0 LT (Flex) 2020
1,41.070,manual,2.0,Branco,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...
2,48.654,manual,4.0,Prata,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...
3,19.000,manual,2.0,Prata,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...
4,24.715,manual,2.0,Branco,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...
...,...,...,...,...,...,...
1706,40.912,manual,2.0,Branco,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...
1707,35.271,manual,2.0,Branco,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Du...
1708,40.741,manual,2.0,Prata,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Si...
1709,40.870,manual,2.0,Prata,2019,Fiat Strada Hard Working 1.4 (Flex) (Cabine Du...


In [29]:
label_encoder = LabelEncoder()

In [30]:
y = df.preco

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [32]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1197, 6), (514, 6), (1197,), (514,))

In [33]:
label_X_train = X_train.copy()
label_X_test = X_test.copy()

In [34]:
label_X_train

Unnamed: 0,km,cambio,portas,cor,ano,nome
574,0.000,automático,4.0,Branco,2020,Hyundai HB20 1.0 T-GDI Evolution (Aut) 2020
1079,27.774,manual,4.0,Branco,2020,Renault Kwid 1.0 Zen 2020
1322,40.580,manual,4.0,Preto,2020,Fiat Argo 1.0 Drive 2020
654,48.720,manual,4.0,Branco,2016,Hyundai HB20 1.6 Comfort Style 2016
581,0.000,automático,5.0,Branco,2020,Hyundai HB20 1.0 T-GDI Evolution (Aut) 2020
...,...,...,...,...,...,...
835,42.900,manual,4.0,Preto,2019,Jeep Renegade 1.8 Sport 2019
1216,0.000,automático,5.0,Branco,2020,Chevrolet Onix 1.0 Turbo LT (Aut) 2020
1653,0.000,manual,4.0,Vermelho,2020,Fiat Strada 1.4 CD Freedom 2020
559,798.000,manual,4.0,Branco,2020,Hyundai HB20 1.0 Sense 2020


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
nome_train = X_train['nome']
nome_val = X_test['nome']

In [37]:
nome_train.shape, nome_val.shape

((1197,), (514,))

In [38]:
nome_vec = TfidfVectorizer(min_df=2, ngram_range=(1,3)) 

In [39]:
nome_train_tfidf_fit_transform = nome_vec.fit_transform(nome_train)
nome_bow_val = nome_vec.transform(nome_val)

In [40]:
nome_train_tfidf_fit_transform

<1197x482 sparse matrix of type '<class 'numpy.float64'>'
	with 13749 stored elements in Compressed Sparse Row format>

In [41]:
nome_train_tfidf_fit_transform.shape

(1197, 482)

In [42]:
from scipy.sparse import hstack, vstack

In [43]:
nome_train_tfidf_fit_transform.shape, X_train.shape

((1197, 482), (1197, 6))

In [44]:
cars_features_v2 = ['km', 'cambio', 'portas', 'cor', 'ano']
X_train_teste = X_train[cars_features_v2]
X_test_teste = X_test[cars_features_v2]

In [45]:
object_cols = ['cambio', 'cor', 'ano']

In [46]:
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train_teste[col])
    label_X_test[col] = label_encoder.transform(X_test_teste[col])

In [47]:
label_X_train

Unnamed: 0,km,cambio,portas,cor,ano,nome
574,0.000,0,4.0,3,11,Hyundai HB20 1.0 T-GDI Evolution (Aut) 2020
1079,27.774,1,4.0,3,11,Renault Kwid 1.0 Zen 2020
1322,40.580,1,4.0,10,11,Fiat Argo 1.0 Drive 2020
654,48.720,1,4.0,3,7,Hyundai HB20 1.6 Comfort Style 2016
581,0.000,0,5.0,3,11,Hyundai HB20 1.0 T-GDI Evolution (Aut) 2020
...,...,...,...,...,...,...
835,42.900,1,4.0,10,10,Jeep Renegade 1.8 Sport 2019
1216,0.000,0,5.0,3,11,Chevrolet Onix 1.0 Turbo LT (Aut) 2020
1653,0.000,1,4.0,12,11,Fiat Strada 1.4 CD Freedom 2020
559,798.000,1,4.0,3,11,Hyundai HB20 1.0 Sense 2020


In [48]:
label_X_train = label_X_train[['km', 'cambio', 'portas', 'cor', 'ano']]
label_X_test = label_X_test[['km', 'cambio', 'portas', 'cor', 'ano']]

In [50]:
Xtrain_wnome = hstack([label_X_train, nome_train_tfidf_fit_transform])
Xtest_wnome = hstack([label_X_test, nome_bow_val])

In [51]:
Xtrain_wnome.shape, Xtest_wnome.shape

((1197, 487), (514, 487))

In [52]:
label_X_train.shape

(1197, 5)

In [55]:
Xtest_wnome

<514x487 sparse matrix of type '<class 'numpy.float64'>'
	with 8178 stored elements in COOrdinate format>

In [54]:
rf = RandomForestRegressor(n_estimators=20, random_state=0)
rf.fit(Xtrain_wnome, y_train)
rf_pred = rf.predict(Xtest_wnome)

print('RF - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred))
print('RF - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, rf_pred))
print('RF - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))
rf.score(Xtest_wnome, y_test)

RF - MAE - Mean Absolute Error: 3719.9160671264535
RF - MSE - Mean Squared Error: 30292489.652076945
RF - RMSE - Root Mean Squared Error: 5503.861340193533


0.9011057542241044

#### RandomForestRegressor

In [210]:
rf = RandomForestRegressor(n_estimators=20, random_state=0)
rf.fit(Xtrain_wnome, y_train)
rf_pred = rf.predict(Xtest_wnome)

print('RF - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred))
print('RF - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, rf_pred))
print('RF - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))

RF - MAE - Mean Absolute Error: 3719.9160671264535
RF - MSE - Mean Squared Error: 30292489.652076945
RF - RMSE - Root Mean Squared Error: 5503.861340193533


In [212]:
rf.score(Xtest_wnome, y_test)

0.9011057542241044

In [213]:
'''
RandomForestRegressor melhorou de 0.6116710248281205 para 0.8968164577341587
'''

'\nRandomForestRegressor melhorou de 0.6116710248281205 para 0.8968164577341587\n'

#### LogisticRegression

In [214]:
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(Xtrain_wnome, y_train)
lr_pred = lr.predict(Xtest_wnome)

print('LR - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, lr_pred))
print('LR - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, lr_pred))
print('LR - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, lr_pred)))

LR - MAE - Mean Absolute Error: 5781.1206225680935
LR - MSE - Mean Squared Error: 90198584.3929961
LR - RMSE - Root Mean Squared Error: 9497.293529895562


In [215]:
lr.score(Xtest_wnome, y_test)

0.11478599221789883

In [216]:
'''
LogisticRegression melhorou de 0.07198443579766536 para 0.11284046692607004
'''

'\nLogisticRegression melhorou de 0.07198443579766536 para 0.11284046692607004\n'

#### TfidfVectorizer

In [217]:
dt = DecisionTreeRegressor(max_leaf_nodes=10, random_state=1)
dt.fit(Xtrain_wnome, y_train)
dt_pred = dt.predict(Xtest_wnome)

print('DT - MAE - Mean Absolute Error:', metrics.mean_absolute_error(y_test, dt_pred))
print('DT - MSE - Mean Squared Error:', metrics.mean_squared_error(y_test, dt_pred))
print('DT - RMSE - Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, dt_pred)))

DT - MAE - Mean Absolute Error: 6023.803419716494
DT - MSE - Mean Squared Error: 60973473.46557966
DT - RMSE - Root Mean Squared Error: 7808.551303896239


In [218]:
dt.score(Xtest_wnome, y_test)

0.8009432126585986

In [219]:
'''
DecisionTreeRegressor melhorou de 0.6055322556693861 para 0.7916679342714491
'''

'\nDecisionTreeRegressor melhorou de 0.6055322556693861 para 0.7916679342714491\n'

In [220]:
import joblib as jb

In [221]:
jb.dump(dt, "decision_tree_m8.pkl.z")

['decision_tree_m8.pkl.z']

In [222]:
jb.dump(rf, "random_forest_m8.pkl.z")

['random_forest_m8.pkl.z']

In [223]:
jb.dump(nome_vec, "nome_vectorizer_m8.pkl.z")

['nome_vectorizer_m8.pkl.z']