In [1]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy.stats import zscore
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score


In [2]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [3]:
# Desafio Bônus # 
######################
#Proponha um modelo para determinar a quantidade de leads que um anúncio iráreceber.
########################

# Para o desenvolvimento da parte dois deste case, será utilizado um modelo de regressão
# para realizar a predição de quantos leads os anuncios receberão.

###########

# Primeiro passo: Limpeza dos dados.

###########
df = pd.read_csv('dados.csv', sep=',')

print(df)

#  Removendo linhas com valores nulos
df = df.dropna()
# plotando boxplot para analisar visualmente alguns valores que fogem da distribuição normal dos dados para os dados discretos

'''



df.boxplot(column=['cod_anuncio', 'cod_cliente', 'cod_tipo_pessoa', 'prioridade', 'leads',
       'views', 'cliques_telefone*', 'cod_marca_veiculo', 'cod_modelo_veiculo',
       'cod_versao_veiculo', 'ano_modelo', 'cep_2dig',
       'qtd_fotos', 'km_veiculo', 'vlr_mercado'], rot=90)
'''

       cod_anuncio  cod_cliente  cod_tipo_pessoa  prioridade  leads  views  \
0                2         7941                2           3      1      0   
1                6         1770                1           3      1      0   
2                8          709                1           3      4      0   
3               13         9708                1           2     12      0   
4               15         1363                1           2      8      0   
...            ...          ...              ...         ...    ...    ...   
48660       195918         3065                2           3      1  34184   
48661       195924        12535                2           3      1  45048   
48662       195930         2284                2           3      1  71020   
48663       195934         2284                2           3      0  82940   
48664       195939         2284                2           3      0  93608   

       cliques_telefone*  cod_marca_veiculo  cod_modelo_veiculo

"\n\n\n\ndf.boxplot(column=['cod_anuncio', 'cod_cliente', 'cod_tipo_pessoa', 'prioridade', 'leads',\n       'views', 'cliques_telefone*', 'cod_marca_veiculo', 'cod_modelo_veiculo',\n       'cod_versao_veiculo', 'ano_modelo', 'cep_2dig',\n       'qtd_fotos', 'km_veiculo', 'vlr_mercado'], rot=90)\n"

In [4]:
# é visto por meio do boxplot que na colunakm_veiculo temos valores de quilometragem muito alto e que não faze sentido,
#esses valores serão removidos
mask = (df['km_veiculo']> 250000)
df= df.loc[~mask]
'''
df.boxplot(column=['cod_anuncio', 'cod_cliente', 'cod_tipo_pessoa', 'prioridade', 'leads',
       'views', 'cliques_telefone*', 'cod_marca_veiculo', 'cod_modelo_veiculo',
       'cod_versao_veiculo', 'ano_modelo', 'cep_2dig',
       'qtd_fotos', 'km_veiculo', 'vlr_mercado'], rot=90)
'''
# aqui os boxplot fazem mais sentido, indicando a distribuição dos dados
# apenas para analise vou plotar um boxplot a respeito do valor de mercado
#df.boxplot(column=['vlr_mercado'], rot=90), onde é possivel observar que temos um veiculo que passa de 1milhão de reais

# apenas para analise vou plotar um boxplot a respeito do número de leads
#df.boxplot(column=['leads'], rot=90)

# repplace valores iguais a -1 com 0
df = df.replace(-1, np.nan)
df = df.dropna()
df

Unnamed: 0,cod_anuncio,cod_cliente,cod_tipo_pessoa,prioridade,leads,views,cliques_telefone*,cod_marca_veiculo,cod_modelo_veiculo,cod_versao_veiculo,...,arquente,bancocouro,arcondic,abs,desembtras,travaeletr,vidroseletr,rodasliga,sensorchuva,sensorestacion
9,31,13771,1,2,16,0,8,2,1143,337123,...,N,N,N,N,N,S,S,S,N,N
16,55,15375,1,1,104,0,72,16,690,342914,...,S,S,S,S,S,S,S,S,N,N
25,95,16462,1,1,0,0,6,12,2722,342061,...,S,S,S,S,S,S,S,S,S,S
28,110,1987,1,1,0,0,2,2,1161,339241,...,S,N,S,S,S,S,S,N,N,N
30,117,18367,1,1,0,0,10,3,687,344077,...,S,N,S,N,S,S,S,S,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48602,37082,22723,1,2,52,52,35,2,670,337726,...,S,N,S,N,S,S,S,N,N,N
48609,39620,32376,1,1,56,56,53,17,2883,341540,...,S,S,S,S,S,S,S,S,S,S
48611,39761,14595,1,3,56,56,20,8,654,345546,...,S,S,S,S,S,S,S,S,S,S
48644,43573,24368,1,1,64,64,64,5,2966,343235,...,S,S,S,S,S,S,S,S,S,S


In [1]:
# criando um dataframe com os dados limpos

df_cleaned = df

# aqui vou criar dois dataframes, ou seja, dois datasets para criar o modelo.
# um dataset que carregara todos os atributos da base de dados (df_all), 
#e outro apenas com os atributos que foram selecionados para análise (df_sel)
df_all = df_cleaned
df_sel = df_cleaned[['leads', 'cod_marca_veiculo', 'cod_modelo_veiculo', 'ano_modelo', 'uf', 'cidade', 'km_veiculo']]

# Pré processamento dos dados
# Para podermos dizer se um anúncio receberá ou não leads, devemos considerar esse um problema de classificação binário
# , ouseja, um problema de sim ou não (1 e 0)

# Primeiro passo aqui é reformular a base de dados para o problema.
# Primeiramente vamos normalizar/transformar as variáveis categóricas em números discretos

df_all_cat = MultiColumnLabelEncoder(columns = df_all.columns).fit_transform(df_all)
df_sel_cat = MultiColumnLabelEncoder(columns = df_sel.columns).fit_transform(df_sel)
type(df_all_cat.values)

NameError: name 'df' is not defined

In [6]:
#Criando dataframes para rodar os modelos
df_all_cat_target =df_all_cat[['leads']]

df_all_cat_input = df_all_cat[['cod_anuncio', 'cod_cliente', 'cod_tipo_pessoa', 'prioridade',
       'views', 'cliques_telefone*', 'cod_marca_veiculo', 'cod_modelo_veiculo',
       'cod_versao_veiculo', 'ano_modelo', 'cep_2dig', 'uf', 'cidade',
       'qtd_fotos', 'km_veiculo', 'vlr_mercado', 'flg_unico_dono',
       'flg_licenciado', 'flg_ipva_pago', 'flg_todas_revisoes_concessionaria',
       'flg_todas_revisoes_agenda_veiculo', 'flg_garantia_fabrica',
       'flg_blindado', 'flg_aceita_troca', 'flg_adaptado_pcd', 'combustivel',
       'cambio', 'portas', 'alarme', 'airbag', 'arquente', 'bancocouro',
       'arcondic', 'abs', 'desembtras', 'travaeletr', 'vidroseletr',
       'rodasliga', 'sensorchuva', 'sensorestacion']]

In [7]:
# normalizando o dataset de entrada
df_all_cat_input_norm = zscore(df_all_cat_input)
df_all_cat_input_norm

array([[-1.73200936,  1.22403103, -2.56851567, ...,  0.89654829,
        -0.29937698, -0.62702345],
       [-1.73192645,  1.31946924, -2.56851567, ...,  0.89654829,
        -0.29937698, -0.62702345],
       [-1.73184355,  1.39090152, -2.56851567, ...,  0.89654829,
         3.34027016,  1.59483669],
       ...,
       [-1.10716949,  1.27409218, -2.56851567, ...,  0.89654829,
         3.34027016,  1.59483669],
       [-1.04275386,  1.83530396, -2.56851567, ...,  0.89654829,
         3.34027016,  1.59483669],
       [-0.65103721, -1.37768471, -2.56851567, ...,  0.89654829,
        -0.29937698, -0.62702345]])

In [21]:

# Agora com os dados limpos e pré processados é possível criar os modelos

y = df_all_cat_target
X = df_all_cat_input_norm

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=66)


regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]


head = 10
for model in regressors[:head]:
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start
    start = time()
    y_pred = model.predict(X_test)
    
    predict_time = time()-start    
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
    print("\tR2 score:", r2_score(y_test, y_pred))
    print()
    plt.scatter(y_test, y_pred, color = 'steelblue')
    #plt.plot(0,50, color='tomato', lw=2)
    plt.title('Real vs Predito')
    plt.xlabel('Real')
    plt.ylabel('Predito')
    r = r2_score(y_test, y_pred)
    plt.legend(title='R² = {:.2}'.format(r),fontsize=9, loc='upper left')   
    plt.savefig(f'all_{model}.png')
    plt.close()

No handles with labels found to put in legend.


KNeighborsRegressor()
	Training time: 1.871s
	Prediction time: 35.723s
	Explained variance: 0.5732573090376597
	Mean absolute error: 1.0973293768545993
	R2 score: 0.5719257736718455



  return f(**kwargs)
No handles with labels found to put in legend.


GradientBoostingRegressor()
	Training time: 10.518s
	Prediction time: 0.021s
	Explained variance: 0.7215911669299631
	Mean absolute error: 0.8991457656860169
	R2 score: 0.7214180325212383



No handles with labels found to put in legend.


KNeighborsRegressor()
	Training time: 1.919s
	Prediction time: 35.735s
	Explained variance: 0.5732573090376597
	Mean absolute error: 1.0973293768545993
	R2 score: 0.5719257736718455



  model.fit(X_train, y_train)
No handles with labels found to put in legend.


ExtraTreesRegressor()
	Training time: 26.390s
	Prediction time: 0.291s
	Explained variance: 0.7052274751879813
	Mean absolute error: 0.945761462620848
	R2 score: 0.7041470191654111



  model.fit(X_train, y_train)
No handles with labels found to put in legend.


RandomForestRegressor()
	Training time: 40.434s
	Prediction time: 0.249s
	Explained variance: 0.7102238339695112
	Mean absolute error: 0.926484158131521
	R2 score: 0.7093305371354186



No handles with labels found to put in legend.


DecisionTreeRegressor()
	Training time: 0.580s
	Prediction time: 0.006s
	Explained variance: 0.3975792298987033
	Mean absolute error: 1.2923327270986886
	R2 score: 0.3966735793305973



No handles with labels found to put in legend.


LinearRegression()
	Training time: 0.035s
	Prediction time: 0.001s
	Explained variance: 0.6472102227792056
	Mean absolute error: 1.069200550132973
	R2 score: 0.6470247567607655



No handles with labels found to put in legend.


Lasso()
	Training time: 0.024s
	Prediction time: 0.003s
	Explained variance: 0.5052705129350428
	Mean absolute error: 1.0494313547118255
	R2 score: 0.5052178494631547



No handles with labels found to put in legend.


Ridge()
	Training time: 0.023s
	Prediction time: 0.001s
	Explained variance: 0.6472086283982768
	Mean absolute error: 1.0691895818768045
	R2 score: 0.6470231768974684



In [22]:
# Agora com os dados limpos e pré processados é possível criar os modelos

df_sel_cat_input = df_sel_cat[['cod_marca_veiculo', 'cod_modelo_veiculo', 'ano_modelo', 'uf', 'cidade', 'km_veiculo']]
df_sel_cat_input_norm = zscore(df_sel_cat_input)

y = df_sel_cat[['leads']]
X = df_sel_cat_input_norm


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=66)


regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]


head = 10
for model in regressors[:head]:
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start
    start = time()
    y_pred = model.predict(X_test)
    
    predict_time = time()-start    
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
    print("\tR2 score:", r2_score(y_test, y_pred))
    print()
    plt.scatter(y_test, y_pred, color = 'steelblue')
    plt.title('Real vs Predito')
    plt.xlabel('Real')
    plt.ylabel('Predito')
    plt.savefig(f'sel_{model}.png')
    plt.close()

KNeighborsRegressor()
	Training time: 0.115s
	Prediction time: 0.307s
	Explained variance: -0.07137669083449705
	Mean absolute error: 1.5118598640758112
	R2 score: -0.07154362458363894



  return f(**kwargs)


GradientBoostingRegressor()
	Training time: 2.732s
	Prediction time: 0.016s
	Explained variance: 0.09738240425931466
	Mean absolute error: 1.3081687966411635
	R2 score: 0.09738140556740549

KNeighborsRegressor()
	Training time: 0.100s
	Prediction time: 0.244s
	Explained variance: -0.07137669083449705
	Mean absolute error: 1.5118598640758112
	R2 score: -0.07154362458363894



  model.fit(X_train, y_train)


ExtraTreesRegressor()
	Training time: 5.455s
	Prediction time: 0.284s
	Explained variance: -0.08757620733400362
	Mean absolute error: 1.471071352325923
	R2 score: -0.08823621376694368



  model.fit(X_train, y_train)


RandomForestRegressor()
	Training time: 8.970s
	Prediction time: 0.255s
	Explained variance: 0.012356008132235474
	Mean absolute error: 1.4213092401107712
	R2 score: 0.01116914212408382

DecisionTreeRegressor()
	Training time: 0.128s
	Prediction time: 0.004s
	Explained variance: -0.7376193663014505
	Mean absolute error: 1.8282821514658405
	R2 score: -0.7384826391585186

LinearRegression()
	Training time: 0.005s
	Prediction time: 0.000s
	Explained variance: 0.031918141855649895
	Mean absolute error: 1.4109502524855109
	R2 score: 0.0319179339297293

Lasso()
	Training time: 0.007s
	Prediction time: 0.000s
	Explained variance: 0.0
	Mean absolute error: 1.4394572104251835
	R2 score: -4.2120317589322553e-07

Ridge()
	Training time: 0.034s
	Prediction time: 0.000s
	Explained variance: 0.031918048313726466
	Mean absolute error: 1.4109478422387305
	R2 score: 0.03191784041336232



In [None]:
# para regressão o modelo que usoutodos os atributos da base de dados obteve melhores resultados
# os melhores resultados foram alcançado pelo GradientBoostingRegressor