# Modelización

In [14]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
    
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import KFold

from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [8]:
outcomesDf=pd.read_csv("C:/Users/Alex/OneDrive/Desktop/python_proyects/swoopo_proyect/outcomes.tsv", sep="\t")
categories=pd.read_csv("C:/Users/Alex/OneDrive/Desktop/python_proyects/swoopo_proyect/primera_categorizacion+ingles+arrays+clusters.csv")

In [9]:
categories_clusters=categories.loc[:,["item","clusters"]]

In [4]:
categories_clusters["clusters"].value_counts()

clusters
0.0    809
3.0    396
5.0    175
1.0    166
4.0    134
2.0    123
Name: count, dtype: int64

In [10]:
outcomesDf=outcomesDf.merge(categories_clusters,on="item",how="left")

In [11]:
outcomesDf=outcomesDf[~outcomesDf["item"].str.contains("voucher")]

In [12]:
outcomesDf.loc[:,["clusters"]].value_counts()

clusters
0.0         64252
3.0         12412
2.0         10145
4.0          7048
5.0          4728
1.0          4629
Name: count, dtype: int64

Lanzamos modelos para todo el set de datos sin tener en cuenta las categorías.

Los valores más extremos podrían considerarse valores atípicos y eliminarse del conjunto de datos para que no influyan negativamente en el modelo. Sin embargo, en este caso, los valores extremos son observaciones legítimas y lo más probable es que no sean datos introducidos incorrectamente. Por lo tanto, en lugar de eliminarlos, se ha decidido incluir algunos modelos de predicción que son robustos a los valores atípicos. Por ejemplo, los bosques aleatorios y los árboles de decisión aíslan las observaciones atípicas, y el algoritmo RANSAC proporciona una estimación robusta del modelo lineal.

Debido a la elevada varianza del precio de venta final de las subastas, además del error medio absoluto, también se ha decidido utilizar la mediana del error absoluto como métrica. Esta métrica no está tan influida por los valores extremos. Hacer predicciones del precio de venta final puede ayudar a Swoopo o a empresas similares a tener una estimación del dinero que van a ganar antes de que comience un grupo de subastas. Sin embargo, consideramos más importante tener una estimación precisa de grupos de varias subastas (por ejemplo, estimaciones precisas del beneficio semanal o mensual, que pueden calcularse utilizando las estimaciones del precio final de las subastas incluidas en esos periodos de tiempo), que estimaciones precisas para subastas individuales, ya que Swoopo se beneficiará del resultado total de todas sus subastas. Por este motivo, aunque se ha calculado el error medio absoluto, se considera que el error mediano absoluto es una métrica más adecuada en este caso.

In [11]:
x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice"]

In [12]:
X = outcomesDf[x_column_names]
y = outcomesDf["price"]

In [16]:
kFoldMedianAbsoluteErrorsRandomForestRegressor = []
kFoldMedianAbsoluteErrorsKNeighborsRegressor = []
kFoldMedianAbsoluteErrorsDecisionTreeRegressor = []
kFoldMedianAbsoluteErrorsLinearRegression = []
kFoldMedianAbsoluteErrorsRANSACRegressor = []

kFoldMeanAbsoluteErrorsRandomForestRegressor = []
kFoldMeanAbsoluteErrorsKNeighborsRegressor = []
kFoldMeanAbsoluteErrorsDecisionTreeRegressor = []
kFoldMeanAbsoluteErrorsLinearRegression = []
kFoldMeanAbsoluteErrorsRANSACRegressor = []

kFoldNumber = 1
kf = KFold(n_splits=5, random_state=1,shuffle=True)
for train_index, test_index in kf.split(X):
    #5-fold cross-validation
    print("Executing k fold = "+str(kFoldNumber))
    kFoldNumber=kFoldNumber+1
    
    #train and test split
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #RandomForestRegressor
    model=RandomForestRegressor(random_state=1)
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsRandomForestRegressor.append(kFoldMedianAbsoluteError)    
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsRandomForestRegressor.append(kFoldMeanAbsoluteError)  

    #KNeighborsRegressor
    model = KNeighborsRegressor()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsKNeighborsRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsKNeighborsRegressor.append(kFoldMeanAbsoluteError)
    
    #DecisionTreeRegressor
    model = DecisionTreeRegressor()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsDecisionTreeRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsDecisionTreeRegressor.append(kFoldMeanAbsoluteError)
    
    #LinearRegression
    model = LinearRegression()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsLinearRegression.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsLinearRegression.append(kFoldMeanAbsoluteError)
    
    #RANSACRegressor
    model = RANSACRegressor(random_state=1)
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsRANSACRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsRANSACRegressor.append(kFoldMeanAbsoluteError)
    
medianAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRandomForestRegressor)
meanAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRandomForestRegressor)
medianAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsKNeighborsRegressor)
meanAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsKNeighborsRegressor)
medianAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsDecisionTreeRegressor)
meanAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsDecisionTreeRegressor)
medianAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMedianAbsoluteErrorsLinearRegression)
meanAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMeanAbsoluteErrorsLinearRegression)
medianAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRANSACRegressor)
meanAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRANSACRegressor)

print("--")
print("Random Forest Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsRandomForestRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsRandomForestRegressorAverage))
print("--")
print("K Neighbors Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsKNeighborsRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsKNeighborsRegressorAverage))
print("--")
print("Decision Tree Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsDecisionTreeRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsDecisionTreeRegressorAverage))
print("--")
print("Linear Regression")
print("Median absolute error: "+str(medianAbsoluteErrorsLinearRegressionAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsLinearRegressionAverage))
print("--")
print("RANSAC Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsRANSACRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsRANSACRegressorAverage))

Executing k fold = 1
Executing k fold = 2
Executing k fold = 3
Executing k fold = 4
Executing k fold = 5
--
Random Forest Regressor
Median absolute error: 12.005467850412424
Mean absolute error: 30.523931139124727
--
K Neighbors Regressor
Median absolute error: 12.392
Mean absolute error: 32.71969650151373
--
Decision Tree Regressor
Median absolute error: 12.009995537595566
Mean absolute error: 30.62225105723394
--
Linear Regression
Median absolute error: 25.378300887423904
Mean absolute error: 41.996869933317434
--
RANSAC Regressor
Median absolute error: 11.567872089804883
Mean absolute error: 41.177625259892835


Lanzamos los moedelos incluyendo los clusters

In [17]:
x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice",\
                  "clusters"]
X = outcomesDf[x_column_names]
y = outcomesDf["price"]

In [18]:
kFoldMedianAbsoluteErrorsRandomForestRegressor = []
kFoldMedianAbsoluteErrorsKNeighborsRegressor = []
kFoldMedianAbsoluteErrorsDecisionTreeRegressor = []
kFoldMedianAbsoluteErrorsLinearRegression = []
kFoldMedianAbsoluteErrorsRANSACRegressor = []

kFoldMeanAbsoluteErrorsRandomForestRegressor = []
kFoldMeanAbsoluteErrorsKNeighborsRegressor = []
kFoldMeanAbsoluteErrorsDecisionTreeRegressor = []
kFoldMeanAbsoluteErrorsLinearRegression = []
kFoldMeanAbsoluteErrorsRANSACRegressor = []

kFoldNumber = 1
kf = KFold(n_splits=5, random_state=1,shuffle=True)
for train_index, test_index in kf.split(X):
    #5-fold cross-validation
    print("Executing k fold = "+str(kFoldNumber))
    kFoldNumber=kFoldNumber+1
    
    #train and test split
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #RandomForestRegressor
    model=RandomForestRegressor(random_state=1)
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsRandomForestRegressor.append(kFoldMedianAbsoluteError)    
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsRandomForestRegressor.append(kFoldMeanAbsoluteError)  

    #KNeighborsRegressor
    model = KNeighborsRegressor()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsKNeighborsRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsKNeighborsRegressor.append(kFoldMeanAbsoluteError)
    
    #DecisionTreeRegressor
    model = DecisionTreeRegressor()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsDecisionTreeRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsDecisionTreeRegressor.append(kFoldMeanAbsoluteError)
    
    #LinearRegression
    model = LinearRegression()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsLinearRegression.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsLinearRegression.append(kFoldMeanAbsoluteError)
    
    #RANSACRegressor
    model = RANSACRegressor(random_state=1)
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsRANSACRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsRANSACRegressor.append(kFoldMeanAbsoluteError)
    
medianAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRandomForestRegressor)
meanAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRandomForestRegressor)
medianAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsKNeighborsRegressor)
meanAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsKNeighborsRegressor)
medianAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsDecisionTreeRegressor)
meanAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsDecisionTreeRegressor)
medianAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMedianAbsoluteErrorsLinearRegression)
meanAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMeanAbsoluteErrorsLinearRegression)
medianAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRANSACRegressor)
meanAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRANSACRegressor)

print("--")
print("Random Forest Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsRandomForestRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsRandomForestRegressorAverage))
print("--")
print("K Neighbors Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsKNeighborsRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsKNeighborsRegressorAverage))
print("--")
print("Decision Tree Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsDecisionTreeRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsDecisionTreeRegressorAverage))
print("--")
print("Linear Regression")
print("Median absolute error: "+str(medianAbsoluteErrorsLinearRegressionAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsLinearRegressionAverage))
print("--")
print("RANSAC Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsRANSACRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsRANSACRegressorAverage))

Executing k fold = 1
Executing k fold = 2
Executing k fold = 3
Executing k fold = 4
Executing k fold = 5
--
Random Forest Regressor
Median absolute error: 11.587013814117878
Mean absolute error: 30.14573042174056
--
K Neighbors Regressor
Median absolute error: 12.4104
Mean absolute error: 32.55160800477139
--
Decision Tree Regressor
Median absolute error: 11.602408020800924
Mean absolute error: 30.3375155876706
--
Linear Regression
Median absolute error: 24.26105334325838
Mean absolute error: 41.63860765643362
--
RANSAC Regressor
Median absolute error: 12.53689346475899
Mean absolute error: 42.138776182677915


Los clusters mejoran las predicciones.

Probamos a utilizar un OneHotEncoder.

In [40]:
outcomes_tfidf=outcomesDf.drop("clusters",axis=1)

In [41]:
def convertCategoryDataFrameToOneHotEncodedVersion(clusteredCategoriesDf, categoryColumnName):

    categories = clusteredCategoriesDf[categoryColumnName].unique()
    categories=[i.replace("-"," ") for i in categories]
    vectorizer= TfidfVectorizer()
    X = vectorizer.fit_transform(categories)
    data =pd.DataFrame(X.toarray())
    data[categoryColumnName] = categories
    Kmeans = KMeans(n_clusters=6)
    Kmeans.fit(data.iloc[:,:-1])
    data["clusters"]=Kmeans.labels_
    data = data.iloc[:,[-2,-1]]
    data[categoryColumnName]=data[categoryColumnName].str.replace(" ","-")
    clusteredCategoriesDf=clusteredCategoriesDf.merge(data,how="left",on="item")
            
    return clusteredCategoriesDf

In [42]:
outcomestfidf=convertCategoryDataFrameToOneHotEncodedVersion(outcomes_tfidf,"item")



In [46]:
outcomestfidf["clusters"].value_counts()

clusters
5    68918
1    18166
3     5957
4     5737
0     2286
2     2150
Name: count, dtype: int64

In [47]:
x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice",\
                  "clusters"]
X = outcomestfidf[x_column_names]
y = outcomestfidf["price"]

In [48]:
kFoldMedianAbsoluteErrorsRandomForestRegressor = []
kFoldMedianAbsoluteErrorsKNeighborsRegressor = []
kFoldMedianAbsoluteErrorsDecisionTreeRegressor = []
kFoldMedianAbsoluteErrorsLinearRegression = []
kFoldMedianAbsoluteErrorsRANSACRegressor = []

kFoldMeanAbsoluteErrorsRandomForestRegressor = []
kFoldMeanAbsoluteErrorsKNeighborsRegressor = []
kFoldMeanAbsoluteErrorsDecisionTreeRegressor = []
kFoldMeanAbsoluteErrorsLinearRegression = []
kFoldMeanAbsoluteErrorsRANSACRegressor = []

kFoldNumber = 1
kf = KFold(n_splits=5, random_state=1,shuffle=True)
for train_index, test_index in kf.split(X):
    #5-fold cross-validation
    print("Executing k fold = "+str(kFoldNumber))
    kFoldNumber=kFoldNumber+1
    
    #train and test split
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #RandomForestRegressor
    model=RandomForestRegressor(random_state=1)
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsRandomForestRegressor.append(kFoldMedianAbsoluteError)    
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsRandomForestRegressor.append(kFoldMeanAbsoluteError)  

    #KNeighborsRegressor
    model = KNeighborsRegressor()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsKNeighborsRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsKNeighborsRegressor.append(kFoldMeanAbsoluteError)
    
    #DecisionTreeRegressor
    model = DecisionTreeRegressor()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsDecisionTreeRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsDecisionTreeRegressor.append(kFoldMeanAbsoluteError)
    
    #LinearRegression
    model = LinearRegression()
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsLinearRegression.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsLinearRegression.append(kFoldMeanAbsoluteError)
    
    #RANSACRegressor
    model = RANSACRegressor(random_state=1)
    model.fit(X_train,y_train)
    P_price = model.predict(X_test)
    
    kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
    kFoldMedianAbsoluteErrorsRANSACRegressor.append(kFoldMedianAbsoluteError)
    kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
    kFoldMeanAbsoluteErrorsRANSACRegressor.append(kFoldMeanAbsoluteError)
    
medianAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRandomForestRegressor)
meanAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRandomForestRegressor)
medianAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsKNeighborsRegressor)
meanAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsKNeighborsRegressor)
medianAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsDecisionTreeRegressor)
meanAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsDecisionTreeRegressor)
medianAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMedianAbsoluteErrorsLinearRegression)
meanAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMeanAbsoluteErrorsLinearRegression)
medianAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRANSACRegressor)
meanAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRANSACRegressor)

print("--")
print("Random Forest Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsRandomForestRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsRandomForestRegressorAverage))
print("--")
print("K Neighbors Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsKNeighborsRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsKNeighborsRegressorAverage))
print("--")
print("Decision Tree Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsDecisionTreeRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsDecisionTreeRegressorAverage))
print("--")
print("Linear Regression")
print("Median absolute error: "+str(medianAbsoluteErrorsLinearRegressionAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsLinearRegressionAverage))
print("--")
print("RANSAC Regressor")
print("Median absolute error: "+str(medianAbsoluteErrorsRANSACRegressorAverage))
print("Mean absolute error: "+str(meanAbsoluteErrorsRANSACRegressorAverage))

Executing k fold = 1
Executing k fold = 2
Executing k fold = 3
Executing k fold = 4
Executing k fold = 5
--
Random Forest Regressor
Median absolute error: 11.655097301117584
Mean absolute error: 30.060296558111048
--
K Neighbors Regressor
Median absolute error: 12.374
Mean absolute error: 32.505447089582184
--
Decision Tree Regressor
Median absolute error: 11.65838803039814
Mean absolute error: 30.166286209382708
--
Linear Regression
Median absolute error: 21.739819863139065
Mean absolute error: 40.620790468527964
--
RANSAC Regressor
Median absolute error: 12.387258070944178
Mean absolute error: 38.96383334263691


Lanzamos los modelos para cada uno de los clusters

In [58]:
for i in range(6):
    x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice"]
    X = outcomesDf[x_column_names].loc[outcomesDf.clusters==i]
    y = outcomesDf["price"].loc[outcomesDf.clusters==i]
    
    kFoldMedianAbsoluteErrorsRandomForestRegressor = []
    kFoldMedianAbsoluteErrorsKNeighborsRegressor = []
    kFoldMedianAbsoluteErrorsDecisionTreeRegressor = []
    kFoldMedianAbsoluteErrorsLinearRegression = []
    kFoldMedianAbsoluteErrorsRANSACRegressor = []

    kFoldMeanAbsoluteErrorsRandomForestRegressor = []
    kFoldMeanAbsoluteErrorsKNeighborsRegressor = []
    kFoldMeanAbsoluteErrorsDecisionTreeRegressor = []
    kFoldMeanAbsoluteErrorsLinearRegression = []
    kFoldMeanAbsoluteErrorsRANSACRegressor = []

    kFoldNumber = 1
    kf = KFold(n_splits=5, random_state=1,shuffle=True)
    for train_index, test_index in kf.split(X):
    #5-fold cross-validation
        print("Executing k fold = "+str(kFoldNumber))
        kFoldNumber=kFoldNumber+1
    
    #train and test split
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #RandomForestRegressor
        model=RandomForestRegressor(random_state=1)
        model.fit(X_train,y_train)
        P_price = model.predict(X_test)
    
        kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
        kFoldMedianAbsoluteErrorsRandomForestRegressor.append(kFoldMedianAbsoluteError)    
        kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
        kFoldMeanAbsoluteErrorsRandomForestRegressor.append(kFoldMeanAbsoluteError)  

    #KNeighborsRegressor
        model = KNeighborsRegressor()
        model.fit(X_train,y_train)
        P_price = model.predict(X_test)
    
        kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
        kFoldMedianAbsoluteErrorsKNeighborsRegressor.append(kFoldMedianAbsoluteError)
        kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
        kFoldMeanAbsoluteErrorsKNeighborsRegressor.append(kFoldMeanAbsoluteError)
    
    #DecisionTreeRegressor
        model = DecisionTreeRegressor()
        model.fit(X_train,y_train)
        P_price = model.predict(X_test)
    
        kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
        kFoldMedianAbsoluteErrorsDecisionTreeRegressor.append(kFoldMedianAbsoluteError)
        kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
        kFoldMeanAbsoluteErrorsDecisionTreeRegressor.append(kFoldMeanAbsoluteError)
    
    #LinearRegression
        model = LinearRegression()
        model.fit(X_train,y_train)
        P_price = model.predict(X_test)
    
        kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
        kFoldMedianAbsoluteErrorsLinearRegression.append(kFoldMedianAbsoluteError)
        kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
        kFoldMeanAbsoluteErrorsLinearRegression.append(kFoldMeanAbsoluteError)
    
    #RANSACRegressor
        model = RANSACRegressor(random_state=1)
        model.fit(X_train,y_train)
        P_price = model.predict(X_test)
    
        kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
        kFoldMedianAbsoluteErrorsRANSACRegressor.append(kFoldMedianAbsoluteError)
        kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)
        kFoldMeanAbsoluteErrorsRANSACRegressor.append(kFoldMeanAbsoluteError)
    
    medianAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRandomForestRegressor)
    meanAbsoluteErrorsRandomForestRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRandomForestRegressor)
    medianAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsKNeighborsRegressor)
    meanAbsoluteErrorsKNeighborsRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsKNeighborsRegressor)
    medianAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsDecisionTreeRegressor)
    meanAbsoluteErrorsDecisionTreeRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsDecisionTreeRegressor)
    medianAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMedianAbsoluteErrorsLinearRegression)
    meanAbsoluteErrorsLinearRegressionAverage = np.mean(kFoldMeanAbsoluteErrorsLinearRegression)
    medianAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMedianAbsoluteErrorsRANSACRegressor)
    meanAbsoluteErrorsRANSACRegressorAverage = np.mean(kFoldMeanAbsoluteErrorsRANSACRegressor)

    print(f"cluster {i}")
    print("--")
    print("Random Forest Regressor")
    print("Median absolute error: "+str(medianAbsoluteErrorsRandomForestRegressorAverage))
    print("Mean absolute error: "+str(meanAbsoluteErrorsRandomForestRegressorAverage))
    print("--")
    print("K Neighbors Regressor")
    print("Median absolute error: "+str(medianAbsoluteErrorsKNeighborsRegressorAverage))
    print("Mean absolute error: "+str(meanAbsoluteErrorsKNeighborsRegressorAverage))
    print("--")
    print("Decision Tree Regressor")
    print("Median absolute error: "+str(medianAbsoluteErrorsDecisionTreeRegressorAverage))
    print("Mean absolute error: "+str(meanAbsoluteErrorsDecisionTreeRegressorAverage))
    print("--")
    print("Linear Regression")
    print("Median absolute error: "+str(medianAbsoluteErrorsLinearRegressionAverage))
    print("Mean absolute error: "+str(meanAbsoluteErrorsLinearRegressionAverage))
    print("--")
    print("RANSAC Regressor")
    print("Median absolute error: "+str(medianAbsoluteErrorsRANSACRegressorAverage))
    print("Mean absolute error: "+str(meanAbsoluteErrorsRANSACRegressorAverage))

Executing k fold = 1
Executing k fold = 2
Executing k fold = 3
Executing k fold = 4
Executing k fold = 5
cluster 0
--
Random Forest Regressor
Median absolute error: 12.277255490380373
Mean absolute error: 30.504380791629455
--
K Neighbors Regressor
Median absolute error: 13.553199999999999
Mean absolute error: 33.84710765336919
--
Decision Tree Regressor
Median absolute error: 12.277670002264866
Mean absolute error: 30.594119271774197
--
Linear Regression
Median absolute error: 27.200326601869385
Mean absolute error: 41.309363053644155
--
RANSAC Regressor
Median absolute error: 12.447152047192455
Mean absolute error: 41.226539582212936
Executing k fold = 1
Executing k fold = 2
Executing k fold = 3
Executing k fold = 4
Executing k fold = 5
cluster 1
--
Random Forest Regressor
Median absolute error: 10.900616629968948
Mean absolute error: 23.080353699731912
--
K Neighbors Regressor
Median absolute error: 11.3336
Mean absolute error: 24.394991748292572
--
Decision Tree Regressor
Median ab

Buscamos los mejores hiperparametros para cada cluster.

In [15]:
for i in range(6):
    x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice"]
    X = outcomesDf[x_column_names].loc[outcomesDf.clusters==i]
    y = outcomesDf["price"].loc[outcomesDf.clusters==i]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.02, random_state = 42)
    
    n_estimators = [5,20,50,100] 
    max_features = ['auto', 'sqrt'] 
    max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] 
    min_samples_split = [2, 6, 10] 
    min_samples_leaf = [1, 3, 4] 
    bootstrap = [True, False] 

    random_grid = {'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap}

    rf = RandomForestRegressor()

    rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
               n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)
    
    
    rf_random.fit(X_train, y_train)
    
    print(f"---------------cluster {i}---------------")
    print(f"Best parameters: {rf_random.best_params_}")
    
    
    
    

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(


---------------cluster 0---------------
Best parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 50, 'bootstrap': True}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
---------------cluster 1---------------
Best parameters: {'n_estimators': 20, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 110, 'bootstrap': True}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
---------------cluster 2---------------
Best parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
---------------cluster 3---------------
Best parameters: {'n_estimators': 20, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
--

Lanzamos un Rnadom Forest con los hiperparametros más adecuados para cada cluster.

In [17]:
x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice"]
X = outcomesDf[x_column_names].loc[outcomesDf.clusters==0]
y = outcomesDf["price"].loc[outcomesDf.clusters==0]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.02, random_state = 42)

n_estimators = [100] 
max_features = ['auto'] 
max_depth = [50] 
min_samples_split = [10] 
min_samples_leaf = [4] 
bootstrap = [True]

random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
    
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
            n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

rf_random.fit(X_train, y_train)
P_price = rf_random.predict(X_test)
kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)

print("results cluster 0:")
print(f"Median Absolute Error: {kFoldMedianAbsoluteError}")
print(f"Mean Absolute Error: {kFoldMeanAbsoluteError}")



Fitting 5 folds for each of 1 candidates, totalling 5 fits


  warn(


results cluster 0:
Median Absolute Error: 12.158507729536574
Mean Absolute Error: 30.061069725567382


In [77]:
x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice"]
X = outcomesDf[x_column_names].loc[outcomesDf.clusters==1]
y = outcomesDf["price"].loc[outcomesDf.clusters==1]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.02, random_state = 42)

n_estimators = [20] 
max_features = ['sqrt'] 
max_depth = [110] 
min_samples_split = [6] 
min_samples_leaf = [4] 
bootstrap = [True]

random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
    
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
            n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

rf_random.fit(X_train, y_train)
P_price = rf_random.predict(X_test)
kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)

print("results cluster 1:")
print(f"Median Absolute Error: {kFoldMedianAbsoluteError}")
print(f"Mean Absolute Error: {kFoldMeanAbsoluteError}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
results cluster 1:
Median Absolute Error: 7.455179322238099
Mean Absolute Error: 15.330199707330783




In [78]:
x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice"]
X = outcomesDf[x_column_names].loc[outcomesDf.clusters==2]
y = outcomesDf["price"].loc[outcomesDf.clusters==2]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.02, random_state = 42)

n_estimators = [100] 
max_features = ['sqrt'] 
max_depth = [50] 
min_samples_split = [10] 
min_samples_leaf = [1] 
bootstrap = [True]

random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
    
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
            n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

rf_random.fit(X_train, y_train)
P_price = rf_random.predict(X_test)
kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)

print("results cluster 2:")
print(f"Median Absolute Error: {kFoldMedianAbsoluteError}")
print(f"Mean Absolute Error: {kFoldMeanAbsoluteError}")



Fitting 5 folds for each of 1 candidates, totalling 5 fits
results cluster 2:
Median Absolute Error: 9.067999019374568
Mean Absolute Error: 19.33268873780873


In [87]:
x_column_names = ["retail","bidincrement","bidfee","flg_click_only","flg_beginnerauction","flg_fixedprice","flg_endprice"]
X = outcomesDf[x_column_names].loc[outcomesDf.clusters==3]
y = outcomesDf["price"].loc[outcomesDf.clusters==3]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.02, random_state = 42)

n_estimators = [20] 
max_features = ['sqrt'] 
max_depth = [40] 
min_samples_split = [3] 
min_samples_leaf = [2] 
bootstrap = [False]

random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
    
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
            n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

rf_random.fit(X_train, y_train)
P_price = rf_random.predict(X_test)
kFoldMedianAbsoluteError = median_absolute_error(y_test,P_price)
kFoldMeanAbsoluteError = mean_absolute_error(y_test,P_price)

print("results cluster 3:")
print(f"Median Absolute Error: {kFoldMedianAbsoluteError}")
print(f"Mean Absolute Error: {kFoldMeanAbsoluteError}")



Fitting 5 folds for each of 1 candidates, totalling 5 fits
results cluster 3:
Median Absolute Error: 9.968589425897552
Mean Absolute Error: 28.81654031434219
