In [5]:
#@title  { vertical-output: true, form-width: "500px" }
import pandas as pd
import numpy as np
import seaborn as sns
import re
import sklearn.compose
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.preprocessing

# Ignorar errores de divergencias cuando alpha=0
import warnings
warnings.filterwarnings('ignore')

# Sort from lower to higher so that the lowest is 0 and the highest is n
def sortColumnByPriceAverage(X, columnName):
  companies = {}

  for product in X[columnName]:
    companies[product] = (0, 0)

  for i, company in enumerate(X[columnName]):
    price = X["Price"][i]
    val, n = companies[company]
    val += price

    companies[company] = (val, n+1)

  _companies = [ (company, companies[company][0]) for company in companies ]
  sortedCompanies = list(map(lambda t: t[0], sorted(_companies, key=lambda t: t[1])))

  return sortedCompanies

########################
# Handlers de columnas #
########################
def handle_Memoria(mem):
  regex_memoria = "(\d*\.?\d+)(\w+) (SSD|HDD|Flash Storage|Hybrid)" # Regex para identificar la memoria
  importancia = { "GB": 1, "TB": 10,
                "HDD": 1, "Flash Storage": 2, "Hybrid": 3, "SSD": 5 }

  productos = [ x for x in re.findall(regex_memoria, mem) ]
  total = 0

  for producto in productos:
    m1 = int(float(producto[0]))  # Cantidad de memoria
    m2 = importancia[producto[1]] # Unidades
    m3 = importancia[producto[2]] # Tipo de disco
  
    total += m1*m2*m3
  
  return total

def handle_Resolucion(res):
  is_touchscreen = "Touchscreen" in res # Si es touchscreen o no
  is_ips = "IPS" in res # Si es IPS o no
  
  resolution = re.findall("\d+x\d+", res)[0]
  w, h = [ int(_) for _ in resolution.split("x") ] # Ancho y alto

  return (w*h)//10 + (is_ips*200) + (is_touchscreen*400)

# Solo para que este aqui
def handle_Cpu(cpu):
  ghz = float( cpu.split()[-1][:-3] )
  brand = 300 if "Intel" in cpu else 200 if "AMD" in cpu else 100
  
  return ghz*100+brand

# Solo para que este aqui
def handle_Gpu(gpu):
  return 3 if "Nvidia" in gpu else 2 if "AMD" in gpu else 1

#------------------------------------------------------------------------------------------------------#

##########
# Modelo #
##########
class Model:
  def __init__(self, alpha, ratio):
    self.TRAIN_URL = "https://filebin.net/3sw6utbuul0sh225/train.csv"
    self.TEST_URL  = "https://filebin.net/3sw6utbuul0sh225/test.csv"

    self.alpha = alpha
    self.ratio = ratio

    self.X = None
    self.Y = None
    self.X_T = None
    self.Y_T = None

    self.trained_model = None
    self.prediction = None
    self.csv = None

    self.outlier_func = None # La funcion para eliminar outliers (ej ElasticNet)

  ###########
  # Dataset #
  ###########
  def load_datasets(self):
    self.X = pd.read_csv(self.TRAIN_URL, sep=",")
    self.Y = pd.read_csv(self.TEST_URL,  sep=",")
    
    # Mantener una copia para echarle un vistazo
    self.X_original = self.X.copy(deep=True)
    self.Y_original = self.X.copy(deep=True)
  
  ##############################
  # Transformacion de columnas #
  ##############################
  # Aplica la funcion f a todos los elementos de una columna
  def map_column(self, columna, f):
    self.X[columna] = [ f(val) for val in self.X[columna] ]
    self.Y[columna] = [ f(val) for val in self.Y[columna] ]

  # Modifica los valores de cada columna. Se pueden comentar y no pasa nada.
  def modify_columns(self):
    # Listas auxiliares ordenadas por peso
    sortedCompanies = sortColumnByPriceAverage(self.X, "Company")
    sortedTypeNames = sortColumnByPriceAverage(self.X, "TypeName")
    sortedOS = ["No OS", "Linux", "Android", "Chrome OS", "Windows 10 S", "Windows 7", "Windows 10", "macOS", "Mac OS X"]

    self.map_column("Ram",              lambda val: int(val[:-2]))
    self.map_column("Weight",           lambda val: float(val[:-2]))            # Baja un poooooooooooco el error
    self.map_column("Company",          lambda val: sortedCompanies.index(val))
    self.map_column("TypeName",         lambda val: sortedTypeNames.index(val))
    self.map_column("OpSys",            lambda val: sortedOS.index(val))        # Comprobar sesgo
    self.map_column("Memory",           handle_Memoria)
    self.map_column("ScreenResolution", handle_Resolucion)                       # Baja el error muy poco
    self.map_column("Cpu",              handle_Cpu)                              # Baja el error menos de 10
    self.map_column("Gpu",              handle_Gpu)

  def transform_datasets(self):
    self.X_T = sklearn.compose.ColumnTransformer(transformers=[
        ("LaptopId",          "drop",                                 [0]),
        ("Company",           "passthrough",                          [1]), # probar drop
        ("Product",           "drop",                                 [2]),
        ("TypeName",          "passthrough",                          [3]), # probar drop
        ("Inches",            "passthrough",                          [4]),
        ("ScreenResolution",  sklearn.preprocessing.MinMaxScaler(),   [5]),
        ("Cpu",               "passthrough",                          [6]),
        ("Ram",               "passthrough",                          [7]),
        ("Memory",            sklearn.preprocessing.MinMaxScaler(),   [8]),
        ("Gpu",               "passthrough",                          [9]),
        ("OpSys",             "passthrough",                          [10]),
        ("Weight",            "passthrough",                          [11]),
        ("Price",             "drop",                                 [12])
    ]).fit_transform(self.X)

    self.Y_T = sklearn.compose.ColumnTransformer(transformers=[
        ("LaptopId",          "drop",                                 [0]),
        ("Company",           "passthrough",                          [1]),
        ("Product",           "drop",                                 [2]),
        ("TypeName",          "passthrough",                          [3]),
        ("Inches",            "passthrough",                          [4]),
        ("ScreenResolution",  sklearn.preprocessing.MinMaxScaler(),   [5]),
        ("Cpu",               "passthrough",                          [6]),
        ("Ram",               "passthrough",                          [7]),
        ("Memory",            sklearn.preprocessing.MinMaxScaler(),   [8]),
        ("Gpu",               "passthrough",                          [9]),
        ("OpSys",             "passthrough",                          [10]),
        ("Weight",            "passthrough",   [11])
    ]).fit_transform(self.Y)

    # StandardScaler. No cambia nada.
    self.X_T = sklearn.preprocessing.RobustScaler().fit_transform(self.X_T)
    self.Y_T = sklearn.preprocessing.RobustScaler().fit_transform(self.Y_T)
  
  ##########################
  # Regresion y prediccion #
  ##########################
  def set_outlier_func(self):
    #from random import randint
    #rand = randint(0, 1000)
    #print(rand)

    #
    # KNN baja el error 40 puntos.
    #

    #from sklearn.neighbors import KNeighborsRegressor
    #self.outlier_func = KNeighborsRegressor(n_neighbors=4)


    # Lo puede bajar alrededor de 4 puntos o asi
    #from sklearn.linear_model import SGDRegressor
    #self.outlier_func = SGDRegressor(max_iter=100, random_state=rand)

    #
    # RandomForest baja el error a 200 o asi.
    # rand=683 => -199
    #

    #from sklearn.ensemble import RandomForestRegressor
    #self.outlier_func = RandomForestRegressor(max_depth=100, random_state=683)

    #
    # ~-206
    #

    #from sklearn.ensemble import ExtraTreesRegressor
    #self.outlier_func = ExtraTreesRegressor(n_estimators=40)

    #self.outlier_func = ElasticNet(alpha=self.alpha, l1_ratio=self.ratio)

  def apply_regression(self):
    self.trained_model = self.outlier_func.fit(self.X_T, self.X["Price"])

  def predict(self):
    self.prediction = self.trained_model.predict(self.Y_T)

  #######
  # CSV #
  #######
  def to_csv(self):
    self.solucion = pd.DataFrame(data = {
        "LaptopId" : self.Y["LaptopId"],
        "Price"    : self.prediction
    })
  
  def save_csv(self, name):
    self.solucion.to_csv(name, index=False)

  #################
  # Ejecutar todo #
  #################
  def run(self):
    # Obtener y parsear datasets
    self.load_datasets()
    self.modify_columns()
    self.transform_datasets()

    # Aplicar regresion y hacer una prediccion
    self.set_outlier_func()
    self.apply_regression()
    self.predict()

    # Convertir a csv
    self.to_csv()
    self.save_csv("solucion.csv")
  
  def print_avgs(self):
    train_avg = sum(self.X["Price"]) / len(self.X["Price"])
    pred_avg  = sum(self.prediction) / len(self.prediction)

    print("Medias\n" + "-" * 100)
    print("Avg training set :", train_avg)
    print("Avg prediction   :", pred_avg)
    print("Diff             :", abs(train_avg - pred_avg), "\n")

############################
# Clase para probar el MAE #
############################
class ModelTest(Model):
  def __init__(self, alpha, ratio, partition_n):
    # Inicializar padre. Tiene que ser la primera llamada dentro del __init__
    super().__init__(alpha, ratio) 

    self.PARTITION_N = partition_n # Numero de particiones que se van a probar
    self.maes = None               # Array con los maes de cada intento

  def get_maes(self):
    # Prueba n particiones para training y validacion y devuelve el error obtenido en cada intento
    self.maes = cross_val_score(self.outlier_func, self.X_T, self.X["Price"],
                                cv=self.PARTITION_N, scoring="neg_mean_absolute_error")

  def run(self):
    self.load_datasets()
    self.modify_columns()
    self.transform_datasets()
    self.set_outlier_func()
    self.get_maes()

  def print_MAE(self):
    print("MAEs   :", self.maes)
    print("MAE avg:", sum(self.maes) / len(self.maes))

def main():
  es_prueba = True   #@param {type:"boolean"}
  alpha = 0          #@param {type:"raw"}
  ratio = 0          #@param {type:"raw"}
  partition_n = 5    #@param {type:"slider", min:0, max:10,  step:1}

  if es_prueba:
    # Ejecutar para hacer pruebas
    model_test = ModelTest(alpha=alpha, ratio=ratio, partition_n=partition_n)
    model_test.run()
    model_test.print_MAE()

    return model_test

  else:
    # Ejecutar para guardar el resultado
    model = Model(alpha=alpha, ratio=ratio)
  
    model.run()
    model.print_avgs()

    return model

model = main()


MAEs   : [-251.55840164 -236.21770492 -224.03653846 -229.71527473 -234.43083791]
MAE avg: -235.19175153125565


In [None]:
model.X

Unnamed: 0,LaptopId,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,981,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4,128GB SSD,Intel HD Graphics 520,Windows 7,1.2kg,1195.00
1,996,HP,Spectre 13-V111dx,Ultrabook,13.3,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,8,256GB SSD,Intel HD Graphics 620,Windows 10,1.11kg,1349.00
2,77,Dell,Inspiron 5570,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8,128GB SSD + 1TB HDD,Intel UHD Graphics 620,Windows 10,2.02kg,855.00
3,103,HP,Envy 13-ad009n,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8,256GB SSD,Nvidia GeForce MX150,Windows 10,1.38kg,1119.00
4,946,Dell,Latitude 7280,Ultrabook,12.5,Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics,Windows 10,1.36kg,1472.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
907,994,Dell,Inspiron 5378,2 in 1 Convertible,13.3,Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,Windows 10,1.68kg,889.00
908,615,Asus,FX753VD-GC007T (i7-7700HQ/8GB/1TB,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8,128GB SSD + 1TB HDD,Nvidia GeForce GTX 1050,Windows 10,3kg,1168.00
909,1063,Dell,Inspiron 5567,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,16,2TB HDD,AMD Radeon R7 M445,Windows 10,2.32kg,989.99
910,96,Dell,Inspiron 3567,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8,1TB HDD,AMD Radeon R5 M430,Linux,2.2kg,599.90


In [None]:
companies = {}

for product in model.X["Company"]:
  companies[product] = (0, 0)

for i, company in enumerate(model.X["Company"]):
  price = model.X["Price"][i]
  val, n = companies[company]
  val += price

  companies[company] = (val, n+1)

vals = [ (company, companies[company][0]) for company in companies ]

#test = sorted(companies, key=lambda vals: vals[1])
#test
for key in companies:
  print(key, companies[key][0]/companies[key][1])

#for key in test:
#  print(key, companies[key][1])

#vals


Toshiba 1230.388888888889
HP 1036.1311386138607
Dell 1184.473737373737
Lenovo 1132.3243137254904
Asus 1141.0162608695653
Microsoft 1612.3083333333334
Vero 219.63333333333333
MSI 1670.313513513514
Acer 632.0782352941177
Apple 1389.201
Mediacom 282.2
Samsung 1312.3333333333333
Razer 3470.6666666666665
Google 1677.6666666666667
Fujitsu 729.0
Chuwi 248.9
Huawei 1424.0
LG 2099.0
Xiaomi 1133.4625
