# Entrenamiento

## 1. Preprocesado

### 1.1. Lectura

In [None]:
import json
import requests

# JSON document with the value dictionaries that are later handed to replace_for_numeric.
link = 'https://informatica.ieszaidinvergeles.org:10056/pia/scrapy/models/dictionary_manual_values_9.json'

# Default to an empty mapping so later cells can still call .get() when the download fails.
all_dictionaries = {}

# The timeout keeps this cell from blocking indefinitely on an unresponsive server.
f = requests.get(link, timeout=30)
if f.status_code == 200:
    all_dictionaries = json.loads(f.text)
else:
    # Make the fallback visible instead of silently continuing without dictionaries.
    print('Could not download dictionaries (HTTP {}); continuing with an empty dictionary.'.format(f.status_code))

In [None]:
# Inspect the downloaded dictionaries (an empty dict when the request did not return HTTP 200).
all_dictionaries

In [None]:
import pandas as pd

# Base location and file names of the datasets to load.
url = "./cars/"
names = ['all.csv']
dataframes = []

for name in names:
    # Spaces in the file name are percent-encoded before building the location to read.
    destiny = url + name.replace(' ', '%20')
    print('Cargando: ' + destiny)
    # Names containing "csv" are parsed as CSV; anything else is read as JSON.
    if 'csv' not in name:
        frame = pd.read_json(destiny)
    else:
        frame = pd.read_csv(destiny, delimiter=',', decimal=".", thousands=None)
    dataframes.append(frame)

In [None]:
# Merge every loaded dataframe into a single frame.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of the notebook.
all = pd.concat(dataframes)

### 1.2. Procesado de datos nulos

In [None]:
# Report the frame size before and after every clean-up stage.
print(f"Datos iniciales | Longitud: {len(all)} | Num Columns: {len(all.columns)}")

# Remove the columns that hold no values at all.
all = all.dropna(axis='columns', how='all')
print(f"Eliminados na or null on columns | Longitud: {len(all)} | Num Columns: {len(all.columns)}")

# Remove every row that still has at least one missing value.
all = all.dropna(axis='index')
print(f"Eliminados na or null on rows | Longitud: {len(all)} | Num Columns: {len(all.columns)}")

# Remove fully repeated rows first, then keep only the first row of each repeated 'id'.
all = all.drop_duplicates().drop_duplicates(subset=['id'], keep='first')
print(f"Eliminados duplicados | Longitud: {len(all)} | Num Columns: {len(all.columns)}")
print(f"Datos finales | Longitud: {len(all)} | Num Columns: {len(all.columns)}")

### 1.3. Procesado columnas

In [None]:
# Show the column labels that remain after the null/duplicate clean-up.
all.columns

In [None]:
# Show the column labels once more (same expression as the previous cell).
all.columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(60,16))
sns.set()
# Correlation is only defined for numeric data, and pandas >= 2.0 raises when
# DataFrame.corr() meets object columns, so restrict the frame explicitly.
# select_dtypes works on old and new pandas alike (unlike the numeric_only keyword).
numeric_columns = all.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), square=True, annot=True, linewidths=0.1)

In [None]:
# Columns removed before training. Each label appears exactly once
# ('luggageCapacity' used to be listed twice).
columns_to_drop = [
    'id',
    'color',
    'fuelTypeId',
    'makeId',
    'province',
    'provinceId',
    'title',
    'transmissionTypeId',
    'brand_id',
    'luggageCapacity',
    'consumptionAverage',
    'consumptionExtraUrban',
    'model_id',
    'manufacturerPrice',
    'seats',
    'weight',
    'emissions',
    'fuelTankCapacity',
    'height',
    'brand',
    'doors',
    'topSpeed',
    'consumptionUrban',
    'model',
]
all = all.drop(columns=columns_to_drop)

In [None]:
# Show the column labels left after dropping columns_to_drop.
all.columns

In [None]:
# Preview the first two rows.
all.head(2)

In [None]:
def columns_numeric_type(df,columns_names):
    """Convert the listed columns of ``df`` to a numeric dtype.

    Args:
        df: DataFrame to convert; its columns are reassigned in place.
        columns_names: Iterable with the labels of the columns to convert.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        KeyError: If a label in ``columns_names`` is not a column of ``df``.
        ValueError: Propagated from ``pd.to_numeric`` when a value cannot be parsed.
    """
    # The loop variable must be the name used in the body; the previous mismatch made
    # the function depend on an unrelated global called ``column_name``.
    for column_name in columns_names:
        df[column_name] = pd.to_numeric(df[column_name])
    return df

In [None]:
# Collect the columns stored with object dtype and report how many distinct values each holds.
qualitative_variable_names = list(all.select_dtypes(['object']).columns)
for column_name in qualitative_variable_names:
    distinct_values = all[column_name].unique()
    print(column_name, len(distinct_values))

In [None]:
import operator

def replace_for_numeric(df,column_name,initial,increaser,value_dictionary=None):
    """Replace the values of one column with numeric codes.

    Values already present in ``value_dictionary`` keep their code. Every other
    value gets a new code, assigned in order of first appearance in the column.

    Args:
        df: DataFrame to encode; ``df[column_name]`` is reassigned in place.
        column_name: Label of the column to encode.
        initial: First code to hand out when no larger code exists yet.
        increaser: Distance between two consecutively assigned codes.
        value_dictionary: Optional mapping ``value -> code`` with known codes.
            When given, it is extended in place with the newly assigned codes.

    Returns:
        Tuple ``(df, value_dictionary)`` with the encoded frame and the mapping
        that now covers every value found in the column.
    """
    # A fresh dict per call: a ``{}`` default would be shared between calls and
    # leak the codes of one column into the next one.
    if value_dictionary is None:
        value_dictionary = {}

    if value_dictionary:
        highest_code = max(value_dictionary.values())
        # ``>=`` (not ``>``): when the highest known code equals ``initial`` the
        # first new value must not reuse that code.
        next_code = highest_code + increaser if highest_code >= initial else initial
    else:
        next_code = initial

    for value in df[column_name].unique():
        if value not in value_dictionary:
            value_dictionary[value] = next_code
            next_code += increaser

    # Map the whole column at once instead of writing numbers row by row into a
    # text column; missing values are left untouched.
    encoded = df[column_name].map(value_dictionary, na_action='ignore')
    df[column_name] = pd.to_numeric(encoded)
    return df, value_dictionary

In [None]:
# List the columns whose dtype is still object.
all.select_dtypes(['object']).columns

In [None]:
# (column, first code, step between codes) for every column that is turned into numbers.
encoding_plan = (
    ('make', 500, 1000),
    ('transmissionType', 0, 1),
    ('seller_type', 0, 1),
    ('bodyType', 500, 200),
)
for target_column, first_code, step in encoding_plan:
    # Start from the codes already known for this column and store the updated mapping back.
    all, word_dictionary = replace_for_numeric(
        all, target_column, first_code, step, all_dictionaries.get(target_column, {})
    )
    all_dictionaries[target_column] = word_dictionary
print(all_dictionaries)

### 1.4. Muestra de datos finales

In [None]:
# Show the column labels of the processed frame.
all.columns

In [None]:
# Preview the last ten rows.
all.tail(10)

### 1.5. Split

In [None]:
from sklearn.model_selection import train_test_split as tts

# Separate the target column 'price' from the remaining predictor columns.
all_y_data = all['price']
all_x_data = all.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = tts(
    all_x_data,
    all_y_data,
    test_size=0.20,
    random_state=1,
)

def print_bests(results, limit=5):
    """Print and return the highest-scoring entries of ``results``.

    Args:
        results: Iterable of dict-like records read through ``.get('score')``
            and ``.get('model_name')``.
        limit: Maximum number of entries to report.

    Returns:
        List with at most ``limit`` records, ordered from highest to lowest score.
    """
    def score_of(record):
        return record.get('score')

    ranked = sorted(results, key=score_of, reverse=True)
    top = ranked[:limit]
    for record in top:
        print(f"score: {record.get('score')} | model_name: {record.get('model_name')}")
    return top


# NOTE(review): max_i / max_i_min are not referenced in the visible part of this notebook;
# presumably bounds for a model-search loop elsewhere — confirm before changing.
max_i=50
max_i_min=30

# max_i=15
# max_i_min=10

# max_i=25
# max_i_min=15

## 2. Carga XGBoost

### Dependencias

In [None]:
pip install xgboost

In [None]:
pip install sagemaker-scikit-learn-extension

In [None]:
conda install -c mlio -c conda-forge mlio-py==0.7

### 2.4. Prueba

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [None]:
# Create an XGBoost regressor and load the saved model file into it.
model_xgb_2 = xgb.XGBRegressor()
model_xgb_2.load_model("./models/xgboost-model-0")

In [None]:
# Use the last ten rows of the processed frame as a small sample for the loaded model.
to_test = all.iloc[-10:]
to_test

In [None]:
# Separate the sample's target column 'price' from its predictor columns.
to_test_y_data = to_test['price']
to_test_x_data = to_test.drop(columns='price')

In [None]:
from dpp1 import build_feature_transform

In [None]:
pipeline_to_change_dataframe = build_feature_transform()
# fit_transform already returns the transformed rows, so keep its result instead of
# discarding it and transforming the same rows a second time.
# NOTE(review): assumes build_feature_transform() returns a scikit-learn style transformer
# whose fit_transform(X) equals fit(X).transform(X) — confirm against dpp1.
# NOTE(review): the transform is fitted on these ten sample rows only; presumably the
# model expects features from a transform fitted on the training data — confirm.
AWS_to_test_x_data = pipeline_to_change_dataframe.fit_transform(to_test_x_data)

In [None]:
# Show the transformed sample via .tolist() for inspection.
AWS_to_test_x_data.tolist()

In [None]:
# Run the loaded model on the transformed sample and display the predictions.
prediction = model_xgb_2.predict(AWS_to_test_x_data)
prediction