In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor


In [None]:
# Read the competition tables; the `id` column becomes the row index of each frame.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s4e9/train.csv',
        '/kaggle/input/playground-series-s4e9/test.csv',
    )
)

In [None]:
# Preview the first five rows of the test frame.
df_test.head(5)

In [None]:

# Preview the first rows of the training frame (pandas default: five rows).
df_train.head()


In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
df_train.info()


In [None]:
# Summary statistics of the numeric columns, transposed so each column is one row.
df_train.describe().T

In [None]:
# Summary of the object-typed columns (count, unique, top, freq), one row per column.
df_train.describe(include = ['O']).T

In [None]:
# Object-typed columns of the training frame.
col_cat = df_train.select_dtypes(include='O').columns

# Keep only the low-cardinality ones: fewer than 60 distinct non-null values.
col_filt = [
    name
    for name, n_distinct in df_train[col_cat].nunique().items()
    if n_distinct < 60
]

# Show which values each low-cardinality column actually takes.
for col in col_filt:
    print(df_train[col].unique())

In [None]:
# Number of missing values per column, most affected columns first.
df_train.isnull().sum().sort_values(ascending = False)

In [None]:
# Share of missing values per column as a percentage, most affected columns first.
df_train.isnull().mean().sort_values(ascending = False)*100

In [None]:
# Number of rows whose column values exactly repeat an earlier row (the `id` index is not compared).
df_train.duplicated().sum()

In [None]:


# Dtypes and non-null counts of the training frame once more, as a reference for the imputation below.
df_train.info()



# Fill the missing values of clean_title, fuel_type and accident with 'No', 'Gasoline' and 'None reported'

In [None]:
# Replace missing entries in the three categorical columns with a fixed default
# category; all other columns are left untouched.
df_train = df_train.fillna({
    'clean_title': 'No',
    'fuel_type': 'Gasoline',
    'accident': 'None reported',
})

In [None]:
# Apply to the test frame the same default categories used for the training frame,
# so both frames end up with the same set of values in these columns.
df_test = df_test.fillna({
    'clean_title': 'No',
    'fuel_type': 'Gasoline',
    'accident': 'None reported',
})

In [None]:
# Re-count missing values per column to confirm what is left after the imputation.
df_train.isnull().sum()

# EDA

In [None]:
# One bar chart of category frequencies per low-cardinality column.
for column in col_filt:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=df_train, x=column, ax=ax)
    ax.set_title(f'contador de: {column}')
    # Slant the category labels so long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

In [None]:
# Histogram (30 bins) of every int64/float64 column of the training frame.
df_train.select_dtypes(include=['int64', 'float64']).hist(bins=30, figsize=(25, 25))
plt.show()

In [None]:
# Box plot of every numeric column, one panel per column stacked vertically.
columnas_numericas = df_train.select_dtypes(include=['int64', 'float64']).columns

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Flattening it
# keeps per-panel indexing valid even when there is exactly one numeric column
# (with the default squeeze=True a single panel comes back as a bare Axes object).
fig, axes = plt.subplots(
    nrows=len(columnas_numericas), ncols=1, figsize=(10, 20), squeeze=False
)
axes = axes.ravel()

for ax, col in zip(axes, columnas_numericas):
    sns.boxplot(x=df_train[col], ax=ax)
    ax.set_title(f'Diagrama de caja para {col}', fontsize=10)
    # The title already names the column, so the x label would be redundant.
    ax.set_xlabel('')

plt.tight_layout()
plt.show()

In [None]:
# Groups used to collapse the `brand` column into a handful of market segments.
categorias = {
    'Marcas de lujo': ['Mercedes-Benz', 'Audi', 'BMW', 'Tesla', 'Cadillac', 'Genesis', 'Volvo', 'Lexus', 'Jaguar', 'Acura', 'INFINITI', 'Porsche', 'McLaren', 'Rolls-Royce', 'Maserati', 'Bentley', 'Ferrari', 'Aston', 'Lamborghini', 'Lucid', 'Lotus', 'Karma', 'Bugatti', 'Polestar', 'Maybach'],
    'Marcas generalistas': ['Chevrolet', 'Ford', 'Toyota', 'Hyundai', 'Volkswagen', 'Buick', 'GMC', 'RAM', 'Jeep', 'Honda', 'Nissan', 'Kia', 'Mitsubishi', 'Mazda', 'Subaru', 'Chrysler', 'FIAT', 'Suzuki'],
    'Marcas menos comunes': ['MINI', 'Lincoln', 'Land', 'Rivian', 'Hummer', 'Alfa', 'Pontiac', 'Saturn', 'Scion', 'smart', 'Plymouth', 'Saab', 'Mercury']
}

# Invert the groups once into a flat brand -> segment lookup, instead of scanning
# every brand list for every row. setdefault keeps the first segment that lists a
# brand, which matches a first-match scan over `categorias`.
marca_a_categoria = {}
for categoria, marcas in categorias.items():
    for marca in marcas:
        marca_a_categoria.setdefault(marca, categoria)

# Segment names map to themselves so that re-running this cell leaves an already
# grouped column unchanged instead of turning every value into 'Desconocido'.
for categoria in categorias:
    marca_a_categoria.setdefault(categoria, categoria)

# Brands absent from the lookup (including missing values) fall back to 'Desconocido'.
for frame in (df_train, df_test):
    frame['brand'] = frame['brand'].map(marca_a_categoria).fillna('Desconocido')


In [None]:
# Left-closed mileage bands [lower, upper). The last edge is open-ended so the
# '400k+' label really covers every value from 400000 upwards; with a finite upper
# edge, anything at or above it would silently become NaN.
bins = [0, 50000, 100000, 150000, 200000, 250000, 400000, np.inf]
labels = ['0-50k', '50k-100k', '100k-150k', '150k-200k', '200k-250k', '250k-400k', '400k+']

for frame in (df_train, df_test):
    # Only bin while the column is still numeric, so re-running the cell is a no-op
    # rather than an error on the already-labelled column.
    if pd.api.types.is_numeric_dtype(frame['milage']):
        frame['milage'] = pd.cut(
            frame['milage'], bins=bins, labels=labels, right=False, include_lowest=True
        ).astype(object)  # plain strings, like the other categorical columns

# NOTE(review): the labels are stored as plain strings, so any encoder that sorts
# categories alphabetically will not preserve the mileage order — confirm this is acceptable.


In [None]:
# Display the training frame as it stands after the brand grouping and mileage binning above.
df_train

In [None]:
# Separate the target from the predictors: `y` is the price column,
# `X_train` is a copy of the training frame with that column removed.
X_train = df_train.copy()
y = X_train.pop('price')

# Conversión de las columnas de tipo objeto de la tabla a valores numéricos

In [None]:
# Boolean mask over the predictor columns: True where the dtype is `object`.
s = X_train.dtypes.eq('object')
object_cols = s[s].index.tolist()

# Fit a single encoder on train and test stacked together, so a category that
# appears in either frame receives a code.
ordinal_encoder = OrdinalEncoder().fit(
    pd.concat([X_train[object_cols], df_test[object_cols]])
)

# Overwrite the object columns of both frames with their ordinal codes.
X_train[object_cols] = ordinal_encoder.transform(X_train[object_cols])
df_test[object_cols] = ordinal_encoder.transform(df_test[object_cols])

In [None]:
# Check the dtype of every predictor column after the encoding step.
X_train.dtypes

In [None]:
# Preview the first rows of the encoded predictors.
X_train.head()

In [None]:


# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
# NOTE(review): `X_train` is both the input and an output of this call, so the cell
# is meant to run exactly once per kernel session — a second run would pass the
# already-reduced `X_train` together with the full-length `y`.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)




In [None]:
# Hyper-parameter grid: 3 * 3 * 4 * 4 * 2 = 288 candidate configurations.
param_grid = {
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100],
}

# Fixed seed: row/column subsampling is random, so pin it for repeatable results.
estimator = XGBRegressor(random_state=0)

# Rank candidates by RMSE rather than the regressor's implicit default score (R²),
# so model selection uses the same error measure reported on the hold-out below.
# NOTE(review): RMSE is assumed to be the playground-series-s4e9 leaderboard metric — confirm.
optimal_params = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,  # explicit fold count (same as the scikit-learn default)
    verbose=0,
)
# `verbose=False` is forwarded to the estimator's own `fit`.
optimal_params.fit(X_train, y_train, verbose=False)

# Best configuration, refitted on all rows of X_train.
model = optimal_params.best_estimator_

# Use the hold-out split, which no CV fold ever saw, as an independent check.
valid_rmse = float(np.sqrt(np.mean((model.predict(X_valid) - y_valid) ** 2)))
print(f'Best params: {optimal_params.best_params_}')
# best_score_ is the negated RMSE, hence the sign flip.
print(f'CV RMSE: {-optimal_params.best_score_:.2f} | hold-out RMSE: {valid_rmse:.2f}')

In [None]:


# Predict on exactly the columns the model was trained on, in the same order.
# Selecting them by name keeps this cell re-runnable: the `price` column added on
# the next line would otherwise be handed to `predict` as an extra feature on a
# second run.
predicciones = model.predict(df_test[X_train.columns])
df_test['price'] = predicciones



In [None]:


# Turn the `id` index back into a regular column and keep only the two columns
# that go into the submission file.
df_test_reset = df_test.reset_index()
df = df_test_reset.loc[:, ['id', 'price']]
df.head(10)



In [None]:
# Write the id/price table to `submission.csv` in the working directory, without the row index.
df.to_csv('submission.csv', index=False)