In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the used-cars exercise dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("../Datasets/carros_usados_exercicio.csv")
# Preview the first rows.
df.head()

In [None]:
# Summary statistics of the frame (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Summary of the remaining columns: every numeric dtype is excluded.
df.describe(exclude=np.number)

In [None]:
# Inspect the dtype of every column.
df.dtypes

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# NOTE(review): isnull() is an alias of isna(), so this repeats the missing-value
# count of the previous cell.
df.isnull().sum()

# Tratamento do atributo Unnamed: 0

In [None]:
# List the column names.
df.columns

In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index written out when the
# CSV was saved -- confirm against the data source).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

# Tratamento do atributo model

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Replace every distinct 'model' string with a numeric code.
# NOTE(review): the encoder is fit on the whole frame, before the train/test split below.
enc=OrdinalEncoder()
df['model']=enc.fit_transform(df[['model']])
df.head()

# Tratamento do atributo year

In [None]:
# Name of the column inspected by the next cell.
atributo='year'

In [None]:
# Distribution of the selected attribute, with missing values removed first.
sns.displot(df[atributo].dropna())

# Tratamento do atributo transmission

In [None]:
# Name of the column handled by the following cells.
atributo='transmission'

In [None]:
# List the categories of the selected attribute with their row counts (NaN
# included), so the encoding map can cover every value.
# This replaces groupby(...).sum(), which summed all the other columns per
# category (and concatenates string columns on recent pandas) -- none of which
# says anything about the categories themselves.
df[atributo].value_counts(dropna=False)

In [None]:
# Remember the integer position of the column; the encoded replacement is
# inserted at this same position in the next cell.
posicao_desejada = df.columns.get_loc(atributo)
posicao_desejada

In [None]:
# Integer code assigned to each transmission category.
# NOTE(review): these codes impose an order on a nominal attribute -- confirm
# this is acceptable for the models trained below.
mapa_transmission = { 'Automatic':1, 'Semi-Auto':2,'Manual':3,'Other':4}
nome_coluna = 'transmission_encoded'

# Series.map silently turns any category missing from the map into NaN, which
# would only surface later as a failure when the models are fit. Stop here
# with a clear message instead.
nao_mapeados = set(df[atributo].dropna().unique()) - set(mapa_transmission)
if nao_mapeados:
    raise ValueError(
        f"Categories of '{atributo}' missing from the map: {sorted(map(str, nao_mapeados))}"
    )

# Swap the text column for its encoded version, keeping the same position.
coluna = df[atributo].map(mapa_transmission)
df = df.drop(columns=[atributo])
df.insert(posicao_desejada, nome_coluna, coluna)
# Show a preview rather than the whole frame.
df.head()

# Tratamento do atributo fuelType

In [None]:
# Select the next attribute and remember where its column sits, so the encoded
# replacement can be inserted at the same position.
atributo = 'fuelType'
posicao_desejada = df.columns.get_loc(atributo)
posicao_desejada

In [None]:
# List the categories of the selected attribute with their row counts (NaN
# included), so the encoding map can cover every value.
# This replaces groupby(...).sum(), which summed all the other columns per
# category (and concatenates string columns on recent pandas) -- none of which
# says anything about the categories themselves.
df[atributo].value_counts(dropna=False)

In [None]:
# Integer code assigned to each fuel-type category.
# NOTE(review): these codes impose an order on a nominal attribute -- confirm
# this is acceptable for the models trained below.
mapa_fuelType = { 'Petrol':1,'Diesel':2,'Hybrid':3,'Electric':4,'Other':5}
nome_coluna = 'fuelType_encoded'

# Series.map silently turns any category missing from the map into NaN, which
# would only surface later as a failure when the models are fit. Stop here
# with a clear message instead.
nao_mapeados = set(df[atributo].dropna().unique()) - set(mapa_fuelType)
if nao_mapeados:
    raise ValueError(
        f"Categories of '{atributo}' missing from the map: {sorted(map(str, nao_mapeados))}"
    )

# Swap the text column for its encoded version, keeping the same position.
coluna = df[atributo].map(mapa_fuelType)
df = df.drop(columns=[atributo])
df.insert(posicao_desejada, nome_coluna, coluna)
# Show a preview rather than the whole frame.
df.head()

# Tratamento do atributo Make

In [None]:
# Select the next attribute and remember where its column sits, so the encoded
# replacement can be inserted at the same position.
atributo = 'Make'
posicao_desejada = df.columns.get_loc(atributo)
posicao_desejada

In [None]:
# List the categories of the selected attribute with their row counts (NaN
# included), so the encoding map can cover every value.
# This replaces groupby(...).sum(), which summed all the other columns per
# category -- none of which says anything about the categories themselves.
df[atributo].value_counts(dropna=False)

In [None]:
# Integer code assigned to each manufacturer. The keys are matched exactly
# (case-sensitive), so they must be spelled as they appear in the data.
# NOTE(review): these codes impose an order on a nominal attribute -- confirm
# this is acceptable for the models trained below.
mapa_Make = { 'BMW':1,'Ford':2,'Hyundai':3,'audi':4,'skoda':5,'toyota':6,'vw':7 }
nome_coluna = 'Make_encoded'

# Series.map silently turns any category missing from the map into NaN, which
# would only surface later as a failure when the models are fit. Stop here
# with a clear message instead.
nao_mapeados = set(df[atributo].dropna().unique()) - set(mapa_Make)
if nao_mapeados:
    raise ValueError(
        f"Categories of '{atributo}' missing from the map: {sorted(map(str, nao_mapeados))}"
    )

# Swap the text column for its encoded version, keeping the same position.
coluna = df[atributo].map(mapa_Make)
df = df.drop(columns=[atributo])
df.insert(posicao_desejada, nome_coluna, coluna)
# Show a preview rather than the whole frame.
df.head()

# Determinar valor do carro baseado nos dados do veículo

In [None]:
# Regression target: the price column.
y = df['price']
# Feature matrix: every remaining column.
X = df.drop(columns=['price'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=24)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear-regression model on the training split.
lr =LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict the price for every row of the hold-out split.
lr_pred= lr.predict(X_test)

In [None]:
# Predicted vs. actual price on the hold-out split. The axes are labelled so
# the figure can be read on its own (x = actual, y = predicted).
plt.scatter(y_test, lr_pred, s=1)
plt.xlabel("Actual price")
plt.ylabel("Predicted price")
plt.title("Linear Regression: predicted vs. actual");

In [None]:
# Distribution of the residuals on the hold-out split. Without an explicit
# label the x axis inherits the Series name 'price', although it shows
# actual - predicted.
sns.displot(y_test - lr_pred).set_axis_labels("Residual (actual - predicted)", "Count");

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_squared_error as mse

In [None]:
# Hold-out metrics of the linear model. The MSE is computed once and reused
# for the RMSE.
erro_quadratico = mse(y_test, lr_pred)
print("R2: ", r2_score(y_test, lr_pred))
print("MAE: ", mae(y_test, lr_pred))
print("MSE: ", erro_quadratico)
print("RMSE: ", np.sqrt(erro_quadratico))

In [None]:
def evaluate(reais, previstos):
    """Compute the regression metrics for a set of predictions.

    Args:
        reais: True target values.
        previstos: Predicted target values.

    Returns:
        Tuple ``(MAE, MSE, RMSE, R2)``, each rounded to two decimals.
    """
    erro_absoluto = mae(reais, previstos)
    erro_quadratico = mse(reais, previstos)
    metricas = (
        erro_absoluto,
        erro_quadratico,
        np.sqrt(erro_quadratico),  # RMSE is derived from the MSE
        r2_score(reais, previstos),
    )
    return tuple(round(valor, 2) for valor in metricas)

def print_evaluate(model, reais, previstos):
    """Print the metrics of a model and record them in ``results_df``.

    Args:
        model: Label stored in the 'Model' column of the results table.
        reais: True target values.
        previstos: Predicted target values.

    Side effects:
        Prints MAE, RMSE and R2, and appends one row
        (label, MAE, MSE, RMSE, R2) to the module-level ``results_df``.
    """
    # Local names that do not shadow the imported mae/mse metric functions.
    mae_val, mse_val, rmse_val, r2_val = evaluate(reais, previstos)
    print("------------------------------------------")
    print("MAE:", mae_val)
    print("RMSE:", rmse_val)
    print("R2 score:", r2_val)
    print("------------------------------------------")

    # A plain list keeps the label a str and the metrics numeric. Wrapping the
    # row in np.array([...]) coerced every element to a string, so the table
    # stored the metrics as text.
    results_df.loc[len(results_df) + 1] = [model, mae_val, mse_val, rmse_val, r2_val]

# Results table: print_evaluate appends one row of metrics per evaluated model.
results_df = pd.DataFrame(columns=['Model', 'MAE','MSE','RMSE','R2'])

In [None]:
from time import time
from sklearn.model_selection import GridSearchCV

def grid_search(model, params):
    """Run an exhaustive 5-fold grid search on the training split.

    The search is fit on the module-level ``X_train`` / ``y_train``. The
    elapsed time and the best parameter combination are printed.

    Args:
        model: Estimator to tune.
        params: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The ``best_params_`` dict of the fitted search.
    """
    busca = GridSearchCV(model, params, cv=5, n_jobs=-1, verbose=2)

    inicio = time()
    busca.fit(X_train, y_train)
    print("Grid time: ", time() - inicio)

    melhores = busca.best_params_
    print(melhores)
    return melhores

In [None]:
# Record the Linear Regression metrics in the results table.
print_evaluate("Linear Regression", y_test,lr_pred)

## Random Forest

In [None]:
from sklearn.ensemble import  RandomForestRegressor

In [None]:
# Baseline random forest: only the seed is set, all other hyperparameters keep their defaults.
rf_reg= RandomForestRegressor(random_state=24)

rf_reg.fit(X_train, y_train)

# Predictions on the hold-out split.
rf_pred = rf_reg.predict(X_test)

In [None]:
print("Random Forest Regressor:")
# Label fixed: these are the random-forest predictions, but the row was being
# recorded in results_df as "Linear Regression".
print_evaluate("Random Forest", y_test, rf_pred)

In [None]:
# Show the hyperparameters the baseline forest was built with.
rf_reg.get_params()

## Random Forest Otimizada

In [None]:
model = RandomForestRegressor(random_state=24)
# Search space for the tuned forest.
# max_features=None is intentionally not listed: for RandomForestRegressor it
# means "use all features", exactly like 1.0, so it only repeated a quarter of
# the fits. The grid is now 3 * 3 * 3 * 2 = 54 candidates (x 5 CV folds)
# instead of 72.
params = {
    'n_estimators': [100, 500, 1000],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 5, 10],
    'criterion': ['squared_error', 'poisson'],
}

best_params = grid_search(model, params)
best_params

In [None]:
# Build a new forest with the best hyperparameters found by the grid search
# and score it on the hold-out split. Unpacking best_params keeps this cell in
# sync with whatever keys the grid contains.
rf_reg_2 = RandomForestRegressor(random_state=24, **best_params)
rf_reg_2.fit(X_train, y_train)
rf_pred_2 = rf_reg_2.predict(X_test)
print("Random Forest Regressor (optimized):")
# Label fixed: the tuned forest was being recorded in results_df as
# "Linear Regression".
print_evaluate("Random Forest Optimized", y_test, rf_pred_2)