<!DOCTYPE html>
<html>
<head>
<style>
    .header-banner {
        background-color: white;
        color: black; 
        padding: 1rem; 
        font-family: 'Nunito', sans-serif;
    }
    .header-content {
        max-width: 2000px;
        margin: 0 auto;
        display: flex;
        align-items: center;
        gap: 2rem;
    }
    .logo {
        max-width: 160px;
    }
    .text-content {
        flex: 1;
    }
    .text-content h1 {
        font-size: 34px;
        margin: 0 0 10px;
        font-weight: 700;
        color: #7e4d02ff;
        border-bottom: 2px solid #e5c120ff;
        padding-bottom: 10px;
    }
    .text-content h2 {
        font-size: 21px;
        margin: 0 0 5px;
        font-weight: 600;
        color: #222;
    }
    .member-list {
        display: grid;
        grid-template-columns: repeat(2, auto);
        gap: 6px 40px;
        font-size: 17px;
        color: #444;
    }
    .member {
        position: relative;
        padding-left: 20px;
    }
</style>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Nunito:wght@400;700&display=swap" rel="stylesheet">
</head>
<body>

<header class="header-banner">
    <div class="header-content">
        <img src="https://i.ibb.co/JBPWVYR/Logo-Nova-IMS-Black.png" alt="NOVA IMS Logo" class="logo">
        <div class="text-content">
            <h1>Cars 4 You: Expediting Car Evaluations with ML</h1>
            <h2>Group 37</h2>
            <div class="member-list">
                <div class="member">Filipa Pereira, 20240509</div>
                <div class="member">Gonçalo Silva, 20250354</div>
                <div class="member">Marta La Feria, 20211051 </div>
                <div class="member">Tomás Coroa, 20250394 </div>
            </div>
        </div>
    </div>
</header>

</body>
</html>

In [41]:
import pandas as pd
import numpy as np
import datetime

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# For models
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# For grid search
from sklearn.model_selection import GridSearchCV

# For plots
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# For reproducibility
np.random.seed(37)

In [43]:
# Load both learning and testing datasets
learning = pd.read_csv('../project_data/train.csv') #Let's call Learning the Training + Validation sets together
X_test = pd.read_csv('../project_data/test.csv')

In [None]:
# Data Cleaning and Feature Engineering Function
def clean_and_prepare_data(df, is_learning_set=True):
    print(f"\n*** Starting Data Cleaning for {'Learning' if is_learning_set else 'Test'} Set ***")
    print(f"Initial shape: {df.shape}")

    # 1. Set 'carID' as Index if unique
    if 'carID' in df.columns:
        if df['carID'].is_unique:
            print("'carID' is unique so it will be set as index.")
            df.set_index('carID', inplace=True)
        else:
            print("'carID' is not unique. Cannot be set as index.")
            # If not unique then drop it
            df.drop(columns=['carID'], inplace=True)





            

    # 2. Deal with inconsistencies in categorical variables
    categorical_cols_to_clean = ['Brand', 'transmission', 'fuelType']
    # A simple mapping for known typos in 'Brand'
    brand_corrections = {'bmw': 'BMW', 'vw': 'Volkswagen', 'merc': 'Mercedes-Benz', 'mw': 'BMW'}
    for col in categorical_cols_to_clean:
        if col in df.columns:
            df[col] = df[col].str.lower().str.strip() # Lowercase and remove whitespace
            if col == 'Brand':
                 df[col] = df[col].replace(brand_corrections)

    print("Standardized 'Brand', 'transmission', and 'fuelType' columns.")




    #4. Clean and Validate Specific Variables
    current_year = datetime.datetime.now().year

    # 4.1. Year and previousOwners
    if 'year' in df.columns:
        df['year'] = df['year'].apply(lambda x: abs(int(x)) if pd.notnull(x) else x)
        # Correct values outside the range [current_year-30, current_year]
        df['year'] = df['year'].clip(lower=current_year - 30, upper=current_year)

    if 'previousOwners' in df.columns:
        df['previousOwners'] = df['previousOwners'].apply(lambda x: abs(int(x)) if pd.notnull(x) else x)

    # 4.2. paintQuality%
    if 'paintQuality%' in df.columns:
        df['paintQuality%'] = df['paintQuality%'].clip(0, 100)

    # 4.3. Other numeric variables
    numeric_cols_to_abs = ['mileage', 'tax', 'mpg', 'engineSize']
    for col in numeric_cols_to_abs:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: abs(float(x)) if pd.notnull(x) else np.nan)

    print("Cleaned and validated numerical columns.")



    if is_learning_set:
        # 1.1. Drop exact duplicates
        df.drop_duplicates(inplace=True)
        print(f"Shape after dropping full duplicates: {df.shape}")
        
        # 1.2. Drop duplicates ignoring 'carID'
        subset_cols = list(df.columns)
        subset_cols.remove('carID')
        df.drop_duplicates(subset=subset_cols, inplace=True)
        print(f"Shape after dropping duplicates (ignoring carID): {df.shape}")
    
        # 1.3. Drop duplicates ignoring 'carID' and 'price'
        subset_cols.remove('price')
        df.drop_duplicates(subset=subset_cols, inplace=True)
        print(f"Shape after dropping duplicates (ignoring carID and price): {df.shape}")



    # 5. Feature Engineering
    # 5.1. Car Age
    if 'year' in df.columns:
        df['car_age'] = current_year - df['year']
        df.drop(columns=['year'], inplace=True) # Drop original year column

    # 5.2. Brand and Model concatenation
    if 'Brand' in df.columns and 'model' in df.columns:
        df['brand_model'] = df['Brand'].astype(str) + '_' + df['model'].astype(str)
        # We can keep original columns as they might be useful on their own
        
    print("Created new features: 'car_age' and 'brand_model'.")

    # --- 6. Drop Rows with High Percentage of Missing Values ---
    min_non_missing = int(np.ceil(df.shape[1] * (1 - 0.4))) # At least 60% non-missing
    rows_before_drop = df.shape[0]
    df.dropna(thresh=min_non_missing, inplace=True)
    rows_after_drop = df.shape[0]
    print(f"Dropped {rows_before_drop - rows_after_drop} rows with >= 40% missing values.")




    # --- 7. Handle Outliers using IQR method ---
    ###APAGAR### This should be done after cleaning and imputation might be a better strategy in the pipeline
    # But as requested, here is a method to cap them.
    numeric_features = df.select_dtypes(include=np.number).columns.tolist()
    if 'price' in numeric_features and is_learning_set:
        numeric_features.remove('price') # Don't cap the target variable

    for col in numeric_features:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

    print("Capped outliers in numerical columns using the IQR method.")
    print(f"Final shape after cleaning: {df.shape}")
    print("--- End of Data Cleaning ---")
    return df

In [46]:
# Apply the cleaning function to both datasets
learning_cleaned = clean_and_prepare_data(learning, is_learning_set=True)
test_cleaned = clean_and_prepare_data(X_test, is_learning_set=False)

# Inspect Unique Values
print("\n\033[1mUnique values for each column (up to 100) after cleaning:\033[0m")
# Let's inspect the learning set. Test set should be similar.
for col in learning_cleaned.columns:
    unique_vals = learning_cleaned[col].unique()
    num_unique = len(unique_vals)
    if num_unique > 100:
        print(f"{col} ({num_unique}): [More than 100 unique values, not shown]\n")
    else:
        print(f"{col} ({num_unique}): {unique_vals}\n")



--- Starting Data Cleaning for Learning Set ---
Initial shape: (75973, 14)
Shape after dropping full duplicates: (75973, 14)
Shape after dropping duplicates (ignoring carID): (75969, 14)
Shape after dropping duplicates (ignoring carID and price): (75962, 14)
'carID' is unique in the learning set. Setting it as index.
Standardized 'Brand', 'transmission', and 'fuelType' columns.
Cleaned and validated numerical columns.
Created new features: 'car_age' and 'brand_model'.
Dropped 0 rows with >= 40% missing values.
Capped outliers in numerical columns using the IQR method.
Final shape after cleaning: (75962, 14)
--- End of Data Cleaning ---

--- Starting Data Cleaning for Test Set ---
Initial shape: (32567, 13)
Standardized 'Brand', 'transmission', and 'fuelType' columns.
Cleaned and validated numerical columns.
Created new features: 'car_age' and 'brand_model'.
Dropped 1 rows with >= 40% missing values.
Capped outliers in numerical columns using the IQR method.
Final shape after cleaning:

In [47]:
# separate features and target variable from cleaned learning set
X_learning = learning_cleaned.drop('price', axis=1)
y_learning = learning_cleaned['price']

In [48]:
# Align columns - very important!
# The test set might not have the same columns as the learning set after one-hot encoding
# We will align them after fitting the preprocessor
X_test = test_cleaned

In [49]:
###APAGAR### Só para tornar simples este primeiro modelo
X_learning = X_learning.drop(['model', 'transmission'], axis=1)
X_test = X_test.drop(['model', 'transmission'], axis=1)

In [52]:
# Define the preprocessing pipeline

# Identify numeric and categorical columns
numeric_features = X_learning.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_learning.select_dtypes(exclude=np.number).columns.tolist()
print("Numeric columns:", numeric_features)
print("Categorical columns:", categorical_features)

# Pipeline for numerical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Pipeline for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)




# Define models and their parameter grids
models_to_test = {
    'Ridge': {
        'model': Ridge(),
        'params': {
            'regressor__alpha': [0.1, 1.0, 10.0, 100.0, 200.0]
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(random_state=37),
        'params': {
            'regressor__n_estimators': [100, 200],
            'regressor__max_depth': [10, 20],
            'regressor__min_samples_split': [2, 5]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(random_state=37),
        'params': {
            'regressor__n_estimators': [100, 200],
            'regressor__learning_rate': [0.05, 0.1],
            'regressor__max_depth': [3, 5]
        }
    }
}

best_score = -np.inf
best_estimator = None
best_model_name = ""


# Iterate through each model, create a pipeline, and run GridSearchCV
for name, info in models_to_test.items():
    print(f"\n{'='*20} Running GridSearchCV for {name} {'='*20}")

    # Create the full pipeline with the current model
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', info['model'])
    ])

    # Create and run GridSearchCV
    grid_search = GridSearchCV(
        full_pipeline,
        info['params'],
        cv=5,
        scoring='neg_root_mean_squared_error',
        verbose=1,
        n_jobs=-1
    )

    grid_search.fit(X_learning, y_learning)

    print(f"\nResults for {name}:")
    print(f"Best parameters found: {grid_search.best_params_}")
    # The score is negative, so we negate it to make it positive RMSE
    print(f"Best CV RMSE score: {-grid_search.best_score_:.4f}")

    # Check if this model is the best one so far
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_estimator = grid_search.best_estimator_
        best_model_name = name

print(f"\n{'='*20} Overall Best Model: {best_model_name} {'='*20}")
print(f"Best overall CV RMSE: {-best_score:.4f}")

# --- Make Final Predictions ---
print("\nMaking predictions on the submission test data using the best model...")
# Ensure test set columns are in the same order as training set before predicting
# The pipeline handles this internally, but it's good practice
final_predictions = best_estimator.predict(X_test)

print("Predictions generated successfully.")

Numeric columns: ['mileage', 'tax', 'mpg', 'engineSize', 'paintQuality%', 'previousOwners', 'hasDamage', 'car_age']
Categorical columns: ['Brand', 'fuelType', 'brand_model']

Fitting 5 folds for each of 5 candidates, totalling 25 fits

Results for Ridge:
Best parameters found: {'regressor__alpha': 1.0}
Best CV RMSE score: 4091.4736

Fitting 5 folds for each of 8 candidates, totalling 40 fits

Results for RandomForest:
Best parameters found: {'regressor__max_depth': 20, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Best CV RMSE score: 2755.4158

Fitting 5 folds for each of 8 candidates, totalling 40 fits

Results for GradientBoosting:
Best parameters found: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 5, 'regressor__n_estimators': 200}
Best CV RMSE score: 2987.7328

Best overall CV RMSE: 2755.4158

Making predictions on the submission test data using the best model...
Predictions generated successfully.


In [53]:
# RANDOMCV É MELHOR

# Quando você chama grid_search.fit(X_learning, y_learning), o GridSearchCV pega nos dados de treino e validação (X_learning) e executa o processo de validação cruzada internamente.
# O parâmetro cv=5 instrui o GridSearchCV a fazer o seguinte:
# Ignorar X_test: Ele só vai olhar para os dados que lhe foram passados: X_learning.
# Dividir em 5 "Folds": Ele divide X_learning em 5 pedaços de tamanho aproximadamente igual.
# [Fold 1 | Fold 2 | Fold 3 | Fold 4 | Fold 5]
# Iterar 5 Vezes: Para cada combinação de hiperparâmetros que você definiu no param_grid, ele vai repetir o seguinte processo 5 vezes:
# Corrida 1: Treina o modelo usando os Folds 2, 3, 4 e 5. Depois, valida (mede a pontuação) no Fold 1.
# Corrida 2: Treina o modelo usando os Folds 1, 3, 4 e 5. Depois, valida no Fold 2.
# Corrida 3: Treina o modelo usando os Folds 1, 2, 4 e 5. Depois, valida no Fold 3.
# Corrida 4: Treina o modelo usando os Folds 1, 2, 3 e 5. Depois, valida no Fold 4.
# Corrida 5: Treina o modelo usando os Folds 1, 2, 3 e 4. Depois, valida no Fold 5.
# Calcular a Média: No final destas 5 corridas, ele calcula a média das 5 pontuações de validação. Este valor médio é a "pontuação de validação cruzada" (CV score) para aquela combinação de hiperparâmetros.
# Encontrar o Melhor: O GridSearchCV faz isto para todas as combinações ({'C': 0.01, 'penalty': 'l1'}, {'C': 0.01, 'penalty': 'l2'}, etc.). No final, ele compara todas as pontuações médias e determina qual a combinação de hiperparâmetros que teve o melhor desempenho.
# Treino Final: Depois de encontrar os melhores hiperparâmetros, o GridSearchCV automaticamente treina um novo modelo, do zero, usando todos os dados de X_train_val com esses hiperparâmetros ótimos. Este modelo final é o que fica guardado em grid_search.best_estimator_.

In [None]:
#ALTERNATIVAS
# POSSIVELMENTE DEIXAR UNKNOWN E PÔR OS MISSINGS A UNKNOWN OU PREENCHER COM A MODA/MEDIAN OU COM KNN
# paintQuality%<0 talvez usar abs antes de clip

In [None]:
# O que fiz até agora:
# -  Set Index<- carID
# - inconsistencies de variáveis categóricas

In [None]:
# O que tenho ainda que fazer:
# - apresentar CRISP-DM, flowchart (gráficos+img), index (se necessário), papers e referências sobre tudo o que não demos, requirements no github
# - comentar todo o código e justificar (por exemplo dizer que fizémos CV...), abstract e group members contributions
# - ver qual o erro cometido no test
# - ver projetos dos anos anteriores
# ------
# - outliers
# - missings
# - inconsistências
# - duplicados
# - feature selection
# - medidas de performance
# - EDA