In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer

In [4]:
def read_data_from_csv(file_path):
    """
    Reads the data from a CSV file and returns a pandas DataFrame.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pandas DataFrame: DataFrame containing the data.
    """
    df = pd.read_csv(file_path)
    return df

# Read data from CSV file
file_path = 'auto-mpg.csv'
df = read_data_from_csv(file_path)

In [5]:
def preprocess_data(data):
    """
    Performs data preprocessing steps on the DataFrame.

    Args:
        df (pandas DataFrame): DataFrame containing the data.

    Returns:
        pandas DataFrame: Preprocessed DataFrame.
    """
    data['horsepower'].replace('?', np.NaN, inplace=True)
    data['horsepower'] = data['horsepower'].astype(float)
    median_value = data['horsepower'].median()
    data['horsepower'].fillna(median_value, inplace=True)

    data.drop('car name', axis=1, inplace=True)
    
    data['displacement_per_cylinder'] = data['displacement'] / data['cylinders']
    data['power_to_weight_ratio'] = data['horsepower'] / data['weight']
    data = data[['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
             'model year', 'origin', 'power_to_weight_ratio', 'displacement_per_cylinder']]
    return data


# Preprocess data
df = preprocess_data(df)

In [6]:
def augment_data(data):
    """
    Generates augmented synthetic data based on the given data.

    Args:
        data (pandas DataFrame): DataFrame containing the original data.

    Returns:
        pandas DataFrame: Augmented DataFrame with synthetic data.
    """
    unique_mpg_values = data['mpg'].unique()
    new_data = []

    for mpg_value in unique_mpg_values:
        instances = data[data['mpg'] == mpg_value]
        instances_sorted = instances.sort_values('cylinders', ascending=True)

        if len(instances_sorted) > 1:
            for i in range(len(instances_sorted) - 1):
                current_instance = instances_sorted.iloc[i]
                next_instance = instances_sorted.iloc[i + 1]

                mean_displacement = np.mean([current_instance['displacement'], next_instance['displacement']])
                mean_horsepower = np.mean([current_instance['horsepower'], next_instance['horsepower']])
                mean_weight = np.int32(np.mean([current_instance['weight'], next_instance['weight']]))
                mean_acceleration = np.mean([current_instance['acceleration'], next_instance['acceleration']])

                mean_displacement += np.random.normal(0, 0.1)
                mean_horsepower += np.random.normal(0, 0.1)
                mean_weight += np.random.normal(0, 10)
                mean_acceleration += np.random.normal(0, 0.1)

                mean_displacement = round(mean_displacement, 2)
                mean_horsepower = round(mean_horsepower, 2)
                mean_weight = int(round(mean_weight))
                mean_acceleration = round(mean_acceleration, 2)

                power_to_weight_ratio = mean_horsepower / mean_weight
                displacement_per_cylinder = mean_displacement / current_instance['cylinders']

                new_instance = {
                    'mpg': current_instance['mpg'],
                    'cylinders': current_instance['cylinders'],
                    'displacement': mean_displacement,
                    'horsepower': mean_horsepower,
                    'weight': mean_weight,
                    'acceleration': mean_acceleration,
                    'model year': current_instance['model year'],
                    'origin': current_instance['origin'],
                    'power_to_weight_ratio': power_to_weight_ratio,
                    'displacement_per_cylinder': displacement_per_cylinder
                }

                new_data.append(new_instance)

    return pd.DataFrame(new_data)

# Augment data
augmented_df = augment_data(df)

In [7]:
def train_linear_regression(X_train, y_train):
    """
    Trains a Linear Regression model on the given training data.

    Args:
        X_train (pandas DataFrame): Features of the training data.
        y_train (pandas Series): Target variable of the training data.

    Returns:
        LinearRegression: Trained Linear Regression model.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

# Split the augmented data into features (X_augmented) and target variable (y_augmented)
X_augmented = augmented_df.iloc[:, 1:]
y_augmented = augmented_df['mpg']

# Split the original data into features (X) and target variable (y)
X = df.iloc[:, 1:]
y = df['mpg']

# Split the augmented data into training and testing sets
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = train_test_split(
    X_augmented, y_augmented, test_size=0.2, random_state=42
)

# Train linear regression on augmented data
linear_reg_model = train_linear_regression(X_train_augmented, y_train_augmented)
linear_reg_test_acc = linear_reg_model.score(X, y)

In [8]:
def standardize_data(data, columns_to_normalize):
    """
    Standardizes the selected columns in the given DataFrame using StandardScaler.

    Args:
        df (pandas.DataFrame): The input DataFrame.
        columns_to_normalize (list): The columns to be standardized.

    Returns:
        pandas.DataFrame: The standardized DataFrame.
    """
    scaler = StandardScaler()
    data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])
    return df

# Standardize the combined data
combined_df = pd.concat([augmented_df, df], ignore_index=True)
columns_to_normalize = ['displacement', 'horsepower', 'weight', 'acceleration', 'power_to_weight_ratio',
                        'displacement_per_cylinder']
combined_df = standardize_data(combined_df, columns_to_normalize)

In [9]:
def train_ridge_regression(X_train, y_train):
    """
    Trains a Ridge Regression model on the given training data.

    Args:
        X_train (pandas DataFrame): Features of the training data.
        y_train (pandas Series): Target variable of the training data.

    Returns:
        Ridge: Trained Ridge Regression model.
    """
    model = Ridge()
    model.fit(X_train, y_train)
    return model


def train_lasso_regression(X_train, y_train):
    """
    Trains a Lasso Regression model on the given training data.

    Args:
        X_train (pandas DataFrame): Features of the training data.
        y_train (pandas Series): Target variable of the training data.

    Returns:
        Lasso: Trained Lasso Regression model.
    """
    model = Lasso()
    model.fit(X_train, y_train)
    return model


def train_random_forest_regressor(X_train, y_train):
    """
    Trains a Random Forest Regressor model on the given training data.

    Args:
        X_train (pandas DataFrame): Features of the training data.
        y_train (pandas Series): Target variable of the training data.

    Returns:
        RandomForestRegressor: Trained Random Forest Regressor model.
    """
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    return model


# Split the combined data into features (X) and target variable (y)
X = combined_df.iloc[:, 1:]
y = combined_df['mpg']

# Split the combined data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train different models on the combined data
linear_reg_model = train_linear_regression(X_train, y_train)
ridge_reg_model = train_ridge_regression(X_train, y_train)
lasso_reg_model = train_lasso_regression(X_train, y_train)
rf_reg_model = train_random_forest_regressor(X_train, y_train)

In [11]:
def train_gradient_boosting_regressor(X_train, y_train, base_estimator):
    """
    Trains a Gradient Boosting Regressor model with the given base estimator on the training data.

    Args:
        X_train (pandas DataFrame): Features of the training data.
        y_train (pandas Series): Target variable of the training data.
        base_estimator: Base estimator used by the Gradient Boosting Regressor.

    Returns:
        GradientBoostingRegressor: Trained Gradient Boosting Regressor model.
    """
    model = GradientBoostingRegressor(init=base_estimator)
    model.fit(X_train, y_train)
    return model

# Train gradient boosting with random forest as base estimator
gb_rf_reg_model = train_gradient_boosting_regressor(X_train, y_train, rf_reg_model)

In [13]:
def perform_hyperparameter_tuning(X_train, y_train, base_estimator):
    """
    Performs hyperparameter tuning for Gradient Boosting Regressor with the given base estimator.

    Args:
        X_train (pandas DataFrame): Features of the training data.
        y_train (pandas Series): Target variable of the training data.
        base_estimator: Base estimator used by the Gradient Boosting Regressor.

    Returns:
        tuple: Best model, train accuracy, and test accuracy.
    """
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [2, 3, 4],
        'subsample': [0.7, 0.8, 0.9],
    }

    scorer = make_scorer(lambda y_true, y_pred: np.corrcoef(y_true, y_pred)[0, 1] ** 2)

    random_search = RandomizedSearchCV(
        GradientBoostingRegressor(init=base_estimator),
        param_distributions=param_grid,
        n_iter=10,
        scoring=scorer,
        cv=5,
        random_state=42,
    )

    random_search.fit(X_train, y_train)

    best_model = random_search.best_estimator_
    best_model_train_acc = best_model.score(X_train, y_train)
    best_model_test_acc = best_model.score(X_test, y_test)

    return best_model, best_model_train_acc, best_model_test_acc

# Perform hyperparameter tuning on gradient boosting
best_gb_model, best_gb_train_acc, best_gb_test_acc = perform_hyperparameter_tuning(X_train, y_train, rf_reg_model)

In [14]:
# Create a dictionary to store the model results
results = {'Model': ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Random Forest Regressor',
                     'Gradient Boosting with Random Forest', 'Best Gradient Boosting'],
           'Train Accuracy': [linear_reg_model.score(X_train, y_train), ridge_reg_model.score(X_train, y_train),
                              lasso_reg_model.score(X_train, y_train), rf_reg_model.score(X_train, y_train),
                              gb_rf_reg_model.score(X_train, y_train), best_gb_train_acc],
           'Test Accuracy': [linear_reg_model.score(X_test, y_test), ridge_reg_model.score(X_test, y_test),
                             lasso_reg_model.score(X_test, y_test), rf_reg_model.score(X_test, y_test),
                             gb_rf_reg_model.score(X_test, y_test), best_gb_test_acc]}

# Create a pandas DataFrame from the results dictionary
df_results = pd.DataFrame(results)

# Display the results
print(df_results)

                                  Model  Train Accuracy  Test Accuracy
0                     Linear Regression        0.853421       0.841062
1                      Ridge Regression        0.848730       0.853072
2                      Lasso Regression        0.807823       0.828470
3               Random Forest Regressor        0.981215       0.904482
4  Gradient Boosting with Random Forest        0.994793       0.904392
5                Best Gradient Boosting        0.985611       0.909080


