## Import Libraries

In [79]:
## for loading and preprocessing 
import pandas as pd
import numpy as np 

## for data visualization 
import matplotlib.pyplot as plt
import seaborn as sns

## splitting dataset
from sklearn.model_selection import train_test_split

## feature engineering 
from sklearn.feature_extraction import DictVectorizer

## load regression model 
from sklearn.linear_model import LinearRegression

## 
from sklearn.ensemble import RandomForestRegressor

## evaluation metric 
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

## Loading Data & Creating Copy

In [80]:
## read dataset


In [81]:
## create a deep copy 


## Data Preview And Understanding
- Loading the dataset

In [1]:
## view the first rows 


In [2]:
## view the last five rows 


In [8]:
## check the number of rows and columns 
print(f'Total number of rows: {} -> Total number of columns: {}')

In [6]:
## get a summary description of the data


In [5]:
## checking for missing values


In [4]:
## checking for duplicated values 


In [3]:
## checking type on columns


In [9]:
## finding unique instances in each column 


In [10]:
## finding unique instances in each column 


## Data preprocessing 
- replacing unsual characters with NaN values
- Drop the Id column
- Normalizing column names
- Removing extra text in numeric columns
- Standardize datatypes of columns
- Filling of missing 

In [None]:
## replace unusual characters


In [None]:
## drop the id column


In [94]:
## change column names to lower case and replace spaces with underscore


In [None]:
## remove extra text in numeric columns


In [None]:
## set appropriate datatype for each column


In [None]:
# fill-in missing values


## Descriptive Analysis
- statistical summary

In [11]:
## statistical summary of dataset


## Exploratory Data Analysis
- Target variable analysis
- Plot a coorelation againts the target variable
- Outlier analysis

In [13]:
## target variable analysis (distribution and outlier)


In [14]:
## performing a coorelation on numerical column
## select numerical features/columns


## compute correlation


## print or visualize correlation


## Data Preprocessing Step 2
- Removing outliers 
- replacing missing values

In [15]:
## removing ouliers 
## compute first(Q1) and third(Q3) quartiles

## compute the inter-quartile range
# IQR = Q3 - Q1

## define the lower and upper bound for outliers
# lower_bound = Q1 - 1.5 * IQR 
# upper_bound = Q3 - 1.5 * IQR 

## remove outliers from dataset using upper and lower bound values


## Building A Validation Framework / Splitting Dataset
- Training set 60 %
- Validation dataset 20%
- Test dataset 20%


In [16]:
## splitting data into training, validation and testing sets


In [None]:
## selecting target column and converting it to a vector format for each data set


In [None]:
# remove target column from each data set


## Regression Tasks
- This involves making quantitative predictions. 

### Regression ML Algorimths : Main ideas, Adavantages and disadvantages
- Linear Regression:
    - Main Idea: Linear regression aims to model the relationship between a dependent variable and one or more independent variables by fitting a linear equation to observed data.
    - Advantages:
        - Simple and easy to interpret.
        - Computational efficiency.
    - Disadvantages:
        - Assumes a linear relationship between variables, which may not hold in all cases.
        - Sensitive to outliers.

- Decision Trees:
    - Main Idea: Decision trees recursively split the data into subsets based on the value of input features, with the goal of predicting the target variable in each subset.
    - Advantages:
        - Can handle both numerical and categorical data.
        - Interpretability.
        - Can capture nonlinear relationships.
    - Disadvantages:
        - Prone to overfitting, especially with deep trees.
        - Can be unstable, sensitive to small variations in the data.

- Random Forest:
    - Main Idea: Random Forest is an ensemble learning method that constructs a multitude of decision trees at training time and outputs the average prediction (for regression tasks) or the mode of the predictions (for classification tasks) of the individual trees.
    - Advantages:
        - Reduced risk of overfitting compared to single decision trees.
        - High accuracy and robustness.
        - Can handle large datasets with high dimensionality.
    - Disadvantages:
        - Less interpretable compared to single decision trees.
        - Requires more computational resources and longer training time compared to simpler models like linear regression.

- Gradient Boosting Machines (GBM):
    - Main Idea: GBM builds an ensemble of weak learners (usually decision trees) sequentially, where each new model fits the residuals of the previous model, thereby reducing the error with each iteration.
    - Advantages:
        - High predictive accuracy.
        - Can capture complex relationships in the data.
        - Less prone to overfitting compared to other ensemble methods.
    - Disadvantages:
        - More sensitive to hyperparameter tuning compared to other algorithms.
        - Longer training time compared to simpler models.

- Support Vector Regression (SVR):
    - Main Idea: SVR performs regression by finding a hyperplane in the high-dimensional feature space that has the maximum margin, while penalizing deviations from the target variable beyond a certain threshold.
    - Advantages:
        - Effective in high-dimensional spaces.
        - Can handle non-linear relationships using kernel tricks.
        - Robust to overfitting, especially in high-dimensional spaces.
    - Disadvantages:
        - Computationally expensive, especially with large datasets.
        - Requires careful selection of hyperparameters and kernel functions.

## Train A Baseline Regression Model
- Selecting baseline features (pandas dataframe)
- convert dataframe to matrix/vector format
- Train a regression model

In [17]:
## selecting baseline features (features to be considered by the model)


In [18]:
## convert dataframes to pandas array 


In [19]:
## lets initial a model algorithm and train the model 


### Introduction To Error Evaluation Metrics
- Mean Absolute Error (MAE):

    - Main Idea: MAE measures the average absolute difference between the predicted values and the actual values.
    - Advantages:
        - Easy to interpret as it represents the average magnitude of errors.
        - Robust to outliers.
    - Disadvantages:
        - Does not penalize large errors more than small errors, which may be undesirable in some cases.

- Mean Squared Error (MSE):

    - Main Idea: MSE measures the average squared difference between the predicted values and the actual values.
    - Advantages:
        - Penalizes larger errors more than MAE, providing more sensitivity to large deviations.
        - Useful for optimization algorithms as it is differentiable.
    - Disadvantages:
        - Sensitive to outliers due to squaring the errors.
        - The scale of MSE is not in the same units as the target variable.

- Root Mean Squared Error (RMSE):

    - Main Idea: RMSE is the square root of the MSE, providing a measure of the average magnitude of errors in the same units as the target variable.
    - Advantages:
        - Provides an interpretable measure of the average prediction error in the original units of the target variable.
    - Disadvantages:
        - Similar to MSE, RMSE is sensitive to outliers and may not always provide a clear interpretation of error magnitude.

- R-squared (R²) Score:

    - Main Idea: R-squared measures the proportion of the variance in the target variable that is explained by the model. It ranges from 0 to 1, where higher values indicate a better fit.
    - Advantages:
        - Provides an intuitive measure of model goodness-of-fit.
        - Can be interpreted as the percentage of variance explained by the model.
    - Disadvantages:
        - Can be misleading when used with complex models, as R² tends to increase with the number of predictors, even if they are not relevant.

## Model Evaluation
- Generate the validation predictions
- Compare the validation predictions to the actual predictions
- Return an accuracy score
  

In [None]:
## generate validation baseline features 


## convert dataframe into numpy array


In [None]:
## generate our validation prediction


In [20]:
## compute the validation baseline accuracy for r-squared


In [21]:
## compute the validation baseline accuracy for 


In [22]:
## compute the validation baseline accuracy for 


## Feature Engineering 
- Convert categorical data to numerical using one-hot encoding 

In [None]:
## select categoricals 
categorical_features = ['manufacturer', 'model', 'category', 'leather_interior', 'fuel_type', 'gear_box_type', 'drive_wheels','wheel']
numerical_features = ['levy', 'prod._year', 'mileage','cylinders', 'airbags']

## conbine of dataframe
df_train_fe = df_train[categorical_features + numerical_features]

## convert our dataframe to a list of dict
train_dict_fe = df_train_fe.to_dict(orient='records')

In [None]:
## create an instance of the dict vectorizer 
dv = DictVectorizer()

dv.fit(train_dict_fe)

In [None]:
## 
X_train_fe = dv.transform(train_dict_fe) 

In [None]:
# print(dv.get_feature_names_out()) < feature_names.txt

## Training A New Model
- Trian a linear regression model
- Generate validation predictions
- Evaluate the performance of the model.


In [None]:
fe_model = LinearRegression()

fe_model.fit(X_train_fe, y_train)

In [None]:
## 
df_valid_fe = df_valid[numerical_features + categorical_features] 

## convert dataframe to diction
dict_valid_fe = df_valid_fe.to_dict(orient='records') 

X_valid_fe = dv.transform(dict_valid_fe)

In [None]:
## lets generate validation predictions
y_valid_predict = fe_model.predict(X_valid_fe) 


In [None]:
## compute error accuracy
fe_rsquared_acc = r2_score(y_valid, y_valid_predict) 

print(f'Model Accuracy: {fe_rsquared_acc}')

Model Accuracy: 0.000198407689372182


## Train A Ramdom Forest Model 
- Trian a random regression model
- Generate validation predictions
- Evaluate the performance of the model.

In [None]:
# Initialize the Random Forest Regressor
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
random_forest.fit(X_train_fe, y_train)


In [137]:
# generate validation predictions
y_valid_pred = random_forest.predict(X_valid_fe)

In [138]:
# Calculate R Squared Error
rsquared_acc = r2_score(y_valid, y_valid_pred)
print(round(rsquared_acc * 100, 2), '%')

60.2 %


In [139]:
# Calculate mean absolute error
mae = mean_absolute_error(y_valid, y_valid_pred)
print(round(mae * 100, 2), '%')

49.84 %


## Train A Decison tree Model 
- Trian a simple decision tree model model
- Generate validation predictions
- Evaluate the performance of the model.

## Train A XGBoost Model 
- Trian a simple xgboost model model
- Generate validation predictions
- Evaluate the performance of the model.

## Regularization 


In [None]:
from sklearn.linear_model import Ridge

# Create Ridge Regression model
ridge = Ridge(alpha=0.1)  # alpha is the regularization strength