## Class 7 
### Predictive models with the car database

Our objective is to predict the price a buyer is willing to pay for a car, based on the buyer's profile.

In [0]:
# Import necessary libraries for data handling, machine learning model building, preprocessing, evaluation, and visualization.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [0]:
# Read the simulated car-buyer records from their CSV location into a DataFrame.
CARS_CSV_PATH = '/dbfs/FileStore/CDS2024/cars.csv'
df = pd.read_csv(CARS_CSV_PATH)

In [0]:
# DataFrame dimensionality, shown as a (number of rows, number of columns) tuple.
df.shape

In [0]:
# Preview the first five records to check the column layout and typical values.
df.head(5)

### Data Description:
Simulated data with the following columns:
 - age: buyer's age
 - gender: buyer's gender (0 = Male, 1 = Female)
 - avr_per_day: average miles driven per day by the buyer
 - debt: amount of buyer's debt
 - income: buyer's income
 - price: price paid

In [0]:
# Feature matrix: every column except the target 'price', converted to a NumPy array.
feature_columns = df.drop(columns='price')
X = feature_columns.values
X

In [0]:
# Target vector: the 'price' column, converted to a NumPy array.
price_column = df.loc[:, 'price']
y = price_column.values
y

In [0]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each partition as a quick sanity check.
for caption, partition in (
    ("Dimensions of X_train:", X_train),
    ("Dimensions of X_test:", X_test),
    ("Dimensions of y_train:", y_train),
    ("Dimensions of y_test:", y_test),
):
    print(caption, partition.shape)

In [0]:
# Standardize the features: important for many machine learning models.
scaler = StandardScaler()
# Learn the scaling statistics from the training rows only ...
scaler.fit(X_train)
# ... and apply those same statistics to both partitions, so nothing from the
# test set influences the preprocessing.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled

In [0]:
# Fit a linear regression on the scaled training features against the training prices.
model = LinearRegression().fit(X_train_scaled, y_train)
# fit() returns the estimator itself, so showing `model` displays the same object.
model

In [0]:
# Predict prices for the held-out test rows, using the test features that were
# scaled with the scaler fitted on the training data.
y_pred = model.predict(X_test_scaled)
y_pred

In [0]:
# R-squared of the test-set predictions against the actual test prices.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

In [0]:
# Scatter the actual test prices against the model's predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, color='blue', label='Predicted vs. Actual')
# Dashed reference diagonal across the range of actual values: a point on this
# line is a perfect prediction.
lowest, highest = min(y_test), max(y_test)
ax.plot([lowest, highest], [lowest, highest], color='red', linestyle='--', label='Perfect Fit')
ax.set_title(f'Linear Regression Model\nR-squared: {r2:.2f}')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.legend()
plt.show()

In [0]:
# Profile of a fictitious buyer whose price we want to estimate.
age = 28            # buyer's age
gender = 0          # 0 = Male, 1 = Female
avr_per_day = 20    # average miles driven per day
debt = 0            # amount of debt
income = 100000     # income
# Arrange the profile as a single-row, five-column feature matrix.
# NOTE(review): the value order is assumed to match the feature column order of
# the training data — confirm against df.columns.
buyer_profile = [age, gender, avr_per_day, debt, income]
new_X = np.array(buyer_profile).reshape(1, -1)
new_X

In [0]:
# Scale the new buyer's features with the scaler fitted on the training data,
# then predict the price with the trained model.
new_X_scaled = scaler.transform(new_X)
new_y_pred = model.predict(new_X_scaled)
# predict() returns an array with one entry per input row. Pull out the single
# value explicitly: formatting the array itself with %f relies on implicit
# array-to-scalar conversion, which NumPy deprecated (1.25+) for arrays with
# ndim > 0.
estimated_price = float(new_y_pred[0])
print('Age: %i'%(age))
print('Gender: %s'%('Male' if gender == 0 else 'Female'))
print('Average Miles per Day: %i'%(avr_per_day))
print('Debt ($): %i'%(debt))
print('Income ($): %i'%(income))
print('Estimated Car Price ($): %1.3f'%(estimated_price))

---------------------------------------------------------------------------------------------------------------------------

Author: <b>Julio Iglesias</b>