# Business objective

The model predicts whether a prospective client will purchase a car. It estimates the probability of a purchase from client features such as gender, age, and annual salary, which can help businesses identify the clients who are most likely to buy.

## Import necessary Python Packages

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import missingno as msno

%matplotlib inline

## Get the data

### Import the Dataset

In [None]:
# load the dataset
car_data = pd.read_csv('car_data.csv')
car_data.head()

## Explore the Dataset

In [None]:
# dataset info
car_data.info()

In [None]:
# dataset dimension: unpack the (rows, columns) shape tuple and report both numbers
n_rows, n_cols = car_data.shape
print(f"There are {n_rows} rows and {n_cols} columns in the dataset.")

In [None]:
# check the columns
car_data.columns

In [None]:
# drop the User ID column.
# Reassigning (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run: a second run finds no 'User ID' column
# and leaves the frame unchanged rather than raising KeyError.
car_data = car_data.drop(columns=['User ID'], errors='ignore')

In [None]:
# summary statistics
car_data.describe().transpose()

In [None]:
# correlation between the numeric variables
# 'Gender' holds strings at this point, so restrict the frame to numeric
# columns first; recent pandas versions raise on non-numeric columns
# instead of silently skipping them.
car_data.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the pairwise correlations; only numeric columns are used because
# the string column 'Gender' cannot be correlated and makes recent pandas raise.
sns.heatmap(car_data.select_dtypes(include='number').corr(), annot=True);

The closer the absolute value of the coefficient is to 1, the stronger the correlation between the variables; values near 0 indicate little or no linear relationship.

In [None]:
# check for missing rows
car_data.isnull().sum()

In [None]:
msno.matrix(car_data)
plt.show()

In [None]:
msno.bar(car_data)
plt.show()

The above plots show that there are no missing values in the dataset.

## Functions to explore each feature

In [None]:
# function that will return value count and frequency for each observation within a feature
def value_cnt_norm_cal(df,feature):
    """Tabulate how often each distinct value of a column occurs.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column to summarise.

    Returns:
        DataFrame indexed by the distinct values of ``df[feature]`` with two
        columns: 'Count' (absolute count) and 'Frequency (%)' (share of rows,
        in percent).
    """
    column = df[feature]
    absolute = column.value_counts()
    relative_pct = column.value_counts(normalize=True) * 100
    summary = pd.concat([absolute, relative_pct], axis=1)
    summary.columns = ['Count', 'Frequency (%)']
    return summary

In [None]:
# function to display information about the feature
def gen_info_feat(df,feature):
    """Print a short textual summary of one column.

    The summary statistics (``describe()``) and the dtype are printed for
    every feature. For every feature except 'Age' the value-count table from
    ``value_cnt_norm_cal`` is printed as well.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column to describe.
    """
    separator = '*'*50
    column = df[feature]
    print('Description:\n{}'.format(column.describe()))
    print(separator)
    print('Object type:\n{}'.format(column.dtype))
    if feature == 'Age':
        # 'Age' gets no value-count table
        return
    print(separator)
    value_cnt = value_cnt_norm_cal(df,feature)
    print('Value count:\n{}'.format(value_cnt))

In [None]:
# function to create a pie chart plot
def create_pie_plot(df, feature):
    """Show a pie chart of the relative frequency of each value of a column.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column to plot.

    Returns:
        The result of ``plt.show()``.
    """
    freq_table = value_cnt_norm_cal(df, feature)
    # one slice per distinct value, sized by its percentage share
    shares = freq_table['Frequency (%)'].tolist()
    fig, ax = plt.subplots(figsize=(8,8))
    # %1.2f%% prints each share with two decimal places followed by a percent sign
    plt.pie(
        shares,
        labels=freq_table.index,
        autopct='%1.2f%%',
        startangle=90,
        wedgeprops={'edgecolor' :'black'},
    )
    plt.title('Pie chart of {}'.format(feature), fontsize=20)
    plt.legend(loc='best')
    plt.axis('equal')
    return plt.show()

In [None]:
# function to create a bar chart plot 
def create_bar_plot(df, feature):
    """Show a bar chart with the number of rows per distinct value of a column.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column to plot.

    Returns:
        The result of ``plt.show()``.
    """
    # Set the theme before the figure is created: style settings are picked
    # up when the axes are built, so setting it afterwards would not style
    # this figure.
    sns.set_theme(style='whitegrid')
    # build the frequency table once and reuse it for both axes
    counts = value_cnt_norm_cal(df, feature)
    fig, ax = plt.subplots(figsize=(6,10))
    sns.barplot(x=counts.index, y=counts['Count'].values, ax=ax)
    plt.xlabel('{}'.format(feature))
    plt.ylabel('Count')
    plt.title('{} count'.format(feature), fontsize=20)
    return plt.show()

In [None]:
# function to create a box plot
def create_box_plot(df, feature):
    """Show a vertical box plot of one column.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column to plot.

    Returns:
        The result of ``plt.show()``.
    """
    # Set the theme before the figure is created: style settings are picked
    # up when the axes are built, so setting it afterwards would not style
    # this figure.
    sns.set_theme(style='whitegrid')
    fig, ax = plt.subplots(figsize=(2,8))
    sns.boxplot(y=df[feature], ax=ax)
    plt.title('{} distribution(Boxplot)'.format(feature), fontsize=20)
    return plt.show()

In [None]:
# function to create a histogram plot
def create_hist_plot(df, feature, the_bins=50):
    """Show a histogram of one column with a kernel density estimate overlaid.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column to plot.
        the_bins: Number of histogram bins (passed to ``sns.histplot``).

    Returns:
        The result of ``plt.show()``.
    """
    # Set the theme before the figure is created: style settings are picked
    # up when the axes are built, so setting it afterwards would not style
    # this figure.
    sns.set_theme(style='whitegrid')
    fig, ax = plt.subplots(figsize=(18,10))
    sns.histplot(df[feature], bins=the_bins, kde=True, ax=ax)
    plt.title('{} distribution'.format(feature), fontsize=20)
    return plt.show()

In [None]:
# purchase vs non_purchase individuals compared on a box plot
def purchase_nonpurchase_box_plot(df, feature):
    """Compare a column between the groups of the 'Purchased' column.

    Prints the mean of ``feature`` per 'Purchased' group and shows one box
    plot per group.

    Args:
        df: DataFrame with a 'Purchased' column and the column ``feature``.
        feature: Name of the column to compare.

    Returns:
        The result of ``plt.show()``.
    """
    print(df.groupby('Purchased')[feature].mean())
    # Set the theme before the figure is created: style settings are picked
    # up when the axes are built, so setting it afterwards would not style
    # this figure.
    sns.set_theme(style='whitegrid')
    fig, ax = plt.subplots(figsize=(5,8))
    sns.boxplot(y=df[feature], x=df['Purchased'], ax=ax)
    # assumes 'Purchased' takes the values 0 and 1 (0 = no, 1 = yes) - TODO confirm
    plt.xticks(ticks=[0,1], labels=['no', 'yes'])
    plt.title('Purchase individuals grouped by {}'.format(feature), fontsize=20)
    return plt.show()

In [None]:
# purchase vs non_purchase individuals compared on a bar plot
def purchase_nonpurchase_bar_plot(df, feature):
    """Show the summed 'Purchased' value per distinct value of a column.

    Prints the sums as a dict (largest first) and draws them as a bar chart.

    Args:
        df: DataFrame with a 'Purchased' column and the column ``feature``.
        feature: Name of the column to group by.

    Returns:
        The result of ``plt.show()``.
    """
    purchase_grp_srt = (
        df.groupby(feature)['Purchased'].sum().sort_values(ascending=False)
    )
    print(dict(purchase_grp_srt))
    # Set the theme before the figure is created: style settings are picked
    # up when the axes are built, so setting it afterwards would not style
    # this figure.
    sns.set_theme(style='whitegrid')
    fig, ax = plt.subplots(figsize=(6,10))
    sns.barplot(x=purchase_grp_srt.index, y=purchase_grp_srt.values, ax=ax)
    # Only change the alignment of the labels seaborn already placed.
    # Overwriting the label text could attach a label to the wrong bar if
    # seaborn orders the categories differently from the sorted index.
    for tick_label in ax.get_xticklabels():
        tick_label.set_ha('right')
    plt.ylabel('Count')
    plt.title('Purchase individuals count grouped by {}'.format(feature), fontsize=20)
    return plt.show()

## Univariate Analysis

### Gender

In [None]:
gen_info_feat(car_data, 'Gender')

In [None]:
create_bar_plot(car_data, 'Gender')

In [None]:
create_pie_plot(car_data, 'Gender')

Interpretation:
* We have more female clients than male clients (52% vs. 48%).

In [None]:
purchase_nonpurchase_bar_plot(car_data, 'Gender')

### Age

In [None]:
gen_info_feat(car_data, 'Age')

In [None]:
create_hist_plot(car_data, 'Age')

In [None]:
create_box_plot(car_data, 'Age')

In [None]:
purchase_nonpurchase_box_plot(car_data, 'Age')

Interpretation:
* The youngest client is 18 years old and the oldest is 68 years old; the average age is 40.
* The average age of clients who purchased a car is 48 years while the average age of clients who didn't purchase a car is 35 years old, with the average age difference of 13 years.

### Annual Salary

In [None]:
gen_info_feat(car_data, 'AnnualSalary')

In [None]:
create_hist_plot(car_data, 'AnnualSalary')

In [None]:
create_box_plot(car_data, 'AnnualSalary')

In [None]:
purchase_nonpurchase_box_plot(car_data, 'AnnualSalary')

Interpretation:
* The clients' minimum annual salary is 15,000 and the maximum is 152,500.
* The average Annual salary of clients who purchased a car is 88,034 while the average of clients who didn't purchase is 62,374

## Bivariate Analysis

### Numerical vs Numerical Features (Correlation & Scatter Plots)

### Scatter Plots

In [None]:
sns.pairplot(car_data[['Age', 'AnnualSalary']], corner=True)
plt.show()

In [None]:
sns.regplot(x='Age', y='AnnualSalary',data=car_data, line_kws={'color': 'red'})
plt.show()

Interpretation:
* The Age and Annual salary are not correlated. 

### Correlation analysis

In [None]:
# Correlation heatmap of the numeric features and the target.
# 'Gender' is left out and 'Purchased' is moved to the last position.
without_gender = car_data.drop(columns=['Gender'])
ordered_cols = [col for col in without_gender.columns if col != 'Purchased'] + ['Purchased']
car_data_corr_no_gender = without_gender[ordered_cols].corr()
# Hide the diagonal and the upper triangle so each pair is shown only once
mask = np.triu(np.ones_like(car_data_corr_no_gender, dtype=bool))
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(18,10))
# Draw the masked correlation matrix with the coefficient printed in each cell
sns.heatmap(
    car_data_corr_no_gender,
    annot=True,
    cmap='flare',
    mask=mask,
    linewidths=.5,
)
plt.show()

Interpretation:
* Age is highly correlated with the target feature (Purchased): the older the client, the more likely they are to purchase a car.
* Annual salary has some correlation with the target feature(purchased). The higher the annual salary the higher the chances of purchasing a car.

## Business findings from the EDA.

* **The typical profile of a prospective client is: female, 40 years of age, with an annual salary of 72,000**.
* **Most clients are 32 to 48 years old**.

## Prepare the data

### One hot encoding for Gender.

In [None]:
# One-hot encode 'Gender' into two indicator columns.
one_hot_enc = OneHotEncoder()
gender_encoded = one_hot_enc.fit_transform(car_data[['Gender']]).toarray()
# Name the columns and reuse car_data's index when building the frame, so the
# join below matches rows by their real index labels rather than by a fresh
# 0..n-1 index.
# assumes 'Gender' has exactly two categories whose sorted order is female, male - TODO confirm
one_hot_enc_gender = pd.DataFrame(
    gender_encoded,
    columns=['Gender_F', 'Gender_M'],
    index=car_data.index,
)
car_data = car_data.join(one_hot_enc_gender)
car_data

In [None]:
car_data.shape

## Build the Model

In [None]:
# split the train data into X and y (target)
X = car_data[['Age', 'AnnualSalary', 'Gender_F', 'Gender_M']]  # independent features
Y = car_data['Purchased']                                      # dependent feature

In [None]:
# split the dataset into train and test
# test_size=0.4 holds out 40% of the rows for testing; random_state=100 fixes
# the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.4, random_state=100)

## Feature Scaling

In [None]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # fit on the training data and transform it
X_test = sc.transform(X_test)  # transform only: the scaler is not refitted on the test data

In [None]:
# fit Logistic Regression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [None]:
# predict likelihood of purchase.
y_pred = classifier.predict(X_test)

## Evaluate the Model's Performance

### Classification Report

In [None]:
# Headline metrics for the test-set predictions.
# zero_division=1 makes precision/recall report 1 (instead of warning) when
# their denominator is zero.
print('Accuracy: {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('Recall: {}'.format(metrics.recall_score(y_test, y_pred, zero_division=1)))
print('Precision: {}'.format(metrics.precision_score(y_test, y_pred, zero_division=1)))
# Start the report on its own line so its header row lines up with the columns below it
print('CL Report:\n{}'.format(metrics.classification_report(y_test, y_pred, zero_division=1)))

* Accuracy is the proportion of all predictions that are correct. The higher it is, the better.
* Recall is the proportion of actual positives that the model correctly identifies: true positives divided by the sum of true positives and false negatives.
* Precision is the ratio of true positives to the sum of both true and false positives.
* F-score combines precision and recall into one metric (their harmonic mean). The closer its value is to 1, the better.
* Support is the number of actual occurrences of each class in the evaluated (test) data.

## The model's performance in percent:
* Accuracy: 81%
* Recall: 69%
* Precision: 79%


We can safely conclude that this model predicted the likelihood of car purchase well.

## Confusion Matrix

In [None]:
cm = confusion_matrix(y_test, y_pred)

In [None]:
def plot_confusion_matrix(cm, target_names, title='Confusion matric', cmap=plt.cm.summer):
    """Render a confusion matrix as an annotated image.

    Args:
        cm: 2-D array of counts. Rows are labelled as true classes and
            columns as predicted classes.
        target_names: Class labels used for the tick marks on both axes.
        title: Title shown above the plot.
        cmap: Matplotlib colormap used to colour the cells.
    """
    # NOTE(review): the default title is misspelled ('matric'); it is left
    # unchanged so the function's defaults stay exactly as before.
    plt.clf()  # actually clear the current figure (the call parentheses were missing)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)  # honour the cmap argument
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names)
    plt.yticks(tick_marks, target_names)

    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # the image has columns along x and rows along y, hence xy=(col, row)
            plt.annotate(str(cm[row][col]), xy=(col, row),
                        horizontalalignment='center',
                        verticalalignment='center',
                        color='red', fontsize=22)

    # axis labels only need to be set once, not once per cell
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    # lay out after all labels exist so they are taken into account
    plt.tight_layout()

In [None]:
# Label the axes with every class present in the true or the predicted labels,
# in sorted order. That is the label set confusion_matrix uses by default, so
# the ticks still match the matrix if the model never predicts one of the classes.
plot_confusion_matrix(cm, np.union1d(y_test, y_pred))

* The confusion matrix shows **216 correct predictions** of **class 0** (clients in the test set who didn't purchase a car and were predicted not to).
* **108 correct predictions** of **class 1** (clients who purchased a car and were predicted to).
* **28 incorrect predictions** of **class 1** (28 clients who in reality didn't buy a car but were predicted to buy). This matches the reported precision: 108 / (108 + 28) ≈ 79%.
* **48 incorrect predictions** of **class 0** (48 clients who in reality bought a car but were predicted not to). This matches the reported recall: 108 / (108 + 48) ≈ 69%.

## ROC Curve

In [None]:
# The ROC curve is built from scores rather than hard labels:
# keep the second column of predict_proba (the probability of classifier.classes_[1]).
y_pred_proba = classifier.predict_proba(X_test)[:, 1]

In [None]:
# Calculate the true positive and false positive rates
false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, y_pred_proba)

In [None]:
# Calculate the AUC to see the model's performance
auc = metrics.roc_auc_score(y_test, y_pred_proba)

In [None]:
# Draw the ROC curve (true positive rate against false positive rate) and
# show the AUC value in the legend, placed in the lower-right corner.
auc_label = f'AUC={auc}'
plt.plot(false_positive_rate, true_positive_rate, label=auc_label)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right');

### The AUC is 0.91, meaning that the model did a great job



The receiver operating characteristic (ROC) curve is used to display the sensitivity and specificity of the logistic regression model by calculating the true positive and false positive rates.

From the ROC curve, we can calculate the area under the curve (AUC) whose value ranges from 0 to 1. You’ll remember that the closer to 1, the better it is for our predictive modeling.