In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [14]:
# Load the raw bike-sales workbook; the path is relative to the notebook's working directory.
DATA_PATH = "Bike Sales Dashboard.xlsx"
df = pd.read_excel(DATA_PATH)

In [15]:
# Preview the first rows to sanity-check the loaded columns and value formats
df.head()

Unnamed: 0,ID,Martial Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Age Brackets,Purchased Bike
0,12496,Married,Female,40000,1,Bachelors,Skilled Manual,Yes,0,0-1 Miles,Europe,42,Middle Age,No
1,24107,Married,Male,30000,3,Partial College,Clerical,Yes,1,0-1 Miles,Europe,43,Middle Age,No
2,14177,Married,Male,80000,5,Partial College,Professional,No,2,2-5 Miles,Europe,60,Old,No
3,24381,Single,Male,70000,0,Bachelors,Professional,Yes,1,5-10 Miles,Pacific,41,Middle Age,Yes
4,25597,Single,Male,30000,0,Bachelors,Clerical,No,0,0-1 Miles,Europe,36,Middle Age,Yes


In [16]:
# Per-column count of missing entries (all zeros means no imputation is needed)
df.isna().sum()


ID                  0
Martial Status      0
Gender              0
Income              0
Children            0
Education           0
Occupation          0
Home Owner          0
Cars                0
Commute Distance    0
Region              0
Age                 0
Age Brackets        0
Purchased Bike      0
dtype: int64

In [17]:
# 'ID' is a row identifier, not a predictive feature, so remove it.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [18]:
# Count distinct values per column to tell low-cardinality (categorical)
# columns apart from continuous ones
unique_values = df.nunique(axis=0)
print(unique_values)


Martial Status       2
Gender               2
Income              16
Children             6
Education            5
Occupation           5
Home Owner           2
Cars                 5
Commute Distance     5
Region               3
Age                 53
Age Brackets         3
Purchased Bike       2
dtype: int64


In [19]:
# Columns holding discrete category labels
categorical_columns = [
    'Martial Status',
    'Gender',
    'Education',
    'Occupation',
    'Home Owner',
    'Cars',
    'Commute Distance',
    'Region',
    'Age Brackets',
]

# Show the distinct labels of every categorical column so that stray or
# duplicated spellings are easy to spot
for col in categorical_columns:
    print(f"Unique values in column '{col}':", df[col].unique(), sep="\n")

Unique values in column 'Martial Status':
['Married' 'Single']
Unique values in column 'Gender':
['Female' 'Male']
Unique values in column 'Education':
['Bachelors' 'Partial College' 'High School' 'Partial High School'
 'Graduate Degree']
Unique values in column 'Occupation':
['Skilled Manual' 'Clerical' 'Professional' 'Manual' 'Management']
Unique values in column 'Home Owner':
['Yes' 'No']
Unique values in column 'Cars':
[0 1 2 4 3]
Unique values in column 'Commute Distance':
['0-1 Miles' '2-5 Miles' '5-10 Miles' '1-2 Miles' 'More than 10 Miles']
Unique values in column 'Region':
['Europe' 'Pacific' 'North America']
Unique values in column 'Age Brackets':
['Middle Age' 'Old' 'Adolescent']


In [20]:
# Define columns for encoding:
#   - label_encode_columns are replaced in `df` by integer codes
#   - one_hot_encode_columns are expanded into indicator columns below
label_encode_columns = ['Martial Status', 'Gender', 'Home Owner', 'Cars']
one_hot_encode_columns = ['Education', 'Occupation', 'Commute Distance', 'Region', 'Age Brackets']

# Initialize the label encoder (refit for every column in the loop)
label_encoder = LabelEncoder()

# Apply label encoding to specified columns
for col in label_encode_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Separate the target variable: 'Yes' -> 1, anything else -> 0
y = (df['Purchased Bike'] == 'Yes').astype('int64')
X = df.drop(columns=['Purchased Bike'])

# Define the preprocessing pipeline for one-hot encoding and scaling.
# Columns not listed here are passed through unchanged.
# sparse_threshold=0 forces a dense array, so building the DataFrame below
# does not depend on the transformer's density heuristic.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), one_hot_encode_columns),
        ('scaler', StandardScaler(), ['Income', 'Age'])
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Apply preprocessing
# NOTE(review): the transformer is fit on the full dataset here, before the
# train/test split performed later, so scaling statistics include test rows.
# Consider fitting on the training split only.
X_preprocessed = preprocessor.fit_transform(X)

# Convert the preprocessed data back to a DataFrame
# Extract feature names
feature_names = preprocessor.get_feature_names_out()

# Convert to DataFrame, keeping X's index so the target aligns row-for-row
X_preprocessed = pd.DataFrame(X_preprocessed, columns=feature_names, index=X.index)

# Show column dtypes; info() prints its report itself and returns None, so it
# must not be wrapped in print()
X_preprocessed.info()

# Add the target variable to the preprocessed dataframe
df_preprocessed = X_preprocessed.copy()
df_preprocessed['Purchased Bike'] = y

print(df_preprocessed.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 28 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   onehot__Education_Bachelors                  1000 non-null   float64
 1   onehot__Education_Graduate Degree            1000 non-null   float64
 2   onehot__Education_High School                1000 non-null   float64
 3   onehot__Education_Partial College            1000 non-null   float64
 4   onehot__Education_Partial High School        1000 non-null   float64
 5   onehot__Occupation_Clerical                  1000 non-null   float64
 6   onehot__Occupation_Management                1000 non-null   float64
 7   onehot__Occupation_Manual                    1000 non-null   float64
 8   onehot__Occupation_Professional              1000 non-null   float64
 9   onehot__Occupation_Skilled Manual            1000 non-null   float64
 10  o

In [21]:
# Summarise the combined feature + target frame (column dtypes and non-null counts)
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   onehot__Education_Bachelors                  1000 non-null   float64
 1   onehot__Education_Graduate Degree            1000 non-null   float64
 2   onehot__Education_High School                1000 non-null   float64
 3   onehot__Education_Partial College            1000 non-null   float64
 4   onehot__Education_Partial High School        1000 non-null   float64
 5   onehot__Occupation_Clerical                  1000 non-null   float64
 6   onehot__Occupation_Management                1000 non-null   float64
 7   onehot__Occupation_Manual                    1000 non-null   float64
 8   onehot__Occupation_Professional              1000 non-null   float64
 9   onehot__Occupation_Skilled Manual            1000 non-null   float64
 10  o

In [22]:
# Separate target from features, then hold out 30% of the rows for evaluation
# (fixed seed so the split is reproducible)
y = df_preprocessed['Purchased Bike']
X = df_preprocessed.drop(columns=['Purchased Bike'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [23]:
# Baseline model: logistic regression with a raised iteration cap
# (fit() returns the estimator itself, so construction and fitting can be chained)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the held-out rows and report the share classified correctly
y_pred_log_reg = log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {accuracy_log_reg}")

Logistic Regression Accuracy: 0.6


In [24]:
# Gradient-boosted trees on the same split.
# The target is binary (0/1), so the matching evaluation metric is 'logloss';
# 'mlogloss' is the multiclass variant. The deprecated `use_label_encoder`
# argument is dropped: the labels are already integers, so no label encoding
# is needed.
xgb_classifier = xgb.XGBClassifier(eval_metric='logloss')
xgb_classifier.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_classifier.predict(X_test)

# Calculate accuracy
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Classifier Accuracy: {accuracy_xgb}")

XGBoost Classifier Accuracy: 0.7033333333333334
