# Logistic Regression Project on the SUV Cars Dataset

## The aim of this project is to predict whether SUVs will have more buyers in the future, by classifying individual users as buyers or non-buyers

##### Step 1 : Load the SUV dataset

In [24]:
import pandas as pd  # I am importing pandas to handle data loading and manipulation

data = pd.read_csv('suv_data.csv') # I am reading the CSV file into a pandas DataFrame so I can work with it in Python

In [26]:
# Display the freshly loaded DataFrame to eyeball its columns and values
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510.0,Male,19.0,19000.0,0.0
1,15810944.0,Male,35.0,20000.0,0.0
2,15668575.0,Female,26.0,43000.0,0.0
3,15603246.0,Female,27.0,57000.0,0.0
4,15804002.0,Male,19.0,76000.0,0.0
...,...,...,...,...,...
395,15691863.0,Female,46.0,41000.0,1.0
396,15706071.0,Male,51.0,23000.0,1.0
397,15654296.0,Female,50.0,20000.0,1.0
398,15755018.0,Male,36.0,33000.0,0.0


 ##### Step 2: Drop unnecessary columns

In [28]:
# I am removing 'User ID' because it does not contribute to predicting purchases.
# `data` is rebound to the result (no inplace=True) and errors='ignore' is passed so
# that re-running this cell after the column is already gone does not raise KeyError.
# The former `axis=1` argument is omitted: `columns=` already names the axis.
data = data.drop(columns=['User ID'], errors='ignore')

In [14]:
# Display the frame again to confirm that 'User ID' has been removed.
# NOTE(review): the saved output of this cell still lists 'User ID', and its execution
# count (14) is lower than that of the drop cell (28), so the output looks stale.
# Re-run the notebook top to bottom (Restart & Run All) to refresh it.
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510.0,Male,19.0,19000.0,0.0
1,15810944.0,Male,35.0,20000.0,0.0
2,15668575.0,Female,26.0,43000.0,0.0
3,15603246.0,Female,27.0,57000.0,0.0
4,15804002.0,Male,19.0,76000.0,0.0
...,...,...,...,...,...
395,15691863.0,Female,46.0,41000.0,1.0
396,15706071.0,Male,51.0,23000.0,1.0
397,15654296.0,Female,50.0,20000.0,1.0
398,15755018.0,Male,36.0,33000.0,0.0


##### Step 3: Check the balance of the target variable 'Purchased'

In [30]:
# I am counting how many people purchased (1) vs did not purchase (0)
# to see whether the two classes are balanced
data['Purchased'].value_counts()

Purchased
0.0    257
1.0    143
Name: count, dtype: int64

In [18]:
# NOTE(review): this cell's execution count (18) precedes the 'User ID' drop (28), so the
# saved result was presumably computed with 'User ID' still present; without that unique
# column the count may differ. Re-run to confirm.
data.duplicated().sum() # I am counting fully duplicated rows

0

In [20]:
# NOTE(review): the saved output still lists 'User ID' (execution count 20 precedes the
# drop at 28); re-run the notebook in order to refresh it.
data.info() # I am checking column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   User ID          400 non-null    float64
 1   Gender           400 non-null    object 
 2   Age              400 non-null    float64
 3   EstimatedSalary  400 non-null    float64
 4   Purchased        400 non-null    float64
dtypes: float64(4), object(1)
memory usage: 15.8+ KB


In [22]:
# NOTE(review): the saved output still includes 'User ID' (execution count 22 precedes the
# drop at 28); re-run the notebook in order to refresh it.
data.describe() # I am checking summary statistics of the numeric columns

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


## Problem Statement: To predict whether SUVs will have more buyers in the future

### Now, working on the interpretation of the class counts
#### The class counts above tell us whether our data is balanced or imbalanced between buyers and non-buyers

##### Step 4: Separate the classes for balancing

In [35]:
# Split the frame by outcome: label 0 goes to the non-buyer subset, label 1 to the buyer subset
purchased_flag = data['Purchased']
did_not_buy = data.loc[purchased_flag == 0]
did_buy = data.loc[purchased_flag == 1]

##### Step 5: Equalize class sizes by random sampling 

In [47]:
# Downsample both groups to the size of the smaller one so that neither class
# dominates training; the fixed seed keeps the draw reproducible
rows_per_class = min(len(did_not_buy), len(did_buy))
did_not_buy_rows = did_not_buy.sample(n=rows_per_class, random_state=40)
did_buy_rows = did_buy.sample(n=rows_per_class, random_state=40)

In [49]:
# Inspect the sampled non-buyer rows
did_not_buy_rows

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
310,Female,42.0,70000.0,0.0
266,Male,40.0,75000.0,0.0
176,Female,35.0,47000.0,0.0
359,Male,42.0,54000.0,0.0
120,Female,36.0,75000.0,0.0
...,...,...,...,...
74,Male,32.0,18000.0,0.0
263,Female,35.0,72000.0,0.0
143,Male,30.0,89000.0,0.0
127,Male,26.0,32000.0,0.0


In [51]:
# Inspect the sampled buyer rows
did_buy_rows

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
383,Male,49.0,28000.0,1.0
303,Male,37.0,79000.0,1.0
290,Male,39.0,134000.0,1.0
354,Male,36.0,99000.0,1.0
245,Female,51.0,146000.0,1.0
...,...,...,...,...
283,Female,52.0,21000.0,1.0
27,Female,47.0,30000.0,1.0
249,Female,35.0,97000.0,1.0
22,Male,48.0,41000.0,1.0


##### Step 6: Combine balanced data back together

In [56]:
# Stack the two equally sized samples row-wise into one balanced frame
# (non-buyers first, then buyers; original row labels are kept)
balanced_parts = [did_not_buy_rows, did_buy_rows]
data = pd.concat(balanced_parts, axis='index')

In [58]:
# Inspect the balanced frame produced by the concatenation
data

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
310,Female,42.0,70000.0,0.0
266,Male,40.0,75000.0,0.0
176,Female,35.0,47000.0,0.0
359,Male,42.0,54000.0,0.0
120,Female,36.0,75000.0,0.0
...,...,...,...,...
283,Female,52.0,21000.0,1.0
27,Female,47.0,30000.0,1.0
249,Female,35.0,97000.0,1.0
22,Male,48.0,41000.0,1.0


##### Step 7: Encode categorical variables

In [61]:
# I am replacing text labels with numbers so the model can understand them.
# A per-value lookup is used instead of Series.replace(['Male', 'Female'], [0, 1]):
# replace() relies on implicit object -> int downcasting, which pandas 2.2 deprecates
# (the original run printed a warning pointing at this line).
# Looking values up with .get(label, label) leaves anything that is not 'Male'/'Female'
# untouched, so re-running the cell on already-encoded 0/1 values is a no-op.
gender_codes = {'Male': 0, 'Female': 1}
data['Gender'] = data['Gender'].map(lambda label: gender_codes.get(label, label))

  data['Gender'] = data['Gender'].replace(['Male', 'Female'], [0, 1])


##### Step 8: Define features (X) and target (y)

In [64]:
# Features are every column except the label (Gender, Age, EstimatedSalary);
# the 'Purchased' column on its own is the target
X = data.drop(columns=['Purchased'])
y = data.loc[:, 'Purchased']

In [66]:
# Inspect the feature matrix
X

Unnamed: 0,Gender,Age,EstimatedSalary
310,1,42.0,70000.0
266,0,40.0,75000.0
176,1,35.0,47000.0
359,0,42.0,54000.0
120,1,36.0,75000.0
...,...,...,...
283,1,52.0,21000.0
27,1,47.0,30000.0
249,1,35.0,97000.0
22,0,48.0,41000.0


In [68]:
# Inspect the target Series
y

310    0.0
266    0.0
176    0.0
359    0.0
120    0.0
      ... 
283    1.0
27     1.0
249    1.0
22     1.0
282    1.0
Name: Purchased, Length: 286, dtype: float64

##### Step 9: Split into training and testing sets

In [71]:
# Hold out 20% of the rows for evaluation so the model is scored on data it never saw;
# the fixed seed makes the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=20,
    test_size=0.2,
)

##### Step 10: Train logistic regression model

In [74]:
# Build a logistic-regression classifier with default settings and fit it on the training split.
# NOTE(review): Age and EstimatedSalary are on very different scales (see describe() output);
# consider standardizing the features before fitting — confirm whether that helps here.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

##### Step 11: Make predictions on the test set

In [93]:
# Ask the fitted model for class predictions on the held-out (unseen) feature rows
computer_prediction = lr.predict(X=X_test)

##### Step 12: Compare with original answers

In [78]:
# I am converting the actual test labels to a NumPy array so they can be compared
# with the model's predictions
test_actual = y_test.to_numpy()

In [80]:
# Inspect the actual labels of the held-out rows
test_actual

array([1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0.,
       1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1.,
       1., 0., 0., 1., 1., 1., 0.])

##### Step 13: Evaluate results

In [85]:
# I am importing metrics to check how good the model is
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

##### Getting the Accuracy

In [91]:
# Share of held-out rows whose predicted label matches the actual label
logistic_accuracy = accuracy_score(y_true=test_actual, y_pred=computer_prediction)
# Interpretation: the closer to 1.0, the better the model
print("Accuracy:", logistic_accuracy)

Accuracy: 0.7586206896551724


##### Getting the Confusion Matrix

In [97]:
# Cross-tabulate actual labels (rows) against predicted labels (columns) to get the
# TN / FP / FN / TP counts.
# Interpretation: shows correct and incorrect predictions split by class
lr_cm = confusion_matrix(y_true=test_actual, y_pred=computer_prediction)
print("Confusion Matrix:\n", lr_cm)

Confusion Matrix:
 [[19 10]
 [ 4 25]]


##### Classification Report

In [102]:
# Summarize per-class precision, recall and F1-score as a text table.
# Interpretation: precision tells how many predicted buyers were correct,
# recall tells how many real buyers we found
lr_cl = classification_report(y_true=test_actual, y_pred=computer_prediction)
print("Classification Report:\n", lr_cl)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.66      0.73        29
         1.0       0.71      0.86      0.78        29

    accuracy                           0.76        58
   macro avg       0.77      0.76      0.76        58
weighted avg       0.77      0.76      0.76        58



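##### Model interpretation

On the 58 held-out rows the model reaches an accuracy of about 0.76. Reading the confusion matrix (rows = actual, columns = predicted): 19 non-buyers and 25 buyers were classified correctly, 10 non-buyers were wrongly predicted as buyers, and 4 buyers were missed. The model therefore finds most real buyers (recall 0.86) at the cost of some false alarms (precision 0.71).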

#### By 

## Ebenezer Adebiyi

### Linkedin : Ebenezer Adebiyi
### Email : Ebenezerdadebiyi@gmail.com