The significance of using a Machine Learning model for purchasing lies in its ability to leverage data-driven insights to
optimize the purchasing process. After gathering a dataset, the steps are as follows:
data preprocessing, feature engineering, model building, saving the model, and building a user interface.

### Importing required libraries and packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import pickle

### (1) Exploratory data analysis

In [2]:
# Load the dataset from a CSV file in the notebook's working directory.
ads = pd.read_csv("Social_Network_Ads.csv")

In [3]:
# Inspect column names, dtypes and non-null counts.
ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
# Preview the first rows of the raw data.
ads.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [5]:
# (rows, columns) of the loaded frame.
ads.shape

(400, 5)

In [6]:
# Count missing values per column.
ads.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [7]:
# Count rows that are exact duplicates of an earlier row.
ads.duplicated().sum()

0

In [8]:
# Summary statistics for the numeric columns.
ads.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [9]:
# Encode Gender as a number (Male -> 1, Female -> 0) so the models can use it.
# The dtype guard makes this cell safe to re-run: calling .map() on the
# already-encoded column would match none of the keys and turn every value
# into NaN.
gender_codes = {'Male': 1, 'Female': 0}
if not pd.api.types.is_numeric_dtype(ads['Gender']):
    ads['Gender'] = ads['Gender'].map(gender_codes)

In [10]:
# Display the frame to check the Gender column after encoding.
ads

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0
...,...,...,...,...,...
395,15691863,0,46,41000,1
396,15706071,1,51,23000,1
397,15654296,0,50,20000,1
398,15755018,1,36,33000,0


### (2) Model building

In [11]:
# Hold out 20% of the rows for testing. 'Purchased' is the target and
# 'User ID' is dropped along with it, leaving the remaining columns as features.
X_train, X_test, y_train, y_test = train_test_split(
    ads.drop(columns=['User ID', 'Purchased']),
    ads['Purchased'],
    test_size=0.2,
    random_state=58,
)

In [14]:
# (rows, feature columns) in the training split.
X_train.shape

(320, 3)

In [15]:
# (rows, feature columns) in the test split.
X_test.shape

(80, 3)

### (3) Scaling

In [16]:
# Scaler used to standardise the feature columns before training.
scalar = StandardScaler()

In [17]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to the test rows.
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

In [19]:
# Display the scaled test features.
X_test

### Training models

In [20]:
# Candidate classifiers, keyed by the short label used in the printed report.
# The tree-based models are randomised, so they are seeded (same seed as the
# train/test split) to keep the accuracies below stable across re-runs.
models = {
    "lg": LogisticRegression(),
    'dtc': DecisionTreeClassifier(random_state=58),
    'rfc': RandomForestClassifier(random_state=58),
    'gnb': GaussianNB()
}

# Fit each candidate on the training split and score it on the held-out split.
for name, algo in models.items():
    algo.fit(X_train, y_train)
    y_pred = algo.predict(X_test)

    print(f"{name} with accuracy {accuracy_score(y_test,y_pred)}")

lg with accuracy 0.875
dtc with accuracy 0.8875
rfc with accuracy 0.925
gnb with accuracy 0.9125


### Since the Random Forest Classifier gives the highest accuracy, we'll go ahead with it

In [21]:
# Fit the chosen model on its own so that `rfc` holds the estimator used for
# the prediction and saving steps that follow.
# random_state pins the forest's internal randomness; without it every re-run
# trains a different forest and the accuracy shown here drifts.
rfc = RandomForestClassifier(random_state=58)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
accuracy_score(y_test, y_pred)

0.925

In [24]:
# Look at one scaled training row.
X_train[20]

array([-0.96922337, -1.31765037, -0.45793145])

In [25]:
# Classify one hand-entered sample. The values are already standardised, and
# predict() expects a 2-D array, hence the single-row nested list.
sample_row = np.array([[-0.96922337, -1.31765037, -0.45793145]])
prediction = rfc.predict(sample_row)

# Class 1 is reported as a purchase; anything else as no purchase.
print("purchased" if prediction[0] == 1 else "Not purchased")

Not purchased


In [None]:
# Persist the trained classifier so it can be loaded outside this notebook.
# The file must be opened in binary write mode: open('model.pkl') defaults to
# read-only text mode, so the original call raised before writing anything.
# The `with` block also guarantees the handle is flushed and closed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

# The classifier was trained on standardised features, so the fitted scaler is
# saved alongside it; raw inputs must pass through it before rfc.predict().
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scalar, scaler_file)