In [177]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [178]:
import warnings
warnings.filterwarnings("ignore")

In [193]:
df = pd.read_csv("/content/customer.csv")

In [180]:
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [181]:
df.shape

(50, 5)

In [194]:
X = df[['age','gender','review','education']]
y = df[['purchased']]

**Label Encoder**

In [195]:
#No=0 and Yes=1
from sklearn.preprocessing import LabelEncoder

label_encode = LabelEncoder()
y_encode=label_encode.fit_transform(y)
y_encode

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0])

**Machine Learning Preprocessing  Pipelines**




In [196]:
num_cols = ['age']
cat_col = ['gender','review','education']

review_cat = ['Poor','Average',"Good"]
edu_cat = ['School','UG','PG']

X_train,X_test,y_train,y_test = train_test_split(X,y_encode,test_size=0.2,random_state=42)

#pipelines for numerical features
num_pipe = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="mean")),
    ("Stdscalar",StandardScaler())
])


preprocess = ColumnTransformer(transformers=[
    ("num",num_pipe,num_cols),
    ("cat_col",OneHotEncoder(drop="first",handle_unknown="ignore"),[feat for feat in cat_col if feat not in ['review','education']]),
    ("ordinal",OrdinalEncoder(categories=[review_cat,edu_cat]),['review','education'])
],remainder="passthrough")

pipes = Pipeline(steps=[
    ("Preprocessor",preprocess),
])

# Fit and transform training data
X_train_transform = pipes.fit_transform(X_train)

# Transform test data (do not fit again to prevent data leakage)
X_test_transform= pipes.transform(X_test)

#using decisiontreeclassifier
clf1 = DecisionTreeClassifier(max_depth=7,min_samples_leaf=1,min_samples_split=2)
clf1.fit(X_train_transform,y_train)

#prediction
y_pred1=clf1.predict(X_test_transform)

#find the accuracy
accuracy_score(y_test,y_pred1)*100

#crossval score
np.mean(cross_val_score(clf1,X_train_transform,y_train,cv=6))*100


60.71428571428571

**Prediction without feature scaling**

In [197]:
# Sample input values for prediction
sample_input_categorical = np.array([[50, 'Male', 'Good', 'UG']])

# Convert input array into a pandas DataFrame
sample_df_categorical = pd.DataFrame(sample_input_categorical, columns=['age', 'gender', 'review', 'education'])

sample_df_transform=pipes.transform(sample_df_categorical)
# Make predictions using the trained decision tree classifier
prediction_categorical = clf1.predict(sample_df_transform)

# Interpret the prediction
if prediction_categorical[0] == 1:
    print("The model predicts that the individual is likely to make a purchase.")
else:
    print("The model predicts that the individual is unlikely to make a purchase.")

The model predicts that the individual is likely to make a purchase.


**now with the help of Pickle we have to save Pipes and clf1**

In [198]:
import pickle

# Save the preprocessing pipeline
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(pipes, file)

# Save the trained decision tree classifier
with open('model.pkl', 'wb') as file:
    pickle.dump(clf1, file)

**Model Deployment**

In [187]:
import streamlit as st
import pickle
import pandas as pd

# Load the preprocessing pipelines and the trained model
with open("preprocessor.pkl", "rb") as file:
    preprocessor = pickle.load(file)

with open("model.pkl", "rb") as file:
    model = pickle.load(file)

# Define the function to make predictions
def predict_purchase(sample_df):
    # Preprocess the input data
    sample_transform_df = preprocessor.transform(sample_df)
    # Make predictions
    prediction = model.predict(sample_transform_df)
    return prediction

# Define the Streamlit app
st.title("Purchase Prediction App")

# Create input fields for user to input data
age = st.slider("Age", min_value=18, max_value=100, value=30)
gender = st.selectbox("Gender", ["Male", "Female"])
review = st.selectbox("Review", ["Poor", "Average", "Good"])
education = st.selectbox("Education", ["School", "UG", "PG"])

# Create a button to make prediction
if st.button("Predict"):
    # Create a DataFrame with the input data
    sample_df = pd.DataFrame([[age, gender, review, education]], columns=["age", "gender", "review", "education"])
    predictions = predict_purchase(sample_df)
    # Display prediction
    if predictions[0] == 1:
        st.write("The model predicts that the individual is likely to make a purchase.")
    else:
        st.write("The model predicts that the individual is unlikely to make a purchase.")


In [216]:
! pip install streamlit -q

In [217]:
!wget -q -O - ipv4.icanhazip.com

35.245.251.156


In [218]:
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.245.251.156:8501[0m
[0m
[K[?25hnpx: installed 22 in 3.38s
[34m  Stopping...[0m
^C


**Scaling after doing ohe and ordinal encoder**

In [190]:
cat_col = ['gender','review','education']

review_cat = ['Poor','Average',"Good"]
edu_cat = ['School','UG','PG']

X_train,X_test,y_train,y_test = train_test_split(X,y_encode,test_size=0.2,random_state=42)



preprocess = ColumnTransformer(transformers=[
    ("cat_col",OneHotEncoder(drop="first",handle_unknown="ignore"),[feat for feat in cat_col if feat not in ['review','education']]),
    ("ordinal",OrdinalEncoder(categories=[review_cat,edu_cat]),['review','education'])
],remainder="passthrough")

pipes = Pipeline(steps=[
    ("Preprocessor",preprocess)
])

#transforming X_train,test from pipelines
X_train_process=pipes.fit_transform(X_train)
X_test_process=pipes.transform(X_test)

#scaling
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train_process)
X_test_scaled = scalar.transform(X_test_process)

#using decisiontreeclassifier
clf = DecisionTreeClassifier(max_depth=7,min_samples_leaf=1,min_samples_split=2)
clf.fit(X_train_scaled,y_train)

#prediction
y_pred=clf.predict(X_test_scaled)

#find the accuracy
accuracy_score(y_test,y_pred)*100

np.mean(cross_val_score(clf,X_train_scaled,y_train,cv=6))*100

57.93650793650793

In [191]:
# Sample input values for prediction
sample_input_val = np.array([[50, 'Male', 'Good', 'UG']])

# Convert input array into a pandas DataFrame
sample_df = pd.DataFrame(sample_input_val, columns=['age', 'gender', 'review', 'education'])

sample_df_transform=pipes.transform(sample_df)

scale_sample_df_transform = scalar.transform(sample_df_transform)
# Make predictions using the trained decision tree classifier
prediction_categorical = clf1.predict(scale_sample_df_transform)

# Interpret the prediction
if prediction_categorical[0] == 1:
    print("The model predicts that the individual is likely to make a purchase.")
else:
    print("The model predicts that the individual is unlikely to make a purchase.")

The model predicts that the individual is unlikely to make a purchase.


**Hyperparameter tuning**

In [192]:
# Define the hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search using 5-fold cross-validation
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

# Fit grid search on the training data
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters and the corresponding accuracy score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best hyperparameters:", best_params)
print("Best accuracy score:", best_score)

Best hyperparameters: {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy score: 0.6
