In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from random import choice
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load your dataset into a Pandas DataFrame
print("Loading the dataset...")
df1 = pd.read_csv('Amazon Customer Behavior Survey.csv')
# Remove missing values
df = df1.dropna()

# Deleting rows where 'Age' is less than 18
deleted_rows = df[df['age'] < 18]
df = df[df['age'] >= 18]
print("Dataset loaded successfully.")

Loading the dataset...
Dataset loaded successfully.


In [3]:
# Create a mask for NaN values
nan_mask = df.isna()

# Filter the DataFrame to show only rows with NaN values
nan_rows = df[nan_mask.any(axis=1)]

# Print the rows with NaN values
print("Rows with NaN values:")
print(nan_rows)

Rows with NaN values:
Empty DataFrame
Columns: [Timestamp, age, Gender, Purchase_Frequency, Purchase_Categories, Personalized_Recommendation_Frequency, Browsing_Frequency, Product_Search_Method, Search_Result_Exploration, Customer_Reviews_Importance, Add_to_Cart_Browsing, Cart_Completion_Frequency, Cart_Abandonment_Factors, Saveforlater_Frequency, Review_Left, Review_Reliability, Review_Helpfulness, Personalized_Recommendation_Frequency , Recommendation_Helpfulness, Rating_Accuracy , Shopping_Satisfaction, Service_Appreciation, Improvement_Areas]
Index: []

[0 rows x 23 columns]


In [4]:
# Encoding categorical variables (convert Gender to numerical)
print("Encoding categorical variables...")
# Assuming 'df' is your DataFrame
df = df[(df['Gender'] != 'Others') & (df['Gender'] != 'Prefer not to say')]

# Assuming 'df' is your DataFrame and 'Gender' column contains these values
gender_mapping = {'Male': 0, 'Female': 1}

# Mapping the genders in the 'Gender' column using the provided mapping
df['Gender'] = df['Gender'].map(gender_mapping)
print("Categorical variables encoded.")

Encoding categorical variables...
Categorical variables encoded.


In [5]:
# Function to map purchase frequency to a numeric value
def map_purchase_frequency(freq):
    mapping = {
        "Multiple times a week": 5,
        "Once a week": 4,
        "Few times a month": 3,
        "Once a month": 2,
        "Less than once a month": 1
    }
    return mapping.get(freq, 0)

# Apply the mapping to the dataframe
df['Purchase_Frequency_Num'] = df['Purchase_Frequency'].apply(map_purchase_frequency)
print("Separating features and target variable...")
X = df[['age', 'Gender']]
y = df['Purchase_Categories']
print("Features and target variable separated.")

# Splitting the dataset into train and test sets
print("Splitting the dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Dataset split completed.")

# Creating a decision tree classifier
print("Creating a decision tree classifier...")
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
print("Decision tree classifier created and fitted.")

# Parameters to tune
print("Defining parameters to tune...")
param_grid = {
    'max_depth': [15, 16, 17, 18],
    'min_samples_split': [15, 16, 17],
    'min_samples_leaf': [14, 15, 16]
}
print("Parameters defined.")

# Using GridSearchCV to find the best parameters
print("Finding the best parameters using GridSearchCV...")
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)
print("Best parameters found using GridSearchCV.")

# Getting the best parameters and fitting the model
print("Getting the best parameters and fitting the model...")
best_params = grid_search.best_params_
best_dt_classifier = DecisionTreeClassifier(**best_params)
best_dt_classifier.fit(X_train, y_train)
print("Model fitted with the best parameters.")

# Predicting on the test set
print("Predicting on the test set...")
y_pred = best_dt_classifier.predict(X_test)
print("Prediction completed.")

# Calculating accuracy
print("Calculating accuracy...")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the tuned decision tree classifier: {accuracy * 100:.2f}%")

Separating features and target variable...
Features and target variable separated.
Splitting the dataset into train and test sets...
Dataset split completed.
Creating a decision tree classifier...
Decision tree classifier created and fitted.
Defining parameters to tune...
Parameters defined.
Finding the best parameters using GridSearchCV...




Best parameters found using GridSearchCV.
Getting the best parameters and fitting the model...
Model fitted with the best parameters.
Predicting on the test set...
Prediction completed.
Calculating accuracy...
Accuracy of the tuned decision tree classifier: 24.74%


In [6]:
# Separating features and target variable
print("Separating features and target variable...")
X = df[['age','Gender']]
y = df['Purchase_Categories']
print("Features and target variable separated.")

# Splitting the dataset into train and test sets
print("Splitting the dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Dataset split completed.")

# Creating a decision tree classifier
print("Creating a decision tree classifier...")
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
print("Decision tree classifier created and fitted.")

# Parameters to tune
print("Defining parameters to tune...")
param_grid = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}
print("Parameters defined.")

# Using GridSearchCV to find the best parameters
print("Finding the best parameters using GridSearchCV...")
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)
print("Best parameters found using GridSearchCV.")

# Getting the best parameters and fitting the model
print("Getting the best parameters and fitting the model...")
best_params = grid_search.best_params_
best_dt_classifier = DecisionTreeClassifier(**best_params)
best_dt_classifier.fit(X_train, y_train)
print("Model fitted with the best parameters.")

# Predicting on the test set
print("Predicting on the test set...")
y_pred = best_dt_classifier.predict(X_test)
print("Prediction completed.")

# Calculating accuracy
print("Calculating accuracy...")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the tuned decision tree classifier: {accuracy * 100:.2f}%")

Separating features and target variable...
Features and target variable separated.
Splitting the dataset into train and test sets...
Dataset split completed.
Creating a decision tree classifier...
Decision tree classifier created and fitted.
Defining parameters to tune...
Parameters defined.
Finding the best parameters using GridSearchCV...




Best parameters found using GridSearchCV.
Getting the best parameters and fitting the model...
Model fitted with the best parameters.
Predicting on the test set...
Prediction completed.
Calculating accuracy...
Accuracy of the tuned decision tree classifier: 22.68%


In [7]:
# Input age and gender
age = int(input("Enter age: "))
gender = input("Enter gender (M/F): ")

# Encode gender to numerical value
gender = 0 if gender.upper() == 'M' else 1

# Prediction
predicted_category = best_dt_classifier.predict([[age, gender]])

print(f"Based on the input, the predicted purchase category is: {predicted_category[0]}")


ValueError: invalid literal for int() with base 10: ''

In [9]:
# Drop rows where 'Gender' is 'Others' or 'Prefer not to say'
df = df[(df['Gender'] != 'Others') & (df['Gender'] != 'Prefer not to say')]

# Encoding categorical variables using LabelEncoder
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['Purchase_Categories'] = le.fit_transform(df['Purchase_Categories'])

# Separating features and target variable
X = df[['age', 'Gender', 'Purchase_Categories']]
y = df['Purchase_Frequency']

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Assuming 'dt_classifier' is your Decision Tree Classifier
dt_classifier = DecisionTreeClassifier()

# Fit the classifier with training data
dt_classifier.fit(X_train, y_train)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the tuned decision tree classifier: {accuracy * 100:.2f}%")

# Run 5 tests with random age, gender, and purchase category inputs
for test_number in range(5):
    # Generate random age (you can replace this with your input method)
    age = 20 + test_number * 5  # Example: Start from 20 and increase by 5 for each test

    # Generate random gender (you can replace this with your input method)
    gender = choice([0, 1])  # Randomly choose 0 or 1, encoded values for 'Male' and 'Female'
    
    # Generate random purchase category (you can replace this with your input method)
    purchase_category = choice(df['Purchase_Categories'].unique())  # Randomly select from unique categories
    
    # Predict purchase frequency using the trained Decision Tree Classifier
    predicted_purchase_frequency = dt_classifier.predict([[age, gender, purchase_category]])

    # Display input and predicted output
    print(f"Test {test_number + 1}: Input - Age: {age}, Gender (Encoded): {gender}, Purchase Category: {le.inverse_transform([purchase_category])[0]}, Predicted Purchase Frequency: {predicted_purchase_frequency[0]}")


Accuracy of the tuned decision tree classifier: 0.00%
Test 1: Input - Age: 20, Gender (Encoded): 1, Purchase Category: Groceries and Gourmet Food;Beauty and Personal Care;Clothing and Fashion;Home and Kitchen, Predicted Purchase Frequency: Multiple times a week
Test 2: Input - Age: 25, Gender (Encoded): 0, Purchase Category: Beauty and Personal Care;Home and Kitchen;others, Predicted Purchase Frequency: Once a week
Test 3: Input - Age: 30, Gender (Encoded): 1, Purchase Category: Groceries and Gourmet Food;Clothing and Fashion;others, Predicted Purchase Frequency: Few times a month
Test 4: Input - Age: 35, Gender (Encoded): 0, Purchase Category: Groceries and Gourmet Food;Home and Kitchen;others, Predicted Purchase Frequency: Less than once a month
Test 5: Input - Age: 40, Gender (Encoded): 1, Purchase Category: Beauty and Personal Care;others, Predicted Purchase Frequency: Once a month


