In [43]:
# Used libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report

In [44]:
# load data
df = pd.read_csv('Amazon Customer Behavior Survey.csv')

In [45]:
# Initialize instances from imported classes.
le = LabelEncoder()
scaler = MinMaxScaler()

In [46]:
# Data Preprocessing
df.drop('Timestamp', axis=1, inplace=True)

age_bin = [18, 30, 35, 45, 55, float('inf')]
age_label = ['18-29', '30-34', '35-44', '45-54', '55+']
df['Age_Groups'] = pd.cut(df['age'], bins=age_bin, labels=age_label, right=False)

In [47]:
# Convert all dtypes to int
cols=["age","Gender","Purchase_Categories","Purchase_Frequency",
     "Personalized_Recommendation_Frequency","Recommendation_Helpfulness",
     "Browsing_Frequency","Product_Search_Method","Search_Result_Exploration",
     "Add_to_Cart_Browsing","Cart_Completion_Frequency",
     "Cart_Abandonment_Factors","Saveforlater_Frequency",
     "Review_Left","Review_Reliability","Review_Helpfulness",
     "Service_Appreciation","Improvement_Areas",'Age_Groups']

for c in cols:
  df[c]=le.fit_transform(df[c])

## Specify Target and Feature Column for SVM analysis

In [48]:
target = df['Recommendation_Helpfulness']

featureslst=df.drop(['age', 'Gender','Purchase_Frequency',
              'Purchase_Categories', 'Browsing_Frequency','Product_Search_Method',
              'Customer_Reviews_Importance','Cart_Abandonment_Factors',
              'Saveforlater_Frequency','Recommendation_Helpfulness',
              "Service_Appreciation","Improvement_Areas"],axis=1)


In [49]:
# Split test and train data
X_train,X_test,y_train,y_test=train_test_split(featureslst,target,test_size=0.2,random_state=42)

In [50]:
# Initialize support vector with optimal parameters
clf_sv=SVC(C=1, 
        kernel= 'linear')
# Fit the SVC classifier on the training sets
clf_sv.fit(X_train,y_train)

# Classifier predictions on the test data
clf_svPred=clf_sv.predict(X_test)
print(clf_svPred)

[2 1 1 2 1 1 1 1 1 1 1 0 2 1 0 1 2 2 1 1 0 1 1 1 1 1 2 2 0 1 1 0 1 2 0 1 2
 0 1 0 1 1 2 1 1 2 1 2 1 1 0 0 1 2 1 1 2 0 1 1 1 0 2 2 1 2 1 1 1 1 0 1 1 1
 1 0 1 1 1 1 0 2 2 2 1 2 0 2 1 1 1 1 2 0 1 2 2 1 2 1 2 1 2 1 1 1 0 0 0 1 2
 1 1 1 1 1 2 1 1 0 0]


In [51]:
# Accuracy, precision
print(classification_report(y_test, clf_svPred))

              precision    recall  f1-score   support

           0       0.55      0.44      0.49        27
           1       0.64      0.77      0.70        57
           2       0.57      0.46      0.51        37

    accuracy                           0.60       121
   macro avg       0.58      0.56      0.57       121
weighted avg       0.60      0.60      0.59       121



In [53]:
# Predict purchase category
prediction = clf_sv.predict([[1, 0, 3, 1, 0, 0, 4, 0, 3, 0, 1]])

# Display input and predicted output
print(list(le.inverse_transform(prediction)))



['35-44']


## KNN comparisons

In [None]:
#KNN
clf_knn=KNeighborsClassifier()
clf_knn.fit(X_train,y_train)
clf_knnpred=clf_knn.predict(X_test)
print(clf_knn)

KNeighborsClassifier()


In [None]:
# Define hyperparameters for K-Nearest Neighbors
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Initializing K-Nearest Neighbors Classifier
knn = KNeighborsClassifier()

# Using GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

# Fitting GridSearchCV with the training data
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [3, 5, 7, 9],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [None]:
print(classification_report(y_test, clf_knnpred))

              precision    recall  f1-score   support

           0       0.39      0.56      0.46        27
           1       0.57      0.63      0.60        57
           2       0.65      0.35      0.46        37

    accuracy                           0.53       121
   macro avg       0.54      0.51      0.51       121
weighted avg       0.56      0.53      0.53       121



In [None]:
print(y_test)

110    2
419    2
567    1
77     1
181    1
      ..
340    1
148    0
495    2
439    1
145    0
Name: Recommendation_Helpfulness, Length: 121, dtype: int32


In [56]:
print(clf.best_params_)

NameError: name 'clf' is not defined

## Some fluff.

In [None]:
X = df[['age', 'Gender']]
y = df["Purchase_Categories"]


In [None]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

# SVM classifier
sv = SVC(C=10, gamma=1, kernel='rbf')
sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)

# Classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.16      0.38      0.22         8
           1       0.50      0.12      0.20         8
           2       0.14      0.25      0.18         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         8
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         3
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         3
          19       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         0
          24       0.00    

In [None]:
new_features = [[0, 2]] 
predicted_category = sv.predict(new_features)

# Display the predicted purchase category
print(list(le.inverse_transform(predicted_category)))

ValueError: y contains previously unseen labels: [26]