In [87]:
# Used libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report

In [99]:
# load data
df = pd.read_csv('Amazon Customer Behavior Survey.csv')
df.drop('Timestamp', axis=1, inplace=True)

In [95]:
# Initialize instances from imported classes. These are prepocessing tools to encode labels and scale features
le = LabelEncoder()
scaler = MinMaxScaler()

In [101]:
# Data Preprocessing
age_bin = [20, 30, 40, 50, 60, 70, float('inf')]
age_label = ['20-29', '30-39', '40-49', '50-59', '60-69', '70+']
df['Age_Groups'] = pd.cut(df['age'], bins=age_bin, labels=age_label, right=False)

In [102]:
# Convert all dtypes to int
cols=["age","Gender","Purchase_Categories","Purchase_Frequency",
     "Personalized_Recommendation_Frequency","Recommendation_Helpfulness",
     "Browsing_Frequency","Product_Search_Method","Search_Result_Exploration",
     "Add_to_Cart_Browsing","Cart_Completion_Frequency",
     "Cart_Abandonment_Factors","Saveforlater_Frequency",
     "Review_Left","Review_Reliability","Review_Helpfulness",
     "Service_Appreciation","Improvement_Areas",'Age_Groups']

for c in cols:
  df[c]=le.fit_transform(df[c])

## Specify Target and Feature Column for SVM analysis

In [103]:
# Selecting target and feature columns for SVM analysis
target = df['Recommendation_Helpfulness']

featureslst=df.drop(['age', 'Gender','Purchase_Frequency',
              'Purchase_Categories', 'Browsing_Frequency','Product_Search_Method',
              'Customer_Reviews_Importance','Cart_Abandonment_Factors',
              'Saveforlater_Frequency','Recommendation_Helpfulness',
              "Service_Appreciation","Improvement_Areas"],axis=1)


In [104]:
# Split test and train data with the classic Hitchhiker reference 
X_train,X_test,y_train,y_test=train_test_split(featureslst,target,test_size=0.2,random_state=42)

In [105]:
# Initialize support vector with optimal parameters
clf_sv=SVC(C=1, 
        kernel= 'linear')
# Fit the SVC classifier on the training sets
clf_sv.fit(X_train,y_train)

# Classifier predictions on the test data
clf_svPred=clf_sv.predict(X_test)
print(clf_svPred)

[2 1 1 2 1 1 1 1 1 1 1 0 2 1 0 1 2 2 1 1 0 1 2 1 1 1 2 2 0 1 1 2 1 2 0 1 2
 0 1 0 1 1 2 1 1 2 1 2 1 1 0 0 1 2 1 1 2 0 1 1 1 0 2 2 1 2 1 1 1 1 0 1 1 1
 1 0 1 2 1 1 0 2 2 2 1 2 0 2 1 1 1 1 2 0 1 2 2 1 2 1 2 1 2 1 1 1 0 0 0 1 0
 1 1 1 1 1 2 1 1 0 0]


In [106]:
# Accuracy, precision
print(classification_report(y_test, clf_svPred))

              precision    recall  f1-score   support

           0       0.55      0.44      0.49        27
           1       0.64      0.75      0.69        57
           2       0.53      0.46      0.49        37

    accuracy                           0.60       121
   macro avg       0.57      0.55      0.56       121
weighted avg       0.59      0.60      0.59       121



In [107]:
# Predict purchase category
prediction = clf_sv.predict([[1, 0, 3, 1, 0, 0, 4, 0, 3, 0, 1]])

# Display input and predicted output
print(list(le.inverse_transform(prediction)))

['40-49']


## KNN comparisons

In [108]:
#KNN
clf_knn=KNeighborsClassifier()
clf_knn.fit(X_train,y_train)
clf_knnpred=clf_knn.predict(X_test)
print(clf_knn)

KNeighborsClassifier()


In [109]:
# Define hyperparameters for K-Nearest Neighbors
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Initializing K-Nearest Neighbors Classifier
knn = KNeighborsClassifier()

# Using GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

# Fitting GridSearchCV with the training data
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [3, 5, 7, 9],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [110]:
print(classification_report(y_test, clf_knnpred))

              precision    recall  f1-score   support

           0       0.37      0.48      0.42        27
           1       0.52      0.65      0.58        57
           2       0.60      0.24      0.35        37

    accuracy                           0.49       121
   macro avg       0.50      0.46      0.45       121
weighted avg       0.51      0.49      0.47       121



In [111]:
print(y_test)

110    2
419    2
567    1
77     1
181    1
      ..
340    1
148    0
495    2
439    1
145    0
Name: Recommendation_Helpfulness, Length: 121, dtype: int32
