In [6]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load your dataset into a Pandas DataFrame
print("Loading the dataset...")
df = pd.read_csv('Amazon Customer Behavior Survey.csv')
print("Dataset loaded successfully.")
# Encoding categorical variables (convert Gender to numerical)
print("Encoding categorical variables...")
# Disregard 'Prefer not to say' and 'others' from the gender row
df = df[(df['Gender'] != 'Others') & (df['Gender'] != 'Prefer not to say')]

# Assuming 'df' is your DataFrame and 'Gender' column contains these values
gender_mapping = {'Male': 0, 'Female': 1}

# Mapping the genders in the 'Gender' column using the provided mapping
df['Gender'] = df['Gender'].map(gender_mapping)
print("Categorical variables encoded.")

# Encoding categorical variables (convert Gender to numerical)
print("Encoding categorical variables...")
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1, 'Prefer not to say': 2, 'Others': 3})
print("Categorical variables encoded.")

# Separating features and target variable
print("Separating features and target variable...")
X = df[['age', 'Gender']]
y = df['Purchase_Categories']
print("Features and target variable separated.")

# Impute missing values in the features
imputer = SimpleImputer(strategy='mean')  # Use any strategy: mean, median, most_frequent
X_imputed = imputer.fit_transform(X)

# Splitting the dataset into train and test sets
print("Splitting the dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)
print("Dataset split completed.")

# Creating a RandomForestClassifier
print("Creating a Random Forest classifier...")
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
print("Random Forest classifier created and fitted.")

# Predicting on the test set
print("Predicting on the test set...")
y_pred = rf_classifier.predict(X_test)
print("Prediction completed.")

# Calculating accuracy
print("Calculating accuracy...")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Random Forest classifier: {accuracy * 100:.2f}%")



Loading the dataset...
Dataset loaded successfully.
Encoding categorical variables...
Categorical variables encoded.
Encoding categorical variables...
Categorical variables encoded.
Separating features and target variable...
Features and target variable separated.
Splitting the dataset into train and test sets...
Dataset split completed.
Creating a Random Forest classifier...
Random Forest classifier created and fitted.
Predicting on the test set...
Prediction completed.
Calculating accuracy...
Accuracy of the Random Forest classifier: 22.22%


In [4]:
# Load your dataset into a Pandas DataFrame
print("Loading the dataset...")
df = pd.read_csv('Amazon Customer Behavior Survey.csv')
print("Dataset loaded successfully.")
# Encoding categorical variables (convert Gender to numerical)
print("Encoding categorical variables...")
# Disregard 'Prefer not to say' and 'others' from the gender row
df = df[(df['Gender'] != 'Others') & (df['Gender'] != 'Prefer not to say')]

# Assuming 'df' is your DataFrame and 'Gender' column contains these values
gender_mapping = {'Male': 0, 'Female': 1}

# Mapping the genders in the 'Gender' column using the provided mapping
df['Gender'] = df['Gender'].map(gender_mapping)
print("Categorical variables encoded.")

# Perform one-hot encoding for Purchase_Categories
print("Performing one-hot encoding for Purchase_Categories...")
categories = df['Purchase_Categories'].str.get_dummies(';')
df = pd.concat([df, categories], axis=1)
print("One-hot encoding completed for Purchase_Categories.")

# Separating features and target variable
print("Separating features and target variable...")
X = df[['age', 'Gender'] + list(categories.columns)]
y = df['Purchase_Frequency']
print("Features and target variable separated.")

# Splitting the dataset into train and test sets
print("Splitting the dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Dataset split completed.")

# Defining parameters to tune
print("Defining parameters to tune...")
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
print("Parameters defined.")

# Creating a decision tree classifier
print("Creating a decision tree classifier...")
dt_classifier = DecisionTreeClassifier()

# Using GridSearchCV to find the best parameters
print("Finding the best parameters using GridSearchCV...")
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)
print("Best parameters found using GridSearchCV.")

# Getting the best parameters and fitting the model
print("Getting the best parameters and fitting the model...")
best_params = grid_search.best_params_
best_dt_classifier = DecisionTreeClassifier(**best_params)
best_dt_classifier.fit(X_train, y_train)
print("Model fitted with the best parameters.")

# Predicting on the test set
print("Predicting on the test set...")
y_pred = best_dt_classifier.predict(X_test)
print("Prediction completed.")

# Calculating accuracy
print("Calculating accuracy...")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the tuned decision tree classifier: {accuracy * 100:.2f}%")


Loading the dataset...
Dataset loaded successfully.
Encoding categorical variables...
Categorical variables encoded.
Performing one-hot encoding for Purchase_Categories...
One-hot encoding completed for Purchase_Categories.
Separating features and target variable...
Features and target variable separated.
Splitting the dataset into train and test sets...
Dataset split completed.
Defining parameters to tune...
Parameters defined.
Creating a decision tree classifier...
Finding the best parameters using GridSearchCV...
Best parameters found using GridSearchCV.
Getting the best parameters and fitting the model...
Model fitted with the best parameters.
Predicting on the test set...
Prediction completed.
Calculating accuracy...
Accuracy of the tuned decision tree classifier: 40.40%


In [7]:
df = pd.read_csv('Amazon Customer Behavior Survey.csv')
print("Dataset loaded successfully.")
# Load your dataset into a Pandas DataFrame
print("Loading the dataset...")
df = pd.read_csv('Amazon Customer Behavior Survey.csv')
print("Dataset loaded successfully.")
# Encoding categorical variables (convert Gender to numerical)
print("Encoding categorical variables...")
# Disregard 'Prefer not to say' and 'others' from the gender row
df = df[(df['Gender'] != 'Others') & (df['Gender'] != 'Prefer not to say')]

# Assuming 'df' is your DataFrame and 'Gender' column contains these values
gender_mapping = {'Male': 0, 'Female': 1}

# Mapping the genders in the 'Gender' column using the provided mapping
df['Gender'] = df['Gender'].map(gender_mapping)
print("Categorical variables encoded.")
categories = df['Purchase_Categories'].str.get_dummies(';')
df = pd.concat([df, categories], axis=1)
print("One-hot encoding completed for Purchase_Categories.")

X = df[['age', 'Gender'] + list(categories.columns)]
y = df['Purchase_Frequency']
print("Features and target variable separated.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Dataset split completed.")

param_grid = {
    'max_depth': [20, 40, 70],
    'min_samples_split': [20, 50, 100],
    'min_samples_leaf': [10, 20, 40]
}
print("Parameters defined.")

gb_classifier = GradientBoostingClassifier()

grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)
print("Best parameters found using GridSearchCV.")

best_params = grid_search.best_params_
best_gb_classifier = GradientBoostingClassifier(**best_params)
best_gb_classifier.fit(X_train, y_train)
print("Model fitted with the best parameters.")

print("Predicting on the test set...")
y_pred = best_gb_classifier.predict(X_test)


print("Calculating accuracy...")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the tuned Gradient Boosting classifier: {accuracy * 100:.2f}%")


Dataset loaded successfully.
Loading the dataset...
Dataset loaded successfully.
Encoding categorical variables...
Categorical variables encoded.
One-hot encoding completed for Purchase_Categories.
Features and target variable separated.
Dataset split completed.
Parameters defined.
Best parameters found using GridSearchCV.
Model fitted with the best parameters.
Predicting on the test set...
Calculating accuracy...
Accuracy of the tuned Gradient Boosting classifier: 42.42%


In [8]:
df = pd.read_csv('Amazon Customer Behavior Survey.csv')
print("Dataset loaded successfully.")
# Load your dataset into a Pandas DataFrame
print("Loading the dataset...")
df = pd.read_csv('Amazon Customer Behavior Survey.csv')
print("Dataset loaded successfully.")
# Encoding categorical variables (convert Gender to numerical)
print("Encoding categorical variables...")
# Disregard 'Prefer not to say' and 'others' from the gender row
df = df[(df['Gender'] != 'Others') & (df['Gender'] != 'Prefer not to say')]
# Assuming 'df' is your DataFrame and 'Gender' column contains these values
gender_mapping = {'Male': 0, 'Female': 1}

# Mapping the genders in the 'Gender' column using the provided mapping
df['Gender'] = df['Gender'].map(gender_mapping)
print("Categorical variables encoded.")
categories = df['Purchase_Categories'].str.get_dummies(';')
df = pd.concat([df, categories], axis=1)
print("One-hot encoding completed for Purchase_Categories.")


# Perform one-hot encoding for Purchase_Categories
categories = df['Purchase_Categories'].str.get_dummies(';')
df = pd.concat([df, categories], axis=1)
print("One-hot encoding completed for Purchase_Categories.")

# Separating features and target variable
X = df[['age', 'Gender'] + list(categories.columns)]
y = df['Purchase_Frequency']
print("Features and target variable separated.")

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Dataset split completed.")

# Defining parameters for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [20, 40, 70]
}
print("Parameters defined for Random Forest.")

# Random Forest Classifier
rf_classifier = RandomForestClassifier()
grid_search_rf = GridSearchCV(estimator=rf_classifier, param_grid=param_grid_rf, cv=3)
grid_search_rf.fit(X_train, y_train)
print("Best parameters found using GridSearchCV for Random Forest.")

best_params_rf = grid_search_rf.best_params_
best_rf_classifier = RandomForestClassifier(**best_params_rf)
best_rf_classifier.fit(X_train, y_train)
print("Random Forest model fitted with the best parameters.")

# Predicting using Random Forest
y_pred_rf = best_rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of the tuned Random Forest classifier: {accuracy_rf * 100:.2f}%")




Dataset loaded successfully.
Loading the dataset...
Dataset loaded successfully.
Encoding categorical variables...
Categorical variables encoded.
One-hot encoding completed for Purchase_Categories.
One-hot encoding completed for Purchase_Categories.
Features and target variable separated.
Dataset split completed.
Parameters defined for Random Forest.
Best parameters found using GridSearchCV for Random Forest.
Random Forest model fitted with the best parameters.
Accuracy of the tuned Random Forest classifier: 41.41%
