In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the training and test datasets
training_data_path = 'Consumer_Dataset.csv'
test_data_path = 'Consumer Test Dataset.csv'

training_data = pd.read_csv(training_data_path)
test_data = pd.read_csv(test_data_path)
#print(training_data.head())

# Drop the 'Unnamed: 0' column
training_data = training_data.drop(columns=['Unnamed: 0'])
test_data = test_data.drop(columns=['Unnamed: 0'])

# Remove the 'Group' column before fitting the imputer on the training data
training_features = training_data.drop(columns=['Group'])

# Handle missing values
training_features_imputed = training_features.fillna(training_features.mode().iloc[0])
test_data_imputed = test_data.fillna(training_features.mode().iloc[0])

# Convert categorical variables to dummy variables
X = pd.get_dummies(training_features_imputed)
test_data_imputed = pd.get_dummies(test_data_imputed)


# Ensure both training and test data have the same dummy variables
X, test_data_imputed = X.align(test_data_imputed, join='left', axis=1)
test_data_imputed = test_data_imputed.fillna(0)  # Fill any new columns created in alignment with 0

y = training_data['Group']

# Split the training data for model evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a k-Nearest Neighbors Classifier with a k value
knn = KNeighborsClassifier(49)
knn.fit(X_train, y_train)

# Evaluate the model on the training set
y_train_pred = knn.predict(X_train)
training_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {training_accuracy:.2f}")

# Evaluate the model on the validation set
y_val_pred = knn.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {validation_accuracy:.2f}")

# Predict customer segments for the test dataset
test_data_predictions = knn.predict(test_data_imputed)
test_data['Group'] = test_data_predictions  # Assuming no need for inverse transform as no encoding is done

# Save the predictions to a CSV file
#test_data.to_csv('DAVq3predfinal.csv', index=False)



Training Accuracy: 0.90
Validation Accuracy: 0.88
