**# Using Decision Trees**

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the data
train_data = pd.read_csv('/kaggle/input/customer/Train.csv')
test_data = pd.read_csv('/kaggle/input/customer/Test.csv')

# Separate target variable from features in training data
X_train_full = train_data.drop(['ID', 'Segmentation'], axis=1)
y_train_full = train_data['Segmentation']

# Separate ID column from test features
test_ids = test_data['ID']
X_test = test_data.drop('ID', axis=1)

# Handle missing values
X_train_full.fillna(method='ffill', inplace=True)
X_test.fillna(method='ffill', inplace=True)

# Encode categorical variables
cat_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
for col in cat_cols:
    le = LabelEncoder()
    X_train_full[col] = le.fit_transform(X_train_full[col])
    X_test[col] = le.transform(X_test[col])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Fit the model on training data
clf.fit(X_train, y_train)

# Predict on validation set
y_pred_val = clf.predict(X_val)

# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {val_accuracy}")

# Predict segments for test data
test_predictions = clf.predict(X_test)

# Create a DataFrame to store the predictions along with 'ID'
submission_df = pd.DataFrame({'ID': test_ids, 'Segmentation': test_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('predicted_segments.csv', index=False)


Validation Accuracy: 0.4281288723667906


**Random Forest**

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the data
train_data = pd.read_csv('/kaggle/input/customer/Train.csv')
test_data = pd.read_csv('/kaggle/input/customer/Test.csv')

# Separate target variable from features in training data
X_train_full = train_data.drop(['ID', 'Segmentation'], axis=1)
y_train_full = train_data['Segmentation']

# Separate ID column from test features
test_ids = test_data['ID']
X_test = test_data.drop('ID', axis=1)

# Handle missing values
X_train_full.fillna(method='ffill', inplace=True)
X_test.fillna(method='ffill', inplace=True)

# Encode categorical variables
cat_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
for col in cat_cols:
    le = LabelEncoder()
    X_train_full[col] = le.fit_transform(X_train_full[col])
    X_test[col] = le.transform(X_test[col])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on training data
clf.fit(X_train, y_train)

# Predict on validation set
y_pred_val = clf.predict(X_val)

# Calculate accuracy on validation set
val_accuracy = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy with Random Forest Classifier: {val_accuracy}")

# Predict segments for test data
test_predictions = clf.predict(X_test)

# Create a DataFrame to store the predictions along with 'ID'
submission_df = pd.DataFrame({'ID': test_ids, 'Segmentation': test_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('predicted_segments.csv', index=False)


Validation Accuracy with Random Forest Classifier: 0.48451053283767037


**GridSearchCV**

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the data
train_data = pd.read_csv('/kaggle/input/customer/Train.csv')
test_data = pd.read_csv('/kaggle/input/customer/Test.csv')

# Separate target variable from features in training data
X_train_full = train_data.drop(['ID', 'Segmentation'], axis=1)
y_train_full = train_data['Segmentation']

# Separate ID column from test features
test_ids = test_data['ID']
X_test = test_data.drop('ID', axis=1)

# Handle missing values
X_train_full.fillna(method='ffill', inplace=True)
X_test.fillna(method='ffill', inplace=True)

# Encode categorical variables
cat_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
for col in cat_cols:
    le = LabelEncoder()
    X_train_full[col] = le.fit_transform(X_train_full[col])
    X_test[col] = le.transform(X_test[col])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    # Add more parameters to tune
}

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Use the best estimator for prediction and evaluation
y_pred_val = best_estimator.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print(f"Improved Validation Accuracy: {val_accuracy} with parameters: {best_params}")

# Predict segments for test data
test_predictions = best_estimator.predict(X_test)

# Create a DataFrame to store the predictions along with 'ID'
submission_df = pd.DataFrame({'ID': test_ids, 'Segmentation': test_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('predicted_segments.csv', index=False)


Improved Validation Accuracy: 0.5179677819083024 with parameters: {'max_depth': 10, 'n_estimators': 300}


In [4]:
!pip install optuna



In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna

# Load the data
train_data = pd.read_csv('/kaggle/input/customer/Train.csv')
test_data = pd.read_csv('/kaggle/input/customer/Test.csv')

# Separate target variable from features in training data
X_train_full = train_data.drop(['ID', 'Segmentation'], axis=1)
y_train_full = train_data['Segmentation']

# Separate ID column from test features
test_ids = test_data['ID']
X_test = test_data.drop('ID', axis=1)

# Handle missing values
X_train_full.fillna(method='ffill', inplace=True)
X_test.fillna(method='ffill', inplace=True)

# Encode categorical variables
cat_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
for col in cat_cols:
    le = LabelEncoder()
    X_train_full[col] = le.fit_transform(X_train_full[col])
    X_test[col] = le.transform(X_test[col])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define the objective function for optimization
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        # Add more parameters to tune
    }
    
    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_val)
    return 1.0 - accuracy_score(y_val, y_pred)

# Perform Bayesian optimization with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best Parameters: {best_params}")

# Use the best parameters for training the final model
best_clf = RandomForestClassifier(**best_params)
best_clf.fit(X_train_full, y_train_full)

# Predict segments for test data using the best model
test_predictions = best_clf.predict(X_test)

# Create a DataFrame to store the predictions along with 'ID'
submission_df = pd.DataFrame({'ID': test_ids, 'Segmentation': test_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('predicted_segments.csv', index=False)


[I 2023-11-24 17:09:15,606] A new study created in memory with name: no-name-ea7578a0-a800-4676-8474-bb87dca97e78
  warn(
[I 2023-11-24 17:09:16,596] Trial 0 finished with value: 0.4851301115241635 and parameters: {'n_estimators': 207, 'max_depth': 4, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'auto'}. Best is trial 0 with value: 0.4851301115241635.
  warn(
[I 2023-11-24 17:09:18,563] Trial 1 finished with value: 0.4826517967781908 and parameters: {'n_estimators': 362, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'auto'}. Best is trial 1 with value: 0.4826517967781908.
[I 2023-11-24 17:09:20,589] Trial 2 finished with value: 0.48822800495662955 and parameters: {'n_estimators': 254, 'max_depth': 20, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.4826517967781908.
[I 2023-11-24 17:09:22,390] Trial 3 finished with value: 0.476456009913259 and parameters: {'n_estimators': 335, 'max

Best Parameters: {'n_estimators': 412, 'max_depth': 8, 'min_samples_split': 18, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna

# Load the data
train_data = pd.read_csv('/kaggle/input/customer/Train.csv')
test_data = pd.read_csv('/kaggle/input/customer/Test.csv')

# Separate target variable from features in training data
X_train_full = train_data.drop(['ID', 'Segmentation'], axis=1)
y_train_full = train_data['Segmentation']

# Separate ID column from test features
test_ids = test_data['ID']
X_test = test_data.drop('ID', axis=1)

# Handle missing values
X_train_full.fillna(method='ffill', inplace=True)
X_test.fillna(method='ffill', inplace=True)

# Encode categorical variables
cat_cols = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']
for col in cat_cols:
    le = LabelEncoder()
    X_train_full[col] = le.fit_transform(X_train_full[col])
    X_test[col] = le.transform(X_test[col])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define the objective function for optimization
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        # Add more parameters to tune
    }
    
    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_val)
    return 1.0 - accuracy_score(y_val, y_pred)

# Perform Bayesian optimization with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print(f"Best Parameters: {best_params}")

# Use the best parameters for training the final model
best_clf = RandomForestClassifier(**best_params)
best_clf.fit(X_train_full, y_train_full)

# Predict segments for validation data using the best model
val_predictions = best_clf.predict(X_val)

# Calculate validation accuracy
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy with Best Parameters: {val_accuracy}")

# Predict segments for test data using the best model
test_predictions = best_clf.predict(X_test)

# Create a DataFrame to store the predictions along with 'ID'
submission_df = pd.DataFrame({'ID': test_ids, 'Segmentation': test_predictions})

# Save the predictions to a CSV file
submission_df.to_csv('predicted_segments.csv', index=False)


[I 2023-11-24 17:12:48,903] A new study created in memory with name: no-name-09d3fa8b-2804-438a-b891-93bd16b07f99
[I 2023-11-24 17:12:51,640] Trial 0 finished with value: 0.476456009913259 and parameters: {'n_estimators': 471, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 0.476456009913259.
[I 2023-11-24 17:12:52,669] Trial 1 finished with value: 0.48760842627013634 and parameters: {'n_estimators': 219, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.476456009913259.
[I 2023-11-24 17:12:53,803] Trial 2 finished with value: 0.49070631970260226 and parameters: {'n_estimators': 266, 'max_depth': 3, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.476456009913259.
  warn(
[I 2023-11-24 17:12:54,706] Trial 3 finished with value: 0.4807930607187113 and parameters: {'n_estimators': 115, 'max_depth': 1

Best Parameters: {'n_estimators': 150, 'max_depth': 9, 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'log2'}
Validation Accuracy with Best Parameters: 0.5935563816604709
