In [1]:
from sklearn.tree import DecisionTreeClassifier
import sklearn as svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import numpy as np
from collections import Counter

In [12]:
# Importing CSV
# importing dataset from csv file
og_train_data = pd.read_csv("Customer_train.csv")
og_test_data = pd.read_csv("Customer_test.csv")

# printing number of rows in the datasets
print("Number of rows in the training dataset: ", len(og_train_data))
print("Number of rows in the test dataset: ", len(og_test_data))

Number of rows in the training dataset:  1838
Number of rows in the test dataset:  789


In [13]:
# Pre-Processing Training Dataset
def fill_missing_based_on_skewness(df, column):
    if df[column].isnull().sum() > 0:
        skewness = df[column].skew()
        if abs(skewness) > 0.5:
            fill_value = df[column].median()
            method = "median"
        else:
            fill_value = df[column].mean()
            method = "mean"
        df[column].fillna(fill_value, inplace=True)
        print(f"Filled missing '{column}' with {method}: {fill_value}\n")

def transform_if_skewed(df, column):
    if df[column].isnull().sum() == 0:
        skewness = df[column].skew()
        print(f"Skewness of '{column}': {skewness}")
        if abs(skewness) > 0.5:
            df[column] = np.log1p(df[column])
            print(f"Applied log transformation to '{column}' due to skewness.\n")
        else:
            print(f"'{column}' is not significantly skewed; no transformation applied.\n")
    else:
        print(f"Column '{column}' has missing values, handle them before transformation.\n")

def fill_missing_categorical(df, column):
    if df[column].isnull().sum() > 0:
        mode_value = df[column].mode()[0]
        df[column].fillna(mode_value, inplace=True)
        print(f"Filled missing '{column}' with mode: {mode_value}\n")

# making new df and processing in that one
processed_train_data = og_train_data.copy()

# dropping ID column since no use of it in training classifier
processed_train_data.drop("ID", axis=1, inplace=True)

# calling missing values function on columns as below
fill_missing_based_on_skewness(processed_train_data, "Work_Experience")
fill_missing_based_on_skewness(processed_train_data, "Family_Size")

# calling missing values function on categorical columns
fill_missing_categorical(processed_train_data, "Var_1")
fill_missing_categorical(processed_train_data, "Gender")
fill_missing_categorical(processed_train_data, "Ever_Married")
fill_missing_categorical(processed_train_data, "Graduated")
fill_missing_categorical(processed_train_data, "Profession")
fill_missing_categorical(processed_train_data, "Spending_Score")

# calling transformation function on columns as below
transform_if_skewed(processed_train_data, "Work_Experience")
transform_if_skewed(processed_train_data, "Family_Size")
transform_if_skewed(processed_train_data, "Age")

# printing
processed_train_data

Filled missing 'Work_Experience' with median: 1.0

Filled missing 'Family_Size' with median: 2.0

Filled missing 'Var_1' with mode: Cat_6

Filled missing 'Ever_Married' with mode: Yes

Filled missing 'Graduated' with mode: Yes

Filled missing 'Profession' with mode: Artist

Skewness of 'Work_Experience': 1.569560677667064
Applied log transformation to 'Work_Experience' due to skewness.

Skewness of 'Family_Size': 1.0098346177467532
Applied log transformation to 'Family_Size' due to skewness.

Skewness of 'Age': 0.7207833242162786
Applied log transformation to 'Age' due to skewness.



Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Female,No,3.367296,Yes,Artist,2.197225,Low,0.693147,Cat_6,B
1,Male,Yes,3.850148,Yes,Entertainment,0.693147,Average,1.609438,Cat_7,D
2,Male,No,4.276666,No,Lawyer,0.693147,Low,1.098612,Cat_6,D
3,Female,Yes,3.970292,No,Doctor,1.609438,Low,0.693147,Cat_4,C
4,Male,No,3.295837,Yes,Healthcare,0.000000,Low,1.386294,Cat_6,D
...,...,...,...,...,...,...,...,...,...,...
1833,Female,Yes,3.258097,Yes,Healthcare,2.564949,High,1.098612,Cat_6,B
1834,Female,Yes,3.637586,Yes,Engineer,0.693147,Average,1.386294,Cat_4,B
1835,Male,Yes,3.850148,Yes,Artist,0.000000,High,1.609438,Cat_6,D
1836,Male,Yes,3.332205,Yes,Healthcare,1.386294,Low,1.098612,Cat_6,D


In [14]:
# initializing the MinMaxScaler
scaler = MinMaxScaler()

# selecting columns to be scaled
columns_to_scale = ["Age", "Work_Experience", "Family_Size"]

# scaling columns to the range [0, 1]
processed_train_data[columns_to_scale] = scaler.fit_transform(
    processed_train_data[columns_to_scale]
)

# printing
processed_train_data

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,Female,No,0.271869,Yes,Artist,0.811368,Low,0.000000,Cat_6,B
1,Male,Yes,0.582310,Yes,Entertainment,0.255958,Average,0.569323,Cat_7,D
2,Male,No,0.856534,No,Lawyer,0.255958,Low,0.251930,Cat_6,D
3,Female,Yes,0.659555,No,Doctor,0.594316,Low,0.000000,Cat_4,C
4,Male,No,0.225925,Yes,Healthcare,0.000000,Low,0.430677,Cat_6,D
...,...,...,...,...,...,...,...,...,...,...
1833,Female,Yes,0.201661,Yes,Healthcare,0.947157,High,0.251930,Cat_6,B
1834,Female,Yes,0.445648,Yes,Engineer,0.255958,Average,0.430677,Cat_4,B
1835,Male,Yes,0.582310,Yes,Artist,0.000000,High,0.569323,Cat_6,D
1836,Male,Yes,0.249307,Yes,Healthcare,0.511916,Low,0.251930,Cat_6,D


In [15]:
# list of categorical columns to be encoded
categorical_columns = [
    "Gender",
    "Ever_Married",
    "Graduated",
    "Profession",
    "Spending_Score",
    "Var_1",
]

# label encoding for binary categorical features
label_encoder = LabelEncoder()
for col in ["Gender", "Ever_Married", "Graduated"]:
    processed_train_data[col] = label_encoder.fit_transform(processed_train_data[col])

# one-hot encoding for multi-class categorical features
processed_train_data = pd.get_dummies(
    processed_train_data,
    columns=["Profession", "Spending_Score", "Var_1"],
    drop_first=True,
)

# printing
processed_train_data

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Family_Size,Segmentation,Profession_Doctor,Profession_Engineer,Profession_Entertainment,...,Profession_Lawyer,Profession_Marketing,Spending_Score_High,Spending_Score_Low,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,0,0,0.271869,1,0.811368,0.000000,B,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,1,0.582310,1,0.255958,0.569323,D,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0.856534,0,0.255958,0.251930,D,0,0,0,...,1,0,0,1,0,0,0,0,1,0
3,0,1,0.659555,0,0.594316,0.000000,C,1,0,0,...,0,0,0,1,0,0,1,0,0,0
4,1,0,0.225925,1,0.000000,0.430677,D,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1833,0,1,0.201661,1,0.947157,0.251930,B,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1834,0,1,0.445648,1,0.255958,0.430677,B,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1835,1,1,0.582310,1,0.000000,0.569323,D,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1836,1,1,0.249307,1,0.511916,0.251930,D,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [16]:
# preparing testing dataset
processed_test_data = og_test_data.copy()

# dropping ID column since even training data has total only 22 columns (ID was not there)
processed_test_data.drop("ID", axis=1, inplace=True)

# same preprocessing steps to the test data
fill_missing_based_on_skewness(processed_test_data, "Work_Experience")
fill_missing_based_on_skewness(processed_test_data, "Family_Size")

fill_missing_categorical(processed_test_data, "Var_1")
fill_missing_categorical(processed_test_data, "Gender")
fill_missing_categorical(processed_test_data, "Ever_Married")
fill_missing_categorical(processed_test_data, "Graduated")
fill_missing_categorical(processed_test_data, "Profession")
fill_missing_categorical(processed_test_data, "Spending_Score")

transform_if_skewed(processed_test_data, "Work_Experience")
transform_if_skewed(processed_test_data, "Family_Size")
transform_if_skewed(processed_test_data, "Age")

# scaling test dataset with same scaler
columns_to_scale = ["Age", "Work_Experience", "Family_Size"]
processed_test_data[columns_to_scale] = scaler.transform(
    processed_test_data[columns_to_scale]
)

# label encoding for binary categorical features
label_encoder = LabelEncoder()
for col in ["Gender", "Ever_Married", "Graduated"]:
    processed_test_data[col] = label_encoder.fit_transform(processed_test_data[col])

# one-hot encoding for multi-class categorical features
processed_test_data = pd.get_dummies(
    processed_test_data,
    columns=["Profession", "Spending_Score", "Var_1"],
    drop_first=True,
)

# printing
processed_test_data

Filled missing 'Work_Experience' with median: 1.0

Filled missing 'Family_Size' with median: 2.0

Filled missing 'Var_1' with mode: Cat_6

Filled missing 'Ever_Married' with mode: Yes

Filled missing 'Graduated' with mode: Yes

Filled missing 'Profession' with mode: Artist

Skewness of 'Work_Experience': 1.5263872720417662
Applied log transformation to 'Work_Experience' due to skewness.

Skewness of 'Family_Size': 1.2005644040644883
Applied log transformation to 'Family_Size' due to skewness.

Skewness of 'Age': 0.6667761490536537
Applied log transformation to 'Age' due to skewness.



Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Family_Size,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,...,Profession_Lawyer,Profession_Marketing,Spending_Score_High,Spending_Score_Low,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7
0,1,0,0.000000,0,0.255958,0.682606,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,0,0.478626,1,0.767874,0.000000,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0.509995,1,0.255958,0.000000,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,1,1,0.595846,1,0.000000,0.569323,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
4,0,1,0.760395,1,0.000000,0.430677,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784,1,0,0.122836,0,0.000000,0.000000,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
785,0,0,0.539904,1,0.255958,0.569323,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
786,1,0,0.032978,0,0.255958,0.682606,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
787,0,0,0.374137,0,0.000000,0.778385,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [39]:
class RandomForestClassifier:
    def __init__(self, n_trees=100, max_depth=None, min_samples_split=2, 
                 min_samples_leaf=1, max_features='sqrt', random_state=None):
        """
        Initialize the Random Forest Classifier
        
        Parameters:
        n_trees (int): Number of trees in the forest
        max_depth (int or None): Maximum depth of the trees
        min_samples_split (int): Minimum samples required to split a node
        min_samples_leaf (int): Minimum samples required at each leaf node
        max_features (str or float): Strategy for selecting max features {'sqrt', 'log2'} or float
        random_state (int): Random state for reproducibility
        """
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []
        
    def _bootstrap_sample(self, X, y):
        """
        Create a bootstrap sample of the data
        
        Parameters:
        X (array-like): Training features
        y (array-like): Target values
        
        Returns:
        tuple: Bootstrap sample of features and targets
        """
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[idxs], y[idxs]
    
    def _get_max_features(self, n_features):
        """
        Determine the number of features to consider for splitting
        
        Parameters:
        n_features (int): Total number of features
        
        Returns:
        int: Number of features to consider
        """
        if isinstance(self.max_features, str):
            if self.max_features == 'sqrt':
                return int(np.sqrt(n_features))
            elif self.max_features == 'log2':
                return int(np.log2(n_features))
        elif isinstance(self.max_features, float):
            return int(self.max_features * n_features)
        elif isinstance(self.max_features, int):
            return min(self.max_features, n_features)
        return n_features
    
    def fit(self, X, y):
        """
        Fit the random forest classifier
        
        Parameters:
        X (array-like): Training features
        y (array-like): Target values
        
        Returns:
        self: The fitted classifier
        """
        if self.random_state is not None:
            np.random.seed(self.random_state)
            
        n_features = X.shape[1]
        max_features = self._get_max_features(n_features)
        
        for _ in range(self.n_trees):
            # Create bootstrap sample
            X_sample, y_sample = self._bootstrap_sample(X, y)
            
            # Create and train decision tree
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=max_features,
                random_state=self.random_state
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
            
        return self
    
    def predict(self, X):
        """
        Predict class labels for samples in X
        
        Parameters:
        X (array-like): Features to predict
        
        Returns:
        array: Predicted class labels
        """
        # Get predictions from all trees
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        
        # Take majority vote
        predictions = []
        for i in range(X.shape[0]):
            pred = Counter(tree_predictions[:, i]).most_common(1)[0][0]
            predictions.append(pred)
            
        return np.array(predictions)
    
    def get_params(self):
        """
        Get parameters for this estimator
        
        Returns:
        dict: Parameter names mapped to their values
        """
        return {
            'n_trees': self.n_trees,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'max_features': self.max_features,
            'random_state': self.random_state
        }

In [41]:
# Cross-validation function
def cross_validate_rf(X, y, rf_params, n_folds=5):
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        
        rf = RandomForestClassifier(**rf_params)
        rf.fit(X_train_fold, y_train_fold)
        val_predictions = rf.predict(X_val_fold)
        score = accuracy_score(y_val_fold, val_predictions)
        scores.append(score)
    
    return np.mean(scores), np.std(scores)

# Prepare the data
X_train_copy = processed_train_data.copy()
y_train = processed_train_data["Segmentation"].values
X_train = X_train_copy.drop("Segmentation", axis=1).values
X_test = processed_test_data.values

# Extended hyperparameter lists
n_trees_list = [10, 25, 50, 75, 100]  # More trees
max_depth_list = [None, 5, 10, 15, 20, 25]  # More depth options
min_samples_split_list = [2, 5, 10]  # Different split thresholds
min_samples_leaf_list = [1, 2, 4]  # Different leaf sizes
max_features_list = ['sqrt', 'log2', 0.7]  # Different feature selection strategies

print("Starting Grid Search with Cross-validation...")
best_score = 0
best_params = None
best_results = None

# Grid search with cross-validation
for n_trees in n_trees_list:
    for max_depth in max_depth_list:
        for min_samples_split in min_samples_split_list:
            for min_samples_leaf in min_samples_leaf_list:
                for max_features in max_features_list:
                    params = {
                        'n_trees': n_trees,
                        'max_depth': max_depth,
                        'min_samples_split': min_samples_split,
                        'min_samples_leaf': min_samples_leaf,
                        'max_features': max_features,
                        'random_state': 42
                    }
                    
                    print(f"\nTesting parameters: {params}")
                    mean_score, std_score = cross_validate_rf(X_train, y_train, params)
                    
                    if mean_score > best_score:
                        best_score = mean_score
                        best_params = params
                        print(f"New best score: {best_score:.4f} (±{std_score:.4f})")
                        print(f"Best parameters so far: {params}")

print("\nBest Configuration Found:")
print(f"Parameters: {best_params}")
print(f"Cross-validation Score: {best_score:.4f}")

# Train final model with best parameters
print("\nTraining final model with best parameters...")
final_rf = RandomForestClassifier(**best_params)
final_rf.fit(X_train, y_train)

# Make predictions on test set
print("\nMaking predictions on test set...")
test_predictions = final_rf.predict(X_test)

# Save predictions to CSV
predictions_df = pd.DataFrame(test_predictions, columns=['Segmentation'])
predictions_df.to_csv('random_forest_predictions.csv', index=False)
print("\nPredictions saved to 'random_forest_predictions.csv'")

# Calculate and display validation metrics
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
final_rf.fit(X_train_final, y_train_final)
val_predictions_final = final_rf.predict(X_val_final)

print("\nFinal Validation Metrics:")
print(f"Accuracy: {accuracy_score(y_val_final, val_predictions_final):.4f}")
print("\nClassification Report:")
print(classification_report(y_val_final, val_predictions_final))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val_final, val_predictions_final))

Starting Grid Search with Cross-validation...

Testing parameters: {'n_trees': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42}
New best score: 0.3036 (±0.0128)
Best parameters so far: {'n_trees': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42}

Testing parameters: {'n_trees': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'random_state': 42}

Testing parameters: {'n_trees': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.7, 'random_state': 42}

Testing parameters: {'n_trees': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'random_state': 42}
New best score: 0.3118 (±0.0151)
Best parameters so far: {'n_trees': 10, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'random_state': 42}

Te

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
print("\nConfusion Matrix:")
cm = confusion_matrix(y_val_final, val_predictions_final)
classes = np.unique(y_train)

print("Predicted classes:")
print("            " + " ".join(f"{c:>8}" for c in classes))
print("True classes:")

for i, class_name in enumerate(classes):
    print(f"{class_name:>11} {' '.join(f'{x:>8}' for x in cm[i])}")

print("\nPer-class accuracy:")
for i, class_name in enumerate(classes):
    class_correct = cm[i][i]
    class_total = np.sum(cm[i])
    class_accuracy = class_correct / class_total
    print(f"Class {class_name}: {class_accuracy:.4f} ({class_correct}/{class_total})")


Confusion Matrix:
Predicted classes:
                   A        B        C        D
True classes:
          A       79        0        0       35
          B       65        0        0       13
          C       47        0        0       11
          D       58        1        0       59

Per-class accuracy:
Class A: 0.6930 (79/114)
Class B: 0.0000 (0/78)
Class C: 0.0000 (0/58)
Class D: 0.5000 (59/118)
