<a href="https://colab.research.google.com/github/Francis-Mwaniki/personality-predictor/blob/main/product_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, hamming_loss
import xgboost as xgb

# Load the data
df = pd.read_csv('test.csv')

# Select features and target variables
features = ['age', 'seniority', 'income', 'sex', 'segment', 'cust_type', 'residence_index', 'foreigner_index']
products = ['Saving Account', 'Guarantees', 'Current Accounts', 'Derivada Account', 'Payroll Account',
            'Junior Account', 'Más particular Account', 'particular Account', 'particular Plus Account',
            'Short-term deposits', 'Medium-term deposits', 'Long-term deposits', 'e-account', 'Funds',
            'Mortgage', 'Pensions', 'Loans', 'Taxes', 'Credit Card', 'Securities', 'Home Account', 'Payroll',
            'Pensions', 'Direct Debit']

# Preprocess the data
le = LabelEncoder()
for col in features:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col].astype(str))

# Split the data
X = df[features]
y = df[products]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
h_loss = hamming_loss(y_test, y_pred)

print(f"Micro-averaged F1 Score: {f1_micro:.4f}")
print(f"Macro-averaged F1 Score: {f1_macro:.4f}")
print(f"Hamming Loss: {h_loss:.4f}")

# Function to predict products for a new customer
def predict_products(customer_data):
    # Ensure customer_data has the same features in the same order
    customer_array = np.array(customer_data).reshape(1, -1)
    customer_scaled = scaler.transform(customer_array)
    predictions = model.predict(customer_scaled)
    recommended_products = [products[i] for i, pred in enumerate(predictions[0]) if pred == 1]
    return recommended_products

# Example usage
new_customer = [35, 6, 87218.1, 0, 2, 1, 1, 0]  # Example values for features
recommended = predict_products(new_customer)
print("Recommended products:", recommended)

Micro-averaged F1 Score: 1.0000
Macro-averaged F1 Score: 0.0417
Hamming Loss: 0.0000
Recommended products: ['Current Accounts']


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


# **Improved**

---




In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, hamming_loss, make_scorer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb

# Load the data
df = pd.read_csv('test.csv')

# Select features and target variables
features = ['age', 'seniority', 'income', 'sex', 'segment', 'cust_type', 'residence_index', 'foreigner_index']
products = ['Saving Account', 'Guarantees', 'Current Accounts', 'Derivada Account', 'Payroll Account',
            'Junior Account', 'Más particular Account', 'particular Account', 'particular Plus Account',
            'Short-term deposits', 'Medium-term deposits', 'Long-term deposits', 'e-account', 'Funds',
            'Mortgage', 'Pensions', 'Loans', 'Taxes', 'Credit Card', 'Securities', 'Home Account', 'Payroll',
            'Pensions', 'Direct Debit']

# Preprocess the data
le = LabelEncoder()
for col in features:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col].astype(str))

# Split the data
X = df[features]
y = df[products]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection
selector = SelectKBest(score_func=f_classif, k=5)
X_train_selected = selector.fit_transform(X_train_scaled, y_train.sum(axis=1))
X_test_selected = selector.transform(X_test_scaled)

# Train the model
base_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model = MultiOutputClassifier(base_model)
model.fit(X_train_selected, y_train)

# Cross-validation
cv_score = cross_val_score(model, X_train_selected, y_train, cv=5, scoring=make_scorer(f1_score, average='micro'))
print(f"Cross-validation F1 score: {cv_score.mean():.4f} (+/- {cv_score.std() * 2:.4f})")

# Make predictions
y_pred = model.predict(X_test_selected)

# Evaluate the model
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
h_loss = hamming_loss(y_test, y_pred)

print(f"Micro-averaged F1 Score: {f1_micro:.4f}")
print(f"Macro-averaged F1 Score: {f1_macro:.4f}")
print(f"Hamming Loss: {h_loss:.4f}")

# Function to predict products for a new customer
def predict_products(customer_data):
    customer_array = np.array(customer_data).reshape(1, -1)
    customer_scaled = scaler.transform(customer_array)
    customer_selected = selector.transform(customer_scaled)
    predictions = model.predict(customer_selected)
    recommended_products = [products[i] for i, pred in enumerate(predictions[0]) if pred == 1]
    return recommended_products

# Example usage
new_customer = [35, 6, 87218.1, 0, 2, 1, 1, 0]  # Example values for features
recommended = predict_products(new_customer)
print("Recommended products:", recommended)

# Print feature importances
selected_features = selector.get_support(indices=True)
for i, feature in enumerate(np.array(features)[selected_features]):
    print(f"Selected feature: {feature}")

  msb = ssbn / float(dfbn)
1 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/multioutput.py", line 216, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return sup

Cross-validation F1 score: nan (+/- nan)
Micro-averaged F1 Score: 1.0000
Macro-averaged F1 Score: 0.0417
Hamming Loss: 0.0000
Recommended products: ['Current Accounts']
Selected feature: sex
Selected feature: segment
Selected feature: cust_type
Selected feature: residence_index
Selected feature: foreigner_index


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, hamming_loss, make_scorer
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Load the data
df = pd.read_csv('test.csv')

# Select features and target variables
features = ['age', 'seniority', 'income', 'sex', 'segment', 'cust_type', 'residence_index', 'foreigner_index']
products = ['Saving Account', 'Guarantees', 'Current Accounts', 'Derivada Account', 'Payroll Account',
            'Junior Account', 'Más particular Account', 'particular Account', 'particular Plus Account',
            'Short-term deposits', 'Medium-term deposits', 'Long-term deposits', 'e-account', 'Funds',
            'Mortgage', 'Pensions', 'Loans', 'Taxes', 'Credit Card', 'Securities', 'Home Account', 'Payroll',
            'Pensions', 'Direct Debit']

# Preprocess the data
le = LabelEncoder()
for col in features:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col].astype(str))

# Split the data
X = df[features]
y = df[products]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Feature selection (remove low variance features)
selector = VarianceThreshold(threshold=0.01)
X_train_selected = selector.fit_transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Train the model
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_model)
model.fit(X_train_selected, y_train)

# Make predictions
y_pred = model.predict(X_test_selected)

# Evaluate the model
f1_micro = f1_score(y_test, y_pred, average='micro', zero_division=1)
f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=1)
h_loss = hamming_loss(y_test, y_pred)

print(f"Micro-averaged F1 Score: {f1_micro:.4f}")
print(f"Macro-averaged F1 Score: {f1_macro:.4f}")
print(f"Hamming Loss: {h_loss:.4f}")

# Function to predict products for a new customer
def predict_products(customer_data):
    customer_array = np.array(customer_data).reshape(1, -1)
    customer_imputed = imputer.transform(customer_array)
    customer_scaled = scaler.transform(customer_imputed)
    customer_selected = selector.transform(customer_scaled)
    predictions = model.predict(customer_selected)
    recommended_products = [products[i] for i, pred in enumerate(predictions[0]) if pred == 1]
    return recommended_products

# Example usage
new_customer = [35, 6, 87218.1, 0, 2, 1, 1, 0]  # Example values for features
recommended = predict_products(new_customer)
print("Recommended products:", recommended)

# Print selected features
selected_features = np.array(features)[selector.get_support()]
print("Selected features:", selected_features)

Micro-averaged F1 Score: 1.0000
Macro-averaged F1 Score: 1.0000
Hamming Loss: 0.0000




Recommended products: ['Current Accounts']
Selected features: ['age' 'income' 'sex' 'segment']
