This notebook uses code from https://ariepratama.github.io/How-to-do-conjoint-analysis-in-python/ verbatim.

We need data on whether the user is "interested" in a property or not, instead of a 1-to-10 user rating.
A yes/no answer is less burdensome for users filling out the survey, which lets us show them more listings and, in turn, get better accuracy.

Price (rent) is treated as a categorical attribute, not a numeric one: each distinct price is its own level.

In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
# Apply matplotlib's 'bmh' style sheet to the figures drawn in this notebook.
plt.style.use('bmh')

ModuleNotFoundError: No module named 'pandas'

In [None]:
# import data

# Create mock survey data for demo purposes.
# Each list holds one value per listing, all in the same listing order.
room_count = [2, 2, 1, 3, 1] # bedroom count
rent = [2400, 2600, 2100, 3200, 2000] # USD per month per apartment
distance_min = [4, 3, 5, 2, 20] # minute drive
direct_bus = [0, 1, 1, 0, 1] # whether the apartment has a direct bus route to campus
user_rating = [8, 7, 3, 10, 1] # 1 to 10 rating from user
interested = [1, 1, 0, 1, 0] # whether user is interested in the property


# Stack the attribute lists and transpose so that each row is one listing and
# each column one attribute. A plain ndarray is used rather than np.matrix,
# which NumPy no longer recommends for new code.
matrix = np.array([room_count, rent, distance_min, direct_bus, user_rating, interested]).T

df = pd.DataFrame(
    data=matrix,
    columns=['room_count', 'rent', 'distance', 'direct_bus', 'user_rating', 'interested'])
df.head()

In [None]:
# Count the missing values in each column of the raw survey frame.
df.isna().sum()

In [None]:
# Remove rows without a response in the model's target column.
# The analysis models 'interested' and discards 'user_rating', so the filter
# must be on 'interested': a missing rating alone is no reason to drop a row,
# while a missing 'interested' value cannot be used as a response.
clean_df = df[~df['interested'].isnull()]

In [None]:
# Select the data for the choice-based conjoint analysis:
#   x - the listing attributes (every column except the two response columns)
#   y - the binary 'interested' response
x = clean_df.drop(columns=['user_rating', 'interested'])
y = clean_df['interested']

In [None]:
# One-hot encode every attribute, including the numeric-looking ones, so that
# each distinct value becomes its own level (column named '<attribute>_<level>').
# dtype=float keeps the indicators numeric (0.0 / 1.0) on every pandas version;
# newer pandas releases default to bool columns, which are not a reliable
# design matrix for the regression fitted on xdum.
xdum = pd.get_dummies(x, columns=x.columns, dtype=float)
xdum.head()

In [None]:
# Fit an ordinary least squares regression of the 0/1 'interested' response on
# the dummy-coded attribute levels; the coefficients act as part-worths.
# OLS has no `family` parameter (that belongs to sm.GLM), so the previously
# passed `family=sm.families.Binomial()` had no effect on the fit and has been
# removed. The fitted model is unchanged.
# NOTE(review): if a logistic model is actually wanted, that would be
# sm.GLM(y, xdum, family=sm.families.Binomial()) - confirm the intended model.
res = sm.OLS(y, xdum).fit()

In [None]:
# Assemble, per attribute, the coefficients of every level of that attribute
# in a dictionary: {attribute name: [coefficient of each level]}.
# Parameter names look like '<attribute>_<level>'. Split on the LAST underscore
# only, so attributes whose own names contain underscores ('room_count',
# 'direct_bus') keep their full name instead of being truncated to the first
# token. A name without any underscore is used as-is.
# NOTE(review): a level label that itself contains '_' would still be split
# incorrectly - not the case for the numeric levels used here.
range_per_feature = dict()
for key, coeff in res.params.items():
    feature = key.rsplit('_', 1)[0]
    range_per_feature.setdefault(feature, list()).append(coeff)

In [None]:
# Importance of an attribute = spread of its level coefficients,
# i.e. largest coefficient minus smallest coefficient.
importance_per_feature = dict(
    (name, max(coefs) - min(coefs))
    for name, coefs in range_per_feature.items()
)

# Relative (normalized) importance: each attribute's importance as a
# percentage of the summed importance of all attributes.
total_feature_importance = sum(importance_per_feature.values())
relative_importance_per_feature = dict(
    (name, 100 * round(spread / total_feature_importance, 3))
    for name, spread in importance_per_feature.items()
)

In [None]:
# Tabulate the relative importances, most important attribute first.
alt_data = pd.DataFrame(
    list(relative_importance_per_feature.items()),
    columns=['attr', 'relative_importance (pct)'],
)
alt_data = alt_data.sort_values(by='relative_importance (pct)', ascending=False)


# Horizontal bar chart: one bar per attribute, annotated with its percentage.
f, ax = plt.subplots(figsize=(12, 8))
xbar = np.arange(len(alt_data))
ax.set_title('Relative importance / Normalized importance')
ax.barh(xbar, alt_data['relative_importance (pct)'])
for row, pct in enumerate(alt_data['relative_importance (pct)']):
    # Place the label at the end of the bar, slightly above its centre line.
    ax.text(pct, row + .25, '{:.2f}%'.format(pct))
ax.set_ylabel('attributes')
ax.set_xlabel('% relative importance')
ax.set_yticks(xbar)
ax.set_yticklabels(alt_data['attr'])
plt.show()