In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the survey responses into a DataFrame; the relative path is resolved
# against the kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='mcdonalds_market_segmentation.csv')

In [None]:
# Display the first few rows of the dataset.
# `data` is already loaded by the first cell, so the CSV is not read again and
# the preview is shown once instead of in two identical cells.
data.head()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Extract the segmentation variables (the first 11 columns of the survey)
MD_x = data.iloc[:, :11]

# Convert the answers to numeric binary: 1 where the answer is "Yes", 0 otherwise
MD_x = (MD_x == "Yes").astype(int)

# Project the respondents onto the first two principal components
pca = PCA(n_components=2)
MD_pca = pca.fit_transform(MD_x)

# Scatter plot of the projected respondents: the perceptual map
plt.scatter(MD_pca[:, 0], MD_pca[:, 1], c='grey')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('Perceptual Map')

# Add one arrow per original segmentation variable.
# pca.components_ has shape (n_components, n_features): each *column* holds one
# variable's loadings on (PC1, PC2). Transposing gives one row per variable, so
# every variable gets an arrow and the label that actually belongs to it.
for variable, (loading_pc1, loading_pc2) in zip(MD_x.columns, pca.components_.T):
    plt.arrow(0, 0, loading_pc1, loading_pc2, color='red', alpha=0.5, width=0.005)
    plt.text(loading_pc1, loading_pc2, variable, color='red', fontsize=12)

plt.show()


In [None]:
# A.5 Step 5: Extracting Segments

# A.5.1 Using k-Means

from sklearn.cluster import KMeans
import numpy as np

# Seed NumPy's global generator so the cell is repeatable
np.random.seed(1234)

# Fit one k-means model per candidate segment count (2 through 8) and store it
# under the count rendered as a string.
n_segments = range(2, 9)
kmeans_solutions = {}
for k in n_segments:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=1234)
    kmeans.fit(MD_x)

    # Relabel for consistency: map the sorted distinct labels onto 1..k
    distinct_labels = np.unique(kmeans.labels_)
    to_segment_number = dict(zip(distinct_labels, range(1, k + 1)))
    kmeans.labels_ = np.vectorize(to_segment_number.get)(kmeans.labels_)

    kmeans_solutions[str(k)] = kmeans

# Scree plot: within-segment inertia against the number of segments
# (the dict keeps insertion order, so values line up with n_segments)
inertia_values = [solution.inertia_ for solution in kmeans_solutions.values()]
plt.plot(n_segments, inertia_values, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Segments')
plt.ylabel('Sum of Distances Within Segments')
plt.title('Scree Plot')
plt.show()


In [None]:
# A.5.2 Using Mixtures of Distributions

# Approximate the latent class analysis with Gaussian mixture models fitted to
# the 0/1 segmentation variables (the code uses GaussianMixture, not a mixture
# of binary distributions).
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder  # unused here; kept because imports are notebook-global

# MD_x already holds integer 0/1 indicators, so no label encoding is needed.
# A copy keeps the mixture input decoupled from MD_x.
MD_x_encoded = MD_x.copy()

# Set random seed for reproducibility
np.random.seed(1234)

# Fit mixtures with 2 to 8 components and record their information criteria
n_components = range(2, 9)
bic_values = []
aic_values = []
icl_values = []  # stays empty: no ICL is computed in this cell
models = {}

for n in n_components:
    model = GaussianMixture(n_components=n, n_init=10, random_state=1234)
    model.fit(MD_x_encoded)
    bic_values.append(model.bic(MD_x_encoded))
    aic_values.append(model.aic(MD_x_encoded))

    models[str(n)] = model

# Plot the information criteria values
plt.plot(n_components, aic_values, marker='o', linestyle='-', color='b', label='AIC')
plt.plot(n_components, bic_values, marker='o', linestyle='-', color='g', label='BIC')
plt.xlabel('Number of Segments')
plt.ylabel('Value of Information Criteria')
plt.legend(loc='best')
plt.title('Information Criteria for Mixture Models')
plt.show()

# Compare the 4-segment mixture model with the 4-segment k-means solution.
# The k-means memberships come from the solutions fitted in the k-means cell
# (relabelled there to 1..4). The mixture is NOT seeded with these memberships;
# it uses its own initialisation.
kmeans_labels = kmeans_solutions["4"].labels_

# The 4-component mixture fitted in the loop above used exactly these settings
# (n_init=10, random_state=1234, same data), so it is reused instead of refitted.
mixture_model = models["4"]
mixture_labels = mixture_model.predict(MD_x_encoded)

# Cross-tabulate segment memberships: rows are k-means, columns are mixture
cross_tab = pd.crosstab(kmeans_labels, mixture_labels,
                        rownames=['kmeans'], colnames=['mixture'])
print(cross_tab)


In [None]:
# A.5.3 Using Mixtures of Regression Models

from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder  # unused here; kept because imports are notebook-global

# Work on a copy so that adding 'Like.n' does not mutate `data`, which earlier
# cells read and display.
mcdonalds = data.copy()

# Convert the ordinal 'Like' labels to their numeric score.
# An alphabetical label encoding does not follow the ordinal order, so the
# score is read from the label text instead.
# NOTE(review): assumes every label ends in its signed score, e.g. '+2', '0'
# or 'I hate it!-5' — confirm against the CSV.
like_score = mcdonalds['Like'].astype(str).str.extract(
    r'([+-]?\d+(?:\.\d+)?)\s*$', expand=False
)
if like_score.isna().any():
    unparsed = mcdonalds.loc[like_score.isna(), 'Like'].unique().tolist()
    raise ValueError(f"Cannot read a numeric score from 'Like' values: {unparsed}")
mcdonalds['Like.n'] = like_score.astype(float).astype(int)

# Define the independent variables (perceptions of McDonald's): the same first
# 11 columns the PCA cell uses as segmentation variables, converted the same
# way to 1 for "Yes" and 0 otherwise so they can enter a regression.
independent_vars = mcdonalds.columns[:11]
X = (mcdonalds[independent_vars] == "Yes").astype(int)

# Two-step approximation of a 2-component mixture of regressions: components
# are found from 'Like.n' alone, then one ordinary linear regression is fitted
# per component. The regressions do not influence the component assignment.
n_components = 2
gmm = GaussianMixture(n_components=n_components, n_init=10, random_state=1234)

# Create the dependent variable as a single-column numpy array
y = mcdonalds['Like.n'].values.reshape(-1, 1)

# Fit the Gaussian Mixture Model
gmm.fit(y)

# Get the cluster labels
cluster_labels = gmm.predict(y)

# Create a linear regression model for each cluster
# (None marks a component that received no observations)
regression_models = []
for cluster in range(n_components):
    cluster_indices = (cluster_labels == cluster)
    if not cluster_indices.any():
        regression_models.append(None)
        continue
    X_cluster = X.loc[cluster_indices]
    y_cluster = mcdonalds.loc[cluster_indices, 'Like.n']
    regression_model = LinearRegression().fit(X_cluster, y_cluster)
    regression_models.append(regression_model)

# Print coefficients for each cluster's linear regression model
for cluster, model in enumerate(regression_models):
    print(f"Cluster {cluster + 1} (Component {cluster + 1})")
    if model is None:
        print("No observations were assigned to this component")
    else:
        print("Intercept:", model.intercept_)
        print("Coefficients:", model.coef_)
    print()
