In [53]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_kmo
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

data = pd.read_csv("customer_satisfaction_data.csv")
data

Unnamed: 0,customer_id,quarter,survey_date,account_manager_responsive,billing_accuracy,budget_control,change_management,communication_clarity,competitive_pricing,cost_transparency,...,technical_expertise,timeline_adherence,training_quality,trust_reliability,value_for_money,overall_satisfaction,nps_score,renewal_likelihood,revenue_growth_pct,referrals_generated
0,CUST_001,Q1_2024,2024-03-22,5.0,6.0,4.0,5.0,4.0,5.0,5.0,...,4.0,4.0,2.0,6.0,6.0,4,6,4,12.3,1
1,CUST_002,Q1_2024,2024-03-20,5.0,3.0,4.0,5.0,5.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4,5,3,17.8,0
2,CUST_003,Q1_2024,2024-03-17,4.0,3.0,4.0,3.0,4.0,4.0,3.0,...,4.0,4.0,3.0,4.0,3.0,3,4,3,-7.8,0
3,CUST_004,Q1_2024,2024-03-08,3.0,4.0,4.0,3.0,2.0,3.0,4.0,...,4.0,4.0,4.0,2.0,4.0,4,5,2,-13.4,0
4,CUST_005,Q1_2024,2024-03-12,5.0,5.0,5.0,4.0,5.0,4.0,3.0,...,6.0,5.0,4.0,5.0,4.0,5,5,4,3.7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395,CUST_846,Q4_2024,2024-12-16,4.0,6.0,4.0,5.0,5.0,4.0,3.0,...,5.0,4.0,5.0,4.0,5.0,4,5,3,6.8,3
3396,CUST_847,Q4_2024,2024-12-14,3.0,5.0,4.0,3.0,4.0,3.0,4.0,...,3.0,4.0,2.0,5.0,5.0,4,6,3,4.3,0
3397,CUST_848,Q4_2024,2024-12-16,3.0,3.0,6.0,5.0,4.0,4.0,4.0,...,6.0,6.0,4.0,5.0,4.0,5,9,4,12.7,2
3398,CUST_849,Q4_2024,2024-12-19,5.0,4.0,4.0,3.0,3.0,5.0,4.0,...,4.0,4.0,3.0,4.0,4.0,3,6,3,5.4,2


In [54]:
print(f"Variables: {data.shape[1]}, Observaciones: {data.shape[0]}")
print("Tipos de datos:")
print(data.dtypes.value_counts())

Variables: 31, Observaciones: 3400
Tipos de datos:
float64    24
int64       4
object      3
Name: count, dtype: int64


In [55]:
# ESTADISTICAS DESCRIPTIVAS
print(data.describe())

       account_manager_responsive  billing_accuracy  budget_control  \
count                 3384.000000       3387.000000     3383.000000   
mean                     4.115248          4.101270        4.100798   
std                      0.970303          0.962109        0.983397   
min                      1.000000          1.000000        1.000000   
25%                      3.000000          3.000000        3.000000   
50%                      4.000000          4.000000        4.000000   
75%                      5.000000          5.000000        5.000000   
max                      7.000000          7.000000        7.000000   

       change_management  communication_clarity  competitive_pricing  \
count        3388.000000            3384.000000          3385.000000   
mean            4.109504               4.092494             4.079468   
std             0.977813               0.964956             0.979341   
min             1.000000               1.000000             1.000000   


In [56]:
# VALORES NULOS
print(data.isnull().sum())

customer_id                    0
quarter                        0
survey_date                    0
account_manager_responsive    16
billing_accuracy              13
budget_control                17
change_management             12
communication_clarity         16
competitive_pricing           15
cost_transparency             10
documentation_help            14
executive_access              10
innovation_solutions          10
long_term_partnership         14
problem_solving               20
project_management            17
quality_deliverables          13
roi_demonstration             17
support_responsiveness        17
system_integration            14
technical_documentation       19
technical_expertise            9
timeline_adherence             8
training_quality              17
trust_reliability             22
value_for_money               15
overall_satisfaction           0
nps_score                      0
renewal_likelihood             0
revenue_growth_pct             0
referrals_

In [57]:
# Definir las variables por dimension
technical_vars = ['technical_expertise', 'problem_solving', 'innovation_solutions', 
                  'technical_documentation', 'system_integration']

relationship_vars = ['account_manager_responsive', 'executive_access', 'trust_reliability', 
                     'long_term_partnership', 'communication_clarity']

project_vars = ['project_management', 'timeline_adherence', 'budget_control', 
                'quality_deliverables', 'change_management']

value_vars = ['cost_transparency', 'value_for_money', 'roi_demonstration', 
              'competitive_pricing', 'billing_accuracy']

support_vars = ['support_responsiveness', 'training_quality', 'documentation_help']

outcome_vars = ['overall_satisfaction', 'nps_score', 'renewal_likelihood', 
                'revenue_growth_pct', 'referrals_generated']

# Combinar todas las variables de satisfacción
satisfaction_vars = technical_vars + relationship_vars + project_vars + value_vars + support_vars

print(f"\nTotal variables de satisfacción: {len(satisfaction_vars)}")
print(f"Total variables de outcome: {len(outcome_vars)}")


Total variables de satisfacción: 23
Total variables de outcome: 5


In [58]:
# Visualizacion de variables usando Plotly
def create_interactive_boxplots(data, variable_groups, titles):
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=titles,
        vertical_spacing=0.12
    )
    
    # Grupo 1: Excelencia Técnica
    for i, var in enumerate(variable_groups[0]):
        fig.add_trace(
            go.Box(y=data[var], name=var, showlegend=False),
            row=1, col=1
        )
    
    # Grupo 2: Gestión de Relaciones
    for i, var in enumerate(variable_groups[1]):
        fig.add_trace(
            go.Box(y=data[var], name=var, showlegend=False),
            row=1, col=2
        )
    
    # Grupo 3: Variables de Outcome (1-3)
    for i, var in enumerate(variable_groups[2]):
        fig.add_trace(
            go.Box(y=data[var], name=var, showlegend=False),
            row=2, col=1
        )
    
    # Grupo 4: Variables de Outcome (4-5)
    for i, var in enumerate(variable_groups[3]):
        fig.add_trace(
            go.Box(y=data[var], name=var, showlegend=False),
            row=2, col=2
        )
    
    fig.update_layout(
        height=800,
        title_text="Distribución de Variables - Análisis de Satisfacción",
        showlegend=False
    )
    
    return fig

# visualizacion
variable_groups = [
    technical_vars,
    relationship_vars,
    outcome_vars[:3],
    outcome_vars[3:]
]

titles = [
    'Distribución - Excelencia Técnica',
    'Distribución - Gestión de Relaciones', 
    'Distribución - Variables de Outcome (1-3)',
    'Distribución - Variables de Outcome (4-5)'
]

fig = create_interactive_boxplots(data, variable_groups, titles)
fig.show()

In [59]:
# Matriz de correlación para variables de satisfacción
correlation_matrix = data[satisfaction_vars].corr()

# heatmap con Plotly
fig = px.imshow(
    correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    color_continuous_scale='RdBu_r',
    zmin=-1, zmax=1,
    aspect="auto"
)

fig.update_layout(
    title='Matriz de Correlación Variables de Satisfaccion',
    width=800,
    height=800,
    xaxis_title="Variables",
    yaxis_title="Variables"
)

fig.show()

In [60]:
# Test de KMO (ayuda de Deeepseek)
def check_factor_analysis_suitability(data, variables):
    """Verificar la adecuación del análisis factorial"""
    
    # Limpiar y calcular KMO
    data_clean = data[variables].fillna(data[variables].median())
    kmo_all, kmo_model = calculate_kmo(data_clean)
    
    print(f"KMO Overall: {kmo_model:.3f}")
    
    # Visualización
    kmo_df = pd.DataFrame({'Variable': variables, 'KMO': kmo_all}).sort_values('KMO')
    
    fig = px.bar(kmo_df, x='KMO', y='Variable', orientation='h', 
                 title='KMO por Variable', color='KMO')
    fig.add_vline(x=0.6, line_dash="dash", line_color="red")
    fig.show()
    
    # Evaluación
    adequacy = "Adecuado" if kmo_model > 0.6 else "Inadecuado"
    print(f"Evaluación: {adequacy}")
    
    return kmo_model, kmo_df

# Aplicar test
kmo_overall, kmo_variables = check_factor_analysis_suitability(data, satisfaction_vars)

KMO Overall: 0.959


Evaluación: Adecuado


In [61]:
# evaluacion de correlaciones
def assess_correlations(data, variables, threshold=0.3):    
    corr_matrix = data[variables].corr()
    
    # Porcentaje de correlaciones significativas
    significant_corrs = (abs(corr_matrix) >= threshold).sum().sum() - len(variables)
    total_possible_corrs = len(variables) * (len(variables) - 1)
    percent_significant = (significant_corrs / total_possible_corrs) * 100
    
    print(f"Umbral de correlación: |r| ≥ {threshold}")
    print(f"Correlaciones significativas: {significant_corrs}/{total_possible_corrs}")
    print(f"Porcentaje: {percent_significant:.1f}%")
    
    # Visualización de distribución de correlaciones
    corr_values = corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]
    
    fig = px.histogram(
        x=corr_values,
        nbins=50,
        title='Distribución de Correlaciones entre Variables',
        labels={'x': 'Coeficiente de Correlación', 'y': 'Frecuencia'}
    )
    
    fig.add_vline(x=threshold, line_dash="dash", line_color="red", 
                  annotation_text=f"Umbral {threshold}")
    fig.add_vline(x=-threshold, line_dash="dash", line_color="red")
    
    fig.update_layout(showlegend=False)
    fig.show()
    
    # Matriz de correlaciones
    corr_det = np.linalg.det(corr_matrix)
    print(f"Determinante de la matriz de correlación: {corr_det:.6f}")
    
    # Test de esfericidad de Bartlett (aproximado)
    n = len(data)
    p = len(variables)
    chi_square = -((n - 1) - (2 * p + 5) / 6) * np.log(corr_det)
    df = p * (p - 1) / 2
    
    print(f"Chi-cuadrado aproximado: {chi_square:.2f}")
    print(f"Grados de libertad: {df}")
    
    return percent_significant

# Evaluar correlaciones
corr_percentage = assess_correlations(data, satisfaction_vars)

Umbral de correlación: |r| ≥ 0.3
Correlaciones significativas: 244/506
Porcentaje: 48.2%


Determinante de la matriz de correlación: 0.000037
Chi-cuadrado aproximado: 34601.05
Grados de libertad: 253.0


El análisis de adecuación para la aplicación del análisis factorial mostró resultados favorables. Tenemos un KMO de 0.959, superando el umbral mínimo recomendado de 0.6, lo que indica una alta adecuación muestral y sugiere que las variables comparten una cantidad suficiente de varianza común para justificar el uso de este tipo de análisis. Tambien el 48.2% de las correlaciones entre las variables presentaron valores iguales o superiores a 0.3, porcentaje que también supera el mínimo sugerido del 30%, evidenciando relaciones significativas entre los ítems. En conjunto, ambos indicadores confirman que los datos son adecuados para proceder con un análisis factorial.

## Extracción y determinación de factores