# **<p align="center"><font size="5" color="#006600">Diplomado en Ciencia de Datos</font></p>**

# **<p align="center"><font size="5" color="#006600">Análisis de correspondencias múltiples (ACM)</font></p>**

# **<font size="5" color="#0000FF">Autor</font>**

L. Felipe Castañeda G., lufecasta@gmail.com, https://github.com/lfelipecas.

Adaptado de [ACMcienciaDatos.ipynb](https://github.com/AprendizajeProfundo/diplomado-ciencia-de-datos/blob/main/Metodos-exploratorios-mutlivariados/Cuadernos/ACMcienciaDatos.ipynb).

Este cuaderno es una adaptación a Python del cuaderno [ACMcienciaDatos.ipynb](https://github.com/AprendizajeProfundo/diplomado-ciencia-de-datos/blob/main/Metodos-exploratorios-mutlivariados/Cuadernos/ACMcienciaDatos.ipynb), en el cual se realiza, usando R, el Análisis de Correspondencias Múltiples (ACM) de los admitidos a la Facultad de Ciencias de la Universidad Nacional de Colombia 2013-1.

Este cuaderno no es tan detallado como el original en R, pero es un ejemplo práctico en Python en el que se muestran los principales resultados del cuaderno original, y por lo tanto se puede llegar a las mismas conclusiones. Para mayor información de los detalles remitirse a [ACMcienciaDatos.ipynb](https://github.com/AprendizajeProfundo/diplomado-ciencia-de-datos/blob/main/Metodos-exploratorios-mutlivariados/Cuadernos/ACMcienciaDatos.ipynb).

In [1]:
# Importar las librerías necesarias
import numpy as np # Para cálculos numéricos
import pandas as pd # Para manipulación y análisis de datos
import plotly.express as px # Para crear visualizaciones
import plotly.graph_objects as go # Para crear visualizaciones personalizadas
import prince # Para realizar el análisis de correspondecias múltiples (ACM)

### Tabla de datos del ejemplo admitidos

In [2]:
# Load the CSV data file (admi.csv) into a DataFrame
# and display the first rows of the admi DataFrame
admi = pd.read_csv('../Data/admi.csv')
admi.head()

Unnamed: 0,carr,mate,cien,soci,text,imag,exam,gene,estr,orig,edad,niLE,niMa,stra,age
0,Biol,12.03,10.84,12.08,10.62,10.68,696.4424,F,alto,Bogo,a17,noLE,siMa,E4,17
1,Biol,11.75,11.16,11.71,10.35,11.6,703.2127,M,medio,Bogo,a17,noLE,siMa,E3,17
2,Biol,10.03,10.51,10.7,9.57,8.8,503.9774,F,bajo,Bogo,a18,siLE,siMa,E2,18
3,Biol,11.48,11.48,11.71,10.91,11.6,714.7407,F,bajo,Bogo,a18,noLE,siMa,E2,18
4,Biol,11.21,10.84,12.08,11.21,11.26,693.1449,M,medio,Bogo,a17,noLE,siMa,E3,17


In [3]:
# Print the column names, non-null counts and dtypes of the admi DataFrame
admi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   carr    445 non-null    object 
 1   mate    445 non-null    float64
 2   cien    445 non-null    float64
 3   soci    445 non-null    float64
 4   text    445 non-null    float64
 5   imag    445 non-null    float64
 6   exam    445 non-null    float64
 7   gene    445 non-null    object 
 8   estr    445 non-null    object 
 9   orig    445 non-null    object 
 10  edad    445 non-null    object 
 11  niLE    445 non-null    object 
 12  niMa    445 non-null    object 
 13  stra    445 non-null    object 
 14  age     445 non-null    int64  
dtypes: float64(6), int64(1), object(8)
memory usage: 52.3+ KB


### Tabla de datos Y

In [4]:
# Keep only the qualitative columns 'carr', 'gene', 'estr', 'orig' and 'edad'
# of admi in a new DataFrame called Y_raw
columnas_cualitativas = ['carr', 'gene', 'estr', 'orig', 'edad']
Y_raw = admi.loc[:, columnas_cualitativas]
Y_raw.head()

Unnamed: 0,carr,gene,estr,orig,edad
0,Biol,F,alto,Bogo,a17
1,Biol,M,medio,Bogo,a17
2,Biol,F,bajo,Bogo,a18
3,Biol,F,bajo,Bogo,a18
4,Biol,M,medio,Bogo,a17


In [5]:
# Duplicate each selected column under a more descriptive name
# (descriptive name -> original column)
nombres_descriptivos = {
    'Carrera': 'carr',
    'Genero': 'gene',
    'Edad': 'edad',
    'Estrato': 'estr',
    'Origen': 'orig',
}
for nombre_nuevo, nombre_original in nombres_descriptivos.items():
    Y_raw[nombre_nuevo] = Y_raw[nombre_original]

# Replace the descriptive columns by their 0/1 indicator (dummy) columns;
# the original short-named columns are kept untouched
Y_raw = pd.get_dummies(data=Y_raw, columns=list(nombres_descriptivos), dtype=int)
Y_raw.head()

Unnamed: 0,carr,gene,estr,orig,edad,Carrera_Biol,Carrera_Esta,Carrera_Farm,Carrera_Fisi,Carrera_Geol,...,Edad_a16m,Edad_a17,Edad_a18,Edad_a19M,Estrato_alto,Estrato_bajo,Estrato_medio,Origen_Bogo,Origen_Cund,Origen_Otro
0,Biol,F,alto,Bogo,a17,1,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,Biol,M,medio,Bogo,a17,1,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
2,Biol,F,bajo,Bogo,a18,1,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
3,Biol,F,bajo,Bogo,a18,1,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
4,Biol,M,medio,Bogo,a17,1,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0


In [6]:
# Print the column names, non-null counts and dtypes of Y_raw after the dummy encoding
Y_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   carr           445 non-null    object
 1   gene           445 non-null    object
 2   estr           445 non-null    object
 3   orig           445 non-null    object
 4   edad           445 non-null    object
 5   Carrera_Biol   445 non-null    int32 
 6   Carrera_Esta   445 non-null    int32 
 7   Carrera_Farm   445 non-null    int32 
 8   Carrera_Fisi   445 non-null    int32 
 9   Carrera_Geol   445 non-null    int32 
 10  Carrera_Mate   445 non-null    int32 
 11  Carrera_Quim   445 non-null    int32 
 12  Genero_F       445 non-null    int32 
 13  Genero_M       445 non-null    int32 
 14  Edad_a16m      445 non-null    int32 
 15  Edad_a17       445 non-null    int32 
 16  Edad_a18       445 non-null    int32 
 17  Edad_a19M      445 non-null    int32 
 18  Estrato_alto   445 non-null   

In [7]:
# List the distinct values found in the 'carr' column of Y_raw
Y_raw['carr'].unique()

array(['Biol', 'Esta', 'Farm', 'Fisi', 'Geol', 'Mate', 'Quim'],
      dtype=object)

In [8]:
# Map every indicator column produced by get_dummies to a short label,
# building the mapping one variable at a time
dict_Y_raw = {}
dict_Y_raw.update({f'Carrera_{carrera}': carrera
                   for carrera in ('Biol', 'Esta', 'Farm', 'Fisi', 'Geol', 'Mate', 'Quim')})
dict_Y_raw.update({f'Genero_{genero}': genero for genero in ('F', 'M')})
dict_Y_raw.update({'Edad_a16m': '16-', 'Edad_a17': '17', 'Edad_a18': '18', 'Edad_a19M': '19+'})
dict_Y_raw.update({'Estrato_alto': 'al', 'Estrato_bajo': 'ba', 'Estrato_medio': 'me'})
dict_Y_raw.update({'Origen_Bogo': 'Bo', 'Origen_Cund': 'Cu', 'Origen_Otro': 'Ot'})

# Apply the short labels to Y_raw in place
Y_raw.rename(columns=dict_Y_raw, inplace=True)

# Show the first rows of the relabelled Y_raw
Y_raw.head()

Unnamed: 0,carr,gene,estr,orig,edad,Biol,Esta,Farm,Fisi,Geol,...,16-,17,18,19+,al,ba,me,Bo,Cu,Ot
0,Biol,F,alto,Bogo,a17,1,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,Biol,M,medio,Bogo,a17,1,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
2,Biol,F,bajo,Bogo,a18,1,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
3,Biol,F,bajo,Bogo,a18,1,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
4,Biol,M,medio,Bogo,a17,1,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0


In [9]:
# Print the column names, non-null counts and dtypes of Y_raw after the renaming
Y_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   carr    445 non-null    object
 1   gene    445 non-null    object
 2   estr    445 non-null    object
 3   orig    445 non-null    object
 4   edad    445 non-null    object
 5   Biol    445 non-null    int32 
 6   Esta    445 non-null    int32 
 7   Farm    445 non-null    int32 
 8   Fisi    445 non-null    int32 
 9   Geol    445 non-null    int32 
 10  Mate    445 non-null    int32 
 11  Quim    445 non-null    int32 
 12  F       445 non-null    int32 
 13  M       445 non-null    int32 
 14  16-     445 non-null    int32 
 15  17      445 non-null    int32 
 16  18      445 non-null    int32 
 17  19+     445 non-null    int32 
 18  al      445 non-null    int32 
 19  ba      445 non-null    int32 
 20  me      445 non-null    int32 
 21  Bo      445 non-null    int32 
 22  Cu      445 non-null    in

In [10]:
# Y holds the four qualitative variables 'gene', 'estr', 'orig' and 'edad' of Y_raw
variables_activas = ['gene', 'estr', 'orig', 'edad']
Y = Y_raw[variables_activas]

# Show the first rows of Y
Y.head()

Unnamed: 0,gene,estr,orig,edad
0,F,alto,Bogo,a17
1,M,medio,Bogo,a17
2,F,bajo,Bogo,a18
3,F,bajo,Bogo,a18
4,M,medio,Bogo,a17


### Tabla disyuntiva completa (TDC) Z

In [11]:
# Build the complete disjunctive table Z from the indicator columns of the four
# variables kept in Y (gender, age group, stratum and origin).
# The columns are selected by label rather than by position, so Z stays correct
# even if the number of columns that precede them in Y_raw changes.
columnas_Z = ['F', 'M',
              '16-', '17', '18', '19+',
              'al', 'ba', 'me',
              'Bo', 'Cu', 'Ot']
Z = Y_raw.loc[:, columnas_Z]

# Show the first rows of Z
Z.head()

Unnamed: 0,F,M,16-,17,18,19+,al,ba,me,Bo,Cu,Ot
0,1,0,0,1,0,0,1,0,0,1,0,0
1,0,1,0,1,0,0,0,0,1,1,0,0
2,1,0,0,0,1,0,0,1,0,1,0,0
3,1,0,0,0,1,0,0,1,0,1,0,0
4,0,1,0,1,0,0,0,0,1,1,0,0


### Tabla de Burt

$B = Z^TZ$

In [12]:
# Burt table: matrix product of the transpose of Z with Z
B = Z.T @ Z

# Display the resulting table
B

Unnamed: 0,F,M,16-,17,18,19+,al,ba,me,Bo,Cu,Ot
F,128,0,46,45,18,19,23,46,59,89,9,30
M,0,317,72,126,38,81,58,133,126,222,29,66
16-,46,72,118,0,0,0,27,44,47,70,9,39
17,45,126,0,171,0,0,39,58,74,116,19,36
18,18,38,0,0,56,0,8,22,26,47,2,7
19+,19,81,0,0,0,100,7,55,38,78,8,14
al,23,58,27,39,8,7,81,0,0,65,5,11
ba,46,133,44,58,22,55,0,179,0,95,22,62
me,59,126,47,74,26,38,0,0,185,151,11,23
Bo,89,222,70,116,47,78,65,95,151,311,0,0


### Tabla de frecuencias relativas F

In [13]:
# n = number of rows of Y, s = number of columns of Y
n, s = Y.shape

# Relative frequencies: every entry of Z scaled by 1/(n*s)
factor_escala = 1 / (n * s)
F = factor_escala * Z

# Append the column totals as an extra row, then the row totals as an extra column
# (the row totals are computed after the 'Sum' row exists, so they include it)
F.loc['Sum'] = F.sum(numeric_only=True, axis=0)
F['Sum'] = F.sum(numeric_only=True, axis=1)

# Show the first rows of F
F.head()

Unnamed: 0,F,M,16-,17,18,19+,al,ba,me,Bo,Cu,Ot,Sum
0,0.000562,0.0,0.0,0.000562,0.0,0.0,0.000562,0.0,0.0,0.000562,0.0,0.0,0.002247
1,0.0,0.000562,0.0,0.000562,0.0,0.0,0.0,0.0,0.000562,0.000562,0.0,0.0,0.002247
2,0.000562,0.0,0.0,0.0,0.000562,0.0,0.0,0.000562,0.0,0.000562,0.0,0.0,0.002247
3,0.000562,0.0,0.0,0.0,0.000562,0.0,0.0,0.000562,0.0,0.000562,0.0,0.0,0.002247
4,0.0,0.000562,0.0,0.000562,0.0,0.0,0.0,0.0,0.000562,0.000562,0.0,0.0,0.002247


## Análisis de correspondencias múltiples

In [14]:
# p = number of columns of Z (categories), s = number of columns of Y (variables)
p, s = Z.shape[1], Y.shape[1]

# Fit a multiple correspondence analysis (MCA) that keeps p - s components
# NOTE(review): Z is already a 0/1 indicator table; confirm that the installed
# prince version does not one-hot encode it a second time inside fit().
n_componentes = p - s
mca = prince.MCA(n_components=n_componentes)
mca_Z = mca.fit(Z)

Valores propios del ACM de admitidos

In [15]:
# Retrieve the eigenvalue summary table from the fitted MCA object mca_Z
eigenvalues_summary_Z = mca_Z.eigenvalues_summary
eigenvalues_summary_Z

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.337,16.83%,16.83%
1,0.313,15.64%,32.47%
2,0.283,14.13%,46.60%
3,0.247,12.36%,58.96%
4,0.241,12.04%,71.00%
5,0.229,11.47%,82.46%
6,0.208,10.38%,92.84%
7,0.143,7.16%,100.00%


In [16]:
# Spanish labels: one row per component, one column per summary statistic
row_names = [f"Componente {k}" for k in range(1, eigenvalues_summary_Z.shape[0] + 1)]
column_names = ['Valor propio', '% de varianza', '% de varianza acumulada']

# Rebuild the summary as text so the '%' signs can be stripped,
# then parse every cell as a float
resumen_texto = pd.DataFrame(data=eigenvalues_summary_Z.values, index=row_names,
                             columns=column_names, dtype=str)
eigenvalues_summary_Z = resumen_texto.replace('%', '', regex=True).astype(float)

# Scale the two percentage columns by 0.01 so they become proportions
# (the eigenvalue column is left as is) and keep 3 decimals
eigenvalues_summary_Z = eigenvalues_summary_Z.mul([1, 0.01, 0.01], axis='columns').round(3)

# Display with 2 decimals; the stored table keeps its 3 decimals
eigenvalues_summary_Z.round(2)

Unnamed: 0,Valor propio,% de varianza,% de varianza acumulada
Componente 1,0.34,0.17,0.17
Componente 2,0.31,0.16,0.32
Componente 3,0.28,0.14,0.47
Componente 4,0.25,0.12,0.59
Componente 5,0.24,0.12,0.71
Componente 6,0.23,0.12,0.82
Componente 7,0.21,0.1,0.93
Componente 8,0.14,0.07,1.0


In [17]:
# Average of the eigenvalues stored in the summary table
mean_val = eigenvalues_summary_Z['Valor propio'].mean()
mean_val

0.25012500000000004

In [18]:
# x-axis labels, one per row of the eigenvalue summary, so the labels always
# match the number of components instead of relying on a hard-coded list
x = [f'Comp {i + 1}' for i in range(len(eigenvalues_summary_Z))]

# Bar chart of the eigenvalues
fig = px.bar(eigenvalues_summary_Z,
             x=x,
             y='Valor propio', text_auto='.3',
             title="Valores propios y valor propio promedio")

# Put the value labels outside the bars and adjust ticks and figure width
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_layout(xaxis_tickangle=-90, width=500, yaxis=dict(dtick=0.05))

# Horizontal red line at the mean eigenvalue
fig.add_trace(go.Scatter(x=x, y=[mean_val] * len(x),
                         mode='lines',
                         name='0.25',
                         line=dict(color='red'),
                         showlegend=False))

# Render the chart
fig.show()

### Nube de individuos

In [19]:
# Compute the row (individual) coordinates of Z with the fitted MCA
row_coordinates_Z = mca_Z.row_coordinates(Z)

# Show the first rows of the row coordinates
row_coordinates_Z.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.585759,0.70151,0.649514,-0.157755,0.053737,0.202652,-0.957437,0.090956
1,-0.563779,-0.220058,0.315105,0.171639,-0.137415,-0.617858,0.208022,-0.147212
2,-0.196604,0.140595,-0.907408,0.21009,1.166903,0.395276,-0.543991,-0.308536
3,-0.196604,0.140595,-0.907408,0.21009,1.166903,0.395276,-0.543991,-0.308536
4,-0.563779,-0.220058,0.315105,0.171639,-0.137415,-0.617858,0.208022,-0.147212


In [20]:
# Keep the current row index and label the columns "Componente 1", "Componente 2", ...
row_names = row_coordinates_Z.index
column_names = [f"Componente {k}" for k in range(1, row_coordinates_Z.shape[1] + 1)]

# Rebuild the coordinates table with the new column labels
row_coordinates_Z = pd.DataFrame(
    data=row_coordinates_Z.to_numpy(),
    index=row_names,
    columns=column_names,
)

# Show the first rows of the relabelled coordinates
row_coordinates_Z.head()

Unnamed: 0,Componente 1,Componente 2,Componente 3,Componente 4,Componente 5,Componente 6,Componente 7,Componente 8
0,-0.585759,0.70151,0.649514,-0.157755,0.053737,0.202652,-0.957437,0.090956
1,-0.563779,-0.220058,0.315105,0.171639,-0.137415,-0.617858,0.208022,-0.147212
2,-0.196604,0.140595,-0.907408,0.21009,1.166903,0.395276,-0.543991,-0.308536
3,-0.196604,0.140595,-0.907408,0.21009,1.166903,0.395276,-0.543991,-0.308536
4,-0.563779,-0.220058,0.315105,0.171639,-0.137415,-0.617858,0.208022,-0.147212


In [21]:
# Scatter trace of the individuals on components 1 and 2,
# each point labelled with its row index
nube_individuos = go.Scatter(
    x=row_coordinates_Z['Componente 1'],
    y=row_coordinates_Z['Componente 2'],
    mode='markers+text',
    text=row_coordinates_Z.index,
    marker=dict(size=3, color='black'),
    textposition='middle right',
    name='Nube de individuos',
    textfont=dict(size=15),
)

# Axis settings: tick step, visible range, title and font sizes
eje_x = dict(dtick=0.25, range=[-1, 1.5], title='Componente 1 (16.8%)',
             title_font=dict(size=20), tickfont=dict(size=15))
eje_y = dict(dtick=0.25, range=[-1.25, 1.55], title='Componente 2 (15.6%)',
             title_font=dict(size=20), tickfont=dict(size=15))

# Assemble the figure
fig = go.Figure()
fig.add_trace(nube_individuos)
fig.update_layout(width=800, height=800,
                  title='Nube de individuos plano 1-2',
                  xaxis=eje_x,
                  yaxis=eje_y,
                  title_font=dict(size=25),
                  showlegend=False)

# Render the figure
fig.show()

### Nube de categorías

In [22]:
# Compute the column (category) coordinates of Z with the fitted MCA
column_coordinates_Z = mca_Z.column_coordinates(Z)

# Show the first rows of the resulting DataFrame
column_coordinates_Z.head()

Unnamed: 0,0,1,2,3,4,5,6,7
F,-0.129213,0.889864,-0.51186,0.68227,-0.085236,0.215781,-0.939615,0.061736
M,0.052175,-0.359314,0.206682,-0.275491,0.034417,-0.087129,0.379403,-0.024928
16-,0.392574,1.066189,-0.291666,-0.111559,-0.574764,0.514161,0.800535,-0.383702
17,-0.18512,0.031211,0.883722,0.261505,0.159235,-0.736572,-0.341502,-0.182289
18,-0.74908,-0.135249,-1.130052,-0.060332,2.170654,0.396093,0.336964,0.322779


In [23]:
# Keep the category labels as row index and name the columns "Componente 1", "Componente 2", ...
row_names = column_coordinates_Z.index
column_names = [f"Componente {k}" for k in range(1, column_coordinates_Z.shape[1] + 1)]

# Rebuild the category coordinates table with the new column labels
column_coordinates_Z = pd.DataFrame(
    data=column_coordinates_Z.to_numpy(),
    index=row_names,
    columns=column_names,
)

# Display the whole table
column_coordinates_Z

Unnamed: 0,Componente 1,Componente 2,Componente 3,Componente 4,Componente 5,Componente 6,Componente 7,Componente 8
F,-0.129213,0.889864,-0.51186,0.68227,-0.085236,0.215781,-0.939615,0.061736
M,0.052175,-0.359314,0.206682,-0.275491,0.034417,-0.087129,0.379403,-0.024928
16-,0.392574,1.066189,-0.291666,-0.111559,-0.574764,0.514161,0.800535,-0.383702
17,-0.18512,0.031211,0.883722,0.261505,0.159235,-0.736572,-0.341502,-0.182289
18,-0.74908,-0.135249,-1.130052,-0.060332,2.170654,0.396093,0.336964,0.322779
19+,0.272804,-1.235734,-0.534169,-0.281749,-0.809636,0.431016,-0.549364,0.583726
al,-0.535905,0.775493,1.116254,-1.094155,0.135951,0.834673,-0.364628,0.560294
ba,0.931125,-0.312811,-0.181127,-0.040864,0.309392,0.070986,-0.289736,-0.549547
me,-0.666287,-0.036874,-0.313486,0.518601,-0.358882,-0.434135,0.439987,0.286405
Bo,-0.509071,-0.127293,-0.106773,-0.163314,-0.104479,0.074306,-0.098843,-0.302047


Plano factorial 1-2 de las categorías

In [24]:
# One colour per category; categories of the same variable share a colour
colors = {'F': 'blue', 'M': 'blue',
          '16-': 'green', '17': 'green', '18': 'green', '19+': 'green',
          'al': 'orange', 'ba': 'orange', 'me': 'orange',
          'Bo': 'red', 'Cu': 'red', 'Ot': 'red'}

# Marker colour of every category, falling back to black for unknown labels
colores_categorias = [colors.get(categoria, 'black') for categoria in column_coordinates_Z.index]

# Scatter trace of the categories on components 1 and 2, labelled by category
nube_categorias = go.Scatter(
    x=column_coordinates_Z['Componente 1'],
    y=column_coordinates_Z['Componente 2'],
    mode='markers+text',
    text=column_coordinates_Z.index,
    marker=dict(size=6, color=colores_categorias),
    textposition='middle right',
    name='Nube de categorías plano 1-2',
    textfont=dict(size=15),
)

# Axis settings: tick step, visible range, title and font sizes
eje_x = dict(dtick=0.25, range=[-1, 1.5], title='Componente 1 (16.8%)',
             title_font=dict(size=20), tickfont=dict(size=15))
eje_y = dict(dtick=0.25, range=[-1.5, 1.25], title='Componente 2 (15.6%)',
             title_font=dict(size=20), tickfont=dict(size=15))

# Assemble the figure
fig = go.Figure()
fig.add_trace(nube_categorias)
fig.update_layout(width=800, height=800,
                  title='Nube de categorías plano 1-2',
                  xaxis=eje_x,
                  yaxis=eje_y,
                  title_font=dict(size=25),
                  showlegend=False)

# Render the figure
fig.show()

### Coordenadas y ayudas para la interpretación de las categorías para los tres primeros ejes

In [25]:
# Read the contributions of the categories to each component from the fitted MCA
column_contributions_Z = mca_Z.column_contributions_

# Display the contributions table
column_contributions_Z

Unnamed: 0,0,1,2,3,4,5,6,7
F,0.003567,0.182065,0.066648,0.13545,0.00217,0.0146,0.305946,0.001913
M,0.00144,0.073515,0.026912,0.054693,0.000876,0.005895,0.123537,0.000773
16-,0.030355,0.240946,0.019949,0.003338,0.090957,0.076419,0.204728,0.06814
17,0.009781,0.000299,0.2654,0.026584,0.010117,0.227272,0.05399,0.022287
18,0.05245,0.00184,0.142121,0.000463,0.615661,0.021523,0.017214,0.022884
19+,0.012422,0.274296,0.056706,0.018046,0.152951,0.04551,0.081706,0.133644
al,0.038829,0.0875,0.200579,0.220444,0.003493,0.138241,0.029155,0.099735
ba,0.259042,0.031462,0.011671,0.000679,0.03998,0.00221,0.040681,0.212028
me,0.137087,0.000452,0.036131,0.113108,0.055597,0.085416,0.096959,0.05952
Bo,0.13453,0.009052,0.007046,0.018857,0.007921,0.004207,0.008226,0.111286


In [26]:
# Keep the category labels as row index and name the columns "Cont1", "Cont2", ...
row_names = column_contributions_Z.index
column_names = [f"Cont{k}" for k in range(1, column_contributions_Z.shape[1] + 1)]

# Rebuild the contributions table with the new column labels
column_contributions_Z = pd.DataFrame(
    data=column_contributions_Z.to_numpy(),
    index=row_names,
    columns=column_names,
)

# Display the whole table
column_contributions_Z

Unnamed: 0,Cont1,Cont2,Cont3,Cont4,Cont5,Cont6,Cont7,Cont8
F,0.003567,0.182065,0.066648,0.13545,0.00217,0.0146,0.305946,0.001913
M,0.00144,0.073515,0.026912,0.054693,0.000876,0.005895,0.123537,0.000773
16-,0.030355,0.240946,0.019949,0.003338,0.090957,0.076419,0.204728,0.06814
17,0.009781,0.000299,0.2654,0.026584,0.010117,0.227272,0.05399,0.022287
18,0.05245,0.00184,0.142121,0.000463,0.615661,0.021523,0.017214,0.022884
19+,0.012422,0.274296,0.056706,0.018046,0.152951,0.04551,0.081706,0.133644
al,0.038829,0.0875,0.200579,0.220444,0.003493,0.138241,0.029155,0.099735
ba,0.259042,0.031462,0.011671,0.000679,0.03998,0.00221,0.040681,0.212028
me,0.137087,0.000452,0.036131,0.113108,0.055597,0.085416,0.096959,0.05952
Bo,0.13453,0.009052,0.007046,0.018857,0.007921,0.004207,0.008226,0.111286


In [27]:
# Compute the cosine similarities of the columns (categories) of Z with the fitted MCA
column_cosine_similarities_Z = mca_Z.column_cosine_similarities(Z)

# Display the cosine similarities table
column_cosine_similarities_Z

Unnamed: 0,0,1,2,3,4,5,6,7
F,0.006742,0.319741,0.105792,0.187959,0.002934,0.018801,0.356493,0.001539
M,0.006742,0.319741,0.105792,0.187959,0.002934,0.018801,0.356493,0.001539
16-,0.055613,0.410206,0.030698,0.004491,0.11921,0.095397,0.231257,0.053128
17,0.021387,0.000608,0.48739,0.042678,0.015824,0.338591,0.072783,0.020738
18,0.080778,0.002633,0.183838,0.000524,0.678296,0.022586,0.016346,0.014999
19+,0.021572,0.44262,0.082706,0.023009,0.190003,0.053848,0.087478,0.098764
al,0.063909,0.133826,0.277274,0.266404,0.004113,0.15503,0.029586,0.069858
ba,0.583429,0.065847,0.022077,0.001124,0.064415,0.003391,0.056491,0.203227
me,0.31588,0.000967,0.069925,0.191366,0.091644,0.134106,0.137746,0.058366
Bo,0.601468,0.037606,0.026459,0.061902,0.025335,0.012815,0.022675,0.21174


In [28]:
# Keep the category labels as row index and name the columns "Cos2 1", "Cos2 2", ...
row_names = column_cosine_similarities_Z.index
column_names = [f"Cos2 {k}" for k in range(1, column_cosine_similarities_Z.shape[1] + 1)]

# Rebuild the cosine similarities table with the new column labels
column_cosine_similarities_Z = pd.DataFrame(
    data=column_cosine_similarities_Z.to_numpy(),
    index=row_names,
    columns=column_names,
)

# Display the whole table
column_cosine_similarities_Z

Unnamed: 0,Cos2 1,Cos2 2,Cos2 3,Cos2 4,Cos2 5,Cos2 6,Cos2 7,Cos2 8
F,0.006742,0.319741,0.105792,0.187959,0.002934,0.018801,0.356493,0.001539
M,0.006742,0.319741,0.105792,0.187959,0.002934,0.018801,0.356493,0.001539
16-,0.055613,0.410206,0.030698,0.004491,0.11921,0.095397,0.231257,0.053128
17,0.021387,0.000608,0.48739,0.042678,0.015824,0.338591,0.072783,0.020738
18,0.080778,0.002633,0.183838,0.000524,0.678296,0.022586,0.016346,0.014999
19+,0.021572,0.44262,0.082706,0.023009,0.190003,0.053848,0.087478,0.098764
al,0.063909,0.133826,0.277274,0.266404,0.004113,0.15503,0.029586,0.069858
ba,0.583429,0.065847,0.022077,0.001124,0.064415,0.003391,0.056491,0.203227
me,0.31588,0.000967,0.069925,0.191366,0.091644,0.134106,0.137746,0.058366
Bo,0.601468,0.037606,0.026459,0.061902,0.025335,0.012815,0.022675,0.21174


Tabla de coordenadas y ayudas para la interpretación de las categorías para los tres primeros ejes

In [29]:
# Put side by side the first three columns of the category coordinates,
# of the contributions and of the cosine similarities
tablas_de_ayuda = [column_coordinates_Z, column_contributions_Z, column_cosine_similarities_Z]
help_columns_Z = pd.concat([tabla.iloc[:, :3] for tabla in tablas_de_ayuda], axis=1)

# Display the combined table rounded to 3 decimals
help_columns_Z.round(3)

Unnamed: 0,Componente 1,Componente 2,Componente 3,Cont1,Cont2,Cont3,Cos2 1,Cos2 2,Cos2 3
F,-0.129,0.89,-0.512,0.004,0.182,0.067,0.007,0.32,0.106
M,0.052,-0.359,0.207,0.001,0.074,0.027,0.007,0.32,0.106
16-,0.393,1.066,-0.292,0.03,0.241,0.02,0.056,0.41,0.031
17,-0.185,0.031,0.884,0.01,0.0,0.265,0.021,0.001,0.487
18,-0.749,-0.135,-1.13,0.052,0.002,0.142,0.081,0.003,0.184
19+,0.273,-1.236,-0.534,0.012,0.274,0.057,0.022,0.443,0.083
al,-0.536,0.775,1.116,0.039,0.088,0.201,0.064,0.134,0.277
ba,0.931,-0.313,-0.181,0.259,0.031,0.012,0.583,0.066,0.022
me,-0.666,-0.037,-0.313,0.137,0.0,0.036,0.316,0.001,0.07
Bo,-0.509,-0.127,-0.107,0.135,0.009,0.007,0.601,0.038,0.026


In [30]:
# Average contribution of a column: 1/p, where p is the number of columns of Z
mean_cont = 1/p
mean_cont

0.08333333333333333

### Nubes de individuos y de categorías en el plano factorial 1-2

In [31]:
# Colours of the categories; categories of the same variable share a colour.
# Defined before any trace is built so this cell does not depend on a
# `colors` variable left over from a previously executed cell.
colors = {'F':'blue','M':'blue','16-':'green','17':'green','18':'green','19+':'green',
          'al':'orange','ba':'orange','me':'orange','Bo':'red','Cu':'red','Ot':'red'}

# Create the figure
fig = go.Figure()

# Cloud of individuals on plane 1-2. The individuals are labelled by their
# integer row index, which never matches a category key of `colors`, so they
# are drawn in plain black.
fig.add_trace(go.Scatter(x=row_coordinates_Z['Componente 1'], y=row_coordinates_Z['Componente 2'],
                         mode='markers+text', text=row_coordinates_Z.index,
                         marker=dict(size=3, color='black'),
                         textposition='middle right',
                         name='Nube de individuos',
                         textfont=dict(size=7)))

# Cloud of categories on plane 1-2, coloured by variable
fig.add_trace(go.Scatter(x=column_coordinates_Z['Componente 1'], y=column_coordinates_Z['Componente 2'],
                         mode='markers+text', text=column_coordinates_Z.index,
                         marker=dict(size=6, color=[colors.get(x, 'black') for x in column_coordinates_Z.index]),
                         textposition='middle right',
                         name='Nube de categorías plano 1-2',
                         textfont=dict(size=15)))

# Figure size, titles, axis ranges and fonts
fig.update_layout(width=800, height=800,
                  title='Nubes de individuos y de categorías plano 1-2',
                  xaxis=dict(dtick=0.25, range=[-1,1.5], title='Componente 1 (16.8%)',
                             title_font=dict(size=20), tickfont=dict(size=15)),
                  yaxis=dict(dtick=0.25, range=[-1.3,1.55], title='Componente 2 (15.6%)',
                             title_font=dict(size=20), tickfont=dict(size=15)),
                  title_font=dict(size=25),
                  showlegend=False)

# Render the figure
fig.show()

### Nubes de individuos y de categorías en el plano factorial 2-3

In [32]:
# Colours of the categories; categories of the same variable share a colour.
# Defined before any trace is built so this cell does not depend on a
# `colors` variable left over from a previously executed cell.
colors = {'F':'blue','M':'blue','16-':'green','17':'green','18':'green','19+':'green',
          'al':'orange','ba':'orange','me':'orange','Bo':'red','Cu':'red','Ot':'red'}

# Create the figure
fig = go.Figure()

# Cloud of individuals on plane 2-3. The individuals are labelled by their
# integer row index, which never matches a category key of `colors`, so they
# are drawn in plain black.
fig.add_trace(go.Scatter(x=row_coordinates_Z['Componente 2'], y=row_coordinates_Z['Componente 3'],
                         mode='markers+text', text=row_coordinates_Z.index,
                         marker=dict(size=3, color='black'),
                         textposition='middle right',
                         name='Nube de individuos',
                         textfont=dict(size=7)))

# Cloud of categories on plane 2-3, coloured by variable
fig.add_trace(go.Scatter(x=column_coordinates_Z['Componente 2'], y=column_coordinates_Z['Componente 3'],
                         mode='markers+text', text=column_coordinates_Z.index,
                         marker=dict(size=6, color=[colors.get(x, 'black') for x in column_coordinates_Z.index]),
                         textposition='middle right',
                         name='Nube de categorías plano 2-3',
                         textfont=dict(size=15)))

# Figure size, titles, axis ranges and fonts
fig.update_layout(width=800, height=800,
                  title='Nubes de individuos y de categorías plano 2-3',
                  xaxis=dict(dtick=0.25, range=[-1.25,1.75], title='Componente 2 (15.6%)',
                             title_font=dict(size=20), tickfont=dict(size=15)),
                  yaxis=dict(dtick=0.25, range=[-1.3,1.75], title='Componente 3 (14.1%)',
                             title_font=dict(size=20), tickfont=dict(size=15)),
                  title_font=dict(size=25),
                  showlegend=False)

# Render the figure
fig.show()

### Razones de correlación de las variables cualitativas sobre los ejes

In [33]:
# Relabel the columns of the contributions table as "Componente 1", "Componente 2", ...
# while keeping the category labels as row index
row_names = column_contributions_Z.index
column_names = [f"Componente {k}" for k in range(1, column_contributions_Z.shape[1] + 1)]

# Rebuild the contributions table with the new column labels
column_contributions_Z = pd.DataFrame(
    data=column_contributions_Z.to_numpy(),
    index=row_names,
    columns=column_names,
)
column_contributions_Z

Unnamed: 0,Componente 1,Componente 2,Componente 3,Componente 4,Componente 5,Componente 6,Componente 7,Componente 8
F,0.003567,0.182065,0.066648,0.13545,0.00217,0.0146,0.305946,0.001913
M,0.00144,0.073515,0.026912,0.054693,0.000876,0.005895,0.123537,0.000773
16-,0.030355,0.240946,0.019949,0.003338,0.090957,0.076419,0.204728,0.06814
17,0.009781,0.000299,0.2654,0.026584,0.010117,0.227272,0.05399,0.022287
18,0.05245,0.00184,0.142121,0.000463,0.615661,0.021523,0.017214,0.022884
19+,0.012422,0.274296,0.056706,0.018046,0.152951,0.04551,0.081706,0.133644
al,0.038829,0.0875,0.200579,0.220444,0.003493,0.138241,0.029155,0.099735
ba,0.259042,0.031462,0.011671,0.000679,0.03998,0.00221,0.040681,0.212028
me,0.137087,0.000452,0.036131,0.113108,0.055597,0.085416,0.096959,0.05952
Bo,0.13453,0.009052,0.007046,0.018857,0.007921,0.004207,0.008226,0.111286


In [34]:
# One-column DataFrame holding only the eigenvalues, indexed by component
valores_propios = eigenvalues_summary_Z[['Valor propio']].copy()
valores_propios

Unnamed: 0,Valor propio
Componente 1,0.337
Componente 2,0.313
Componente 3,0.283
Componente 4,0.247
Componente 5,0.241
Componente 6,0.229
Componente 7,0.208
Componente 8,0.143


In [35]:
# Scale each column contribution by s and by the eigenvalue of its component.
# s comes from an earlier cell (per the original note: the number of variables, i.e. columns of Y).
# Transposing the one-column eigenvalue table yields a single row, which
# broadcasts across all rows of the contribution matrix.
eigenvalue_row = valores_propios.values.T
corr = s * column_contributions_Z.values * eigenvalue_row

# Wrap the result with the same row and column labels as column_contributions_Z
corr_df = pd.DataFrame(
    corr,
    columns=column_contributions_Z.columns,
    index=column_contributions_Z.index,
)

# Display the table
corr_df

Unnamed: 0,Componente 1,Componente 2,Componente 3,Componente 4,Componente 5,Componente 6,Componente 7,Componente 8
F,0.004809,0.227945,0.075446,0.133824,0.002092,0.013374,0.254547,0.001094
M,0.001942,0.092041,0.030464,0.054036,0.000845,0.0054,0.102782,0.000442
16-,0.040918,0.301664,0.022583,0.003298,0.087682,0.069999,0.170334,0.038976
17,0.013185,0.000375,0.300433,0.026265,0.009753,0.208181,0.04492,0.012748
18,0.070703,0.002304,0.160881,0.000458,0.593498,0.019715,0.014322,0.01309
19+,0.016745,0.343418,0.064192,0.017829,0.147445,0.041687,0.06798,0.076444
al,0.052342,0.10955,0.227056,0.217799,0.003367,0.126629,0.024257,0.057048
ba,0.349188,0.03939,0.013211,0.000671,0.038541,0.002024,0.033847,0.12128
me,0.184793,0.000566,0.0409,0.111751,0.053595,0.078241,0.08067,0.034046
Bo,0.181346,0.011333,0.007976,0.01863,0.007636,0.003853,0.006844,0.063655


### Tabla de razones de correlación de las variables sobre los ejes

In [36]:
# Only the first three components are used in this table
first_three = corr_df.iloc[:, 0:3]

# Add up the rows belonging to each variable (row positions are fixed by the
# order of the categories in corr_df): 0-1 -> gen, 2-5 -> edad, 6-8 -> estr, 9+ -> orig
corr_gen = first_three.iloc[0:2].sum(axis=0).to_frame(name='gen')
corr_edad = first_three.iloc[2:6].sum(axis=0).to_frame(name='edad')
corr_estr = first_three.iloc[6:9].sum(axis=0).to_frame(name='estr')
corr_orig = first_three.iloc[9:].sum(axis=0).to_frame(name='orig')

# Stack the four per-variable sums as rows (one row per variable, one column per component)
corr_rat = pd.concat([corr_gen.T, corr_edad.T, corr_estr.T, corr_orig.T])

# Display the table
corr_rat

Unnamed: 0,Componente 1,Componente 2,Componente 3
gen,0.00675,0.319986,0.105909
edad,0.141551,0.64776,0.548089
estr,0.586323,0.149506,0.281167
orig,0.613375,0.134747,0.196834


### Razones de correlación sobre el plano factorial 1-2

In [37]:
# Scatter plot of corr_rat on components 1 and 2, one labelled point per variable.

# Settings shared by both axes (same tick step, range and fonts)
axis_style = dict(dtick=0.05, range=[-0.1, 0.7],
                  title_font=dict(size=20), tickfont=dict(size=15))

# Points: x = Componente 1, y = Componente 2, labelled with the row names of corr_rat
points = go.Scatter(
    x=corr_rat['Componente 1'],
    y=corr_rat['Componente 2'],
    text=corr_rat.index,
    mode='markers+text',
    textposition='middle right',
    textfont=dict(size=15),
    marker=dict(size=6, color='black'),
    name='Razones de correlación de las variables sobre los ejes 1 y 2',
)

fig = go.Figure()
fig.add_trace(points)

# Layout: square canvas, titled axes, no legend.
# 'title' is listed before 'title_font' on purpose (same order as the original call).
fig.update_layout(
    width=800, height=800,
    title='Razones de correlación de las variables sobre los ejes 1 y 2',
    xaxis=dict(title='Componente 1', **axis_style),
    yaxis=dict(title='Componente 2', **axis_style),
    title_font=dict(size=25),
    showlegend=False,
)

# Render the figure
fig.show()

### Razones de correlación sobre el plano factorial 2-3

In [38]:
# Scatter plot of corr_rat on components 2 and 3, one labelled point per variable.

# Settings shared by both axes (same tick step, range and fonts)
axis_style = dict(dtick=0.05, range=[-0.1, 0.7],
                  title_font=dict(size=20), tickfont=dict(size=15))

# Points: x = Componente 2, y = Componente 3, labelled with the row names of corr_rat
points = go.Scatter(
    x=corr_rat['Componente 2'],
    y=corr_rat['Componente 3'],
    text=corr_rat.index,
    mode='markers+text',
    textposition='middle right',
    textfont=dict(size=15),
    marker=dict(size=6, color='black'),
    name='Razones de correlación de las variables sobre los ejes 2 y 3',
)

fig = go.Figure()
fig.add_trace(points)

# Layout: square canvas, titled axes, no legend.
# 'title' is listed before 'title_font' on purpose (same order as the original call).
fig.update_layout(
    width=800, height=800,
    title='Razones de correlación de las variables sobre los ejes 2 y 3',
    xaxis=dict(title='Componente 2', **axis_style),
    yaxis=dict(title='Componente 3', **axis_style),
    title_font=dict(size=25),
    showlegend=False,
)

# Render the figure
fig.show()

## Elementos suplementarios

In [39]:
# Take from Y_raw the career label ('carr') plus the seven per-career columns
carr_columns = ['carr', 'Biol', 'Esta', 'Farm', 'Fisi', 'Geol', 'Mate', 'Quim']
carr = Y_raw.loc[:, carr_columns]

# Preview the first rows
carr.head()

Unnamed: 0,carr,Biol,Esta,Farm,Fisi,Geol,Mate,Quim
0,Biol,1,0,0,0,0,0,0
1,Biol,1,0,0,0,0,0,0
2,Biol,1,0,0,0,0,0,0
3,Biol,1,0,0,0,0,0,0
4,Biol,1,0,0,0,0,0,0


In [40]:
# Keep only the seven per-career columns of carr (drops the 'carr' label column)
career_columns = ['Biol', 'Esta', 'Farm', 'Fisi', 'Geol', 'Mate', 'Quim']
Z_carr = carr[career_columns]

# Preview the first rows
Z_carr.head()

Unnamed: 0,Biol,Esta,Farm,Fisi,Geol,Mate,Quim
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [41]:
# Column coordinates of the career columns (Z_carr), obtained from the
# already-fitted mca_Z object via its column_coordinates method
column_coordinates_Z_carr = mca_Z.column_coordinates(Z_carr)

# Preview the first five rows (5 is the default of head())
column_coordinates_Z_carr.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
Biol,0.018234,0.145516,0.067624,0.105336,0.105171,-0.01236,-0.110675,0.202049
Esta,-0.028989,-0.103706,0.01822,0.048353,-0.139245,-0.073012,0.020556,-0.266394
Farm,-0.167754,0.197705,-0.392484,0.321936,0.116477,-0.007801,-0.323876,-0.095436
Fisi,-0.094701,-0.029141,0.126517,-0.203833,0.075708,-0.108546,0.219167,0.072781
Geol,0.040216,0.076405,0.635285,-0.326797,0.047158,0.103181,-0.094607,-0.060513


In [42]:
# Relabel the columns of column_coordinates_Z_carr as "Componente 1", "Componente 2", ...
# while keeping the existing row labels.
row_names = column_coordinates_Z_carr.index
column_names = ["Componente " + str(k) for k in range(1, column_coordinates_Z_carr.shape[1] + 1)]
column_coordinates_Z_carr = pd.DataFrame(
    column_coordinates_Z_carr.values,
    columns=column_names,
    index=row_names,
)

# Display the relabelled table
column_coordinates_Z_carr

Unnamed: 0,Componente 1,Componente 2,Componente 3,Componente 4,Componente 5,Componente 6,Componente 7,Componente 8
Biol,0.018234,0.145516,0.067624,0.105336,0.105171,-0.01236,-0.110675,0.202049
Esta,-0.028989,-0.103706,0.01822,0.048353,-0.139245,-0.073012,0.020556,-0.266394
Farm,-0.167754,0.197705,-0.392484,0.321936,0.116477,-0.007801,-0.323876,-0.095436
Fisi,-0.094701,-0.029141,0.126517,-0.203833,0.075708,-0.108546,0.219167,0.072781
Geol,0.040216,0.076405,0.635285,-0.326797,0.047158,0.103181,-0.094607,-0.060513
Mate,0.049945,-0.461958,-0.11213,-0.188788,-0.247928,0.011462,0.1634,0.201561
Quim,0.259035,0.106028,-0.156046,0.128527,-0.017912,0.155827,0.109273,-0.03346


### Valores test de las carreras

In [43]:
# Count how many rows of admi fall in each career and return the result as a
# two-column table: 'carr' (career label) and 'nCarr' (number of rows)
nCarr = (
    admi['carr']
    .value_counts()
    .rename_axis('carr')
    .reset_index(name='nCarr')
)

# Display the counts
nCarr

Unnamed: 0,carr,nCarr
0,Fisi,82
1,Farm,73
2,Esta,66
3,Biol,63
4,Quim,63
5,Mate,53
6,Geol,45


### Valores test de cada carrera para la Componente 1

In [44]:
# Test values ("valores test") of every career on Component 1.
# For a career with n rows out of the N rows of admi, the value stored is
#     sqrt(n * (N - 1) / (N - n)) * (coordinate of the career on the component)
vt_comp1_dict = {}

# Total number of rows of admi; it does not change inside the loop
n_total = admi.shape[0]

# Visit the careers in alphabetical order.
# The loop variable is called `carrera`, not `carr`, so that it does not
# overwrite the `carr` DataFrame created earlier in the notebook.
for carrera in sorted(nCarr['carr']):
    # Number of rows of admi in the current career
    n = nCarr.loc[nCarr['carr'] == carrera, 'nCarr'].values[0]
    # Test value of the current career on Component 1
    vt_comp1_dict[carrera] = (
        np.sqrt(n * (n_total - 1) / (n_total - n))
        * column_coordinates_Z_carr.loc[carrera, 'Componente 1']
    )

# One row per career, single column 'VT1'
vt_comp1 = pd.DataFrame.from_dict(vt_comp1_dict, orient='index', columns=['VT1'])

### Valores test de cada carrera para la Componente 2

In [45]:
# Test values ("valores test") of every career on Component 2.
# For a career with n rows out of the N rows of admi, the value stored is
#     sqrt(n * (N - 1) / (N - n)) * (coordinate of the career on the component)
vt_comp2_dict = {}

# Total number of rows of admi; it does not change inside the loop
n_total = admi.shape[0]

# Visit the careers in alphabetical order.
# The loop variable is called `carrera`, not `carr`, so that it does not
# overwrite the `carr` DataFrame created earlier in the notebook.
for carrera in sorted(nCarr['carr']):
    # Number of rows of admi in the current career
    n = nCarr.loc[nCarr['carr'] == carrera, 'nCarr'].values[0]
    # Test value of the current career on Component 2.
    # Stored in vt_comp2_dict (the dictionary of this component), so the
    # Component 1 dictionary is no longer overwritten.
    vt_comp2_dict[carrera] = (
        np.sqrt(n * (n_total - 1) / (n_total - n))
        * column_coordinates_Z_carr.loc[carrera, 'Componente 2']
    )

# One row per career, single column 'VT2'
vt_comp2 = pd.DataFrame.from_dict(vt_comp2_dict, orient='index', columns=['VT2'])

### Valores test de cada carrera para la Componente 3

In [46]:
# Test values ("valores test") of every career on Component 3.
# For a career with n rows out of the N rows of admi, the value stored is
#     sqrt(n * (N - 1) / (N - n)) * (coordinate of the career on the component)
vt_comp3_dict = {}

# Total number of rows of admi; it does not change inside the loop
n_total = admi.shape[0]

# Visit the careers in alphabetical order.
# The loop variable is called `carrera`, not `carr`, so that it does not
# overwrite the `carr` DataFrame created earlier in the notebook.
for carrera in sorted(nCarr['carr']):
    # Number of rows of admi in the current career
    n = nCarr.loc[nCarr['carr'] == carrera, 'nCarr'].values[0]
    # Test value of the current career on Component 3.
    # Stored in vt_comp3_dict (the dictionary of this component), so the
    # Component 1 dictionary is no longer overwritten.
    vt_comp3_dict[carrera] = (
        np.sqrt(n * (n_total - 1) / (n_total - n))
        * column_coordinates_Z_carr.loc[carrera, 'Componente 3']
    )

# One row per career, single column 'VT3'
vt_comp3 = pd.DataFrame.from_dict(vt_comp3_dict, orient='index', columns=['VT3'])

### Tabla de coordenadas y ayudas para la interpretación de las categorías suplementarias

In [47]:
# Side-by-side table for the careers: coordinates on the first three
# components followed by the test values VT1, VT2 and VT3
coords_first_three = column_coordinates_Z_carr.iloc[:, :3]
help_columns_Z_carr = pd.concat(
    [coords_first_three, vt_comp1, vt_comp2, vt_comp3],
    axis='columns',
)

# Display rounded to 3 decimals (the stored table keeps full precision)
help_columns_Z_carr.round(3)

Unnamed: 0,Componente 1,Componente 2,Componente 3,VT1,VT2,VT3
Biol,0.018,0.146,0.068,0.156,1.245,0.579
Esta,-0.029,-0.104,0.018,-0.255,-0.912,0.16
Farm,-0.168,0.198,-0.392,-1.566,1.845,-3.664
Fisi,-0.095,-0.029,0.127,-0.948,-0.292,1.267
Geol,0.04,0.076,0.635,0.284,0.54,4.49
Mate,0.05,-0.462,-0.112,0.387,-3.579,-0.869
Quim,0.259,0.106,-0.156,2.217,0.907,-1.335


### Proyección de las carreras sobre el plano factorial 1-2

In [48]:
# Plane 1-2: active categories (column_coordinates_Z) and careers
# (column_coordinates_Z_carr) drawn on the same figure.

# One color per group of category labels; labels absent from this mapping
# are drawn in black (see the .get fallback below)
color_groups = {
    'blue': ('F', 'M'),
    'green': ('16-', '17', '18', '19+'),
    'orange': ('al', 'ba', 'me'),
    'red': ('Bo', 'Cu', 'Ot'),
}
colors = {label: shade for shade, labels in color_groups.items() for label in labels}
category_colors = [colors.get(label, 'black') for label in column_coordinates_Z.index]

# Text/marker presentation shared by both traces
label_style = dict(mode='markers+text', textposition='middle right', textfont=dict(size=15))
# Font settings shared by both axes
axis_fonts = dict(title_font=dict(size=20), tickfont=dict(size=15))

fig = go.Figure()

# Trace 1: cloud of categories on components 1 and 2
fig.add_trace(go.Scatter(
    x=column_coordinates_Z['Componente 1'],
    y=column_coordinates_Z['Componente 2'],
    text=column_coordinates_Z.index,
    marker=dict(size=6, color=category_colors),
    name='Nube de categorías',
    **label_style,
))

# Trace 2: cloud of careers on components 1 and 2, all in black
fig.add_trace(go.Scatter(
    x=column_coordinates_Z_carr['Componente 1'],
    y=column_coordinates_Z_carr['Componente 2'],
    text=column_coordinates_Z_carr.index,
    marker=dict(size=6, color='black'),
    name='Nube de carreras',
    **label_style,
))

# Layout: square canvas, titled axes, no legend.
# 'title' is listed before 'title_font' on purpose (same order as the original call).
fig.update_layout(
    width=800, height=800,
    title='Nubes de categorías y carreras plano 1-2',
    xaxis=dict(dtick=0.25, range=[-1, 1.5], title='Componente 1 (16.8%)', **axis_fonts),
    yaxis=dict(dtick=0.25, range=[-1.3, 1.25], title='Componente 2 (15.6%)', **axis_fonts),
    title_font=dict(size=25),
    showlegend=False,
)

# Render the figure
fig.show()

### Proyección de las carreras sobre el plano factorial 2-3

In [49]:
# Plane 2-3: active categories (column_coordinates_Z) and careers
# (column_coordinates_Z_carr) drawn on the same figure.

# One color per group of category labels; labels absent from this mapping
# are drawn in black (see the .get fallback below)
color_groups = {
    'blue': ('F', 'M'),
    'green': ('16-', '17', '18', '19+'),
    'orange': ('al', 'ba', 'me'),
    'red': ('Bo', 'Cu', 'Ot'),
}
colors = {label: shade for shade, labels in color_groups.items() for label in labels}
category_colors = [colors.get(label, 'black') for label in column_coordinates_Z.index]

# Text/marker presentation shared by both traces
label_style = dict(mode='markers+text', textposition='middle right', textfont=dict(size=15))
# Font settings shared by both axes
axis_fonts = dict(title_font=dict(size=20), tickfont=dict(size=15))

fig = go.Figure()

# Trace 1: cloud of categories on components 2 and 3
fig.add_trace(go.Scatter(
    x=column_coordinates_Z['Componente 2'],
    y=column_coordinates_Z['Componente 3'],
    text=column_coordinates_Z.index,
    marker=dict(size=6, color=category_colors),
    name='Nube de categorías',
    **label_style,
))

# Trace 2: cloud of careers on components 2 and 3, all in black
fig.add_trace(go.Scatter(
    x=column_coordinates_Z_carr['Componente 2'],
    y=column_coordinates_Z_carr['Componente 3'],
    text=column_coordinates_Z_carr.index,
    marker=dict(size=6, color='black'),
    name='Nube de carreras',
    **label_style,
))

# Layout: square canvas, titled axes, no legend.
# 'title' is listed before 'title_font' on purpose (same order as the original call).
fig.update_layout(
    width=800, height=800,
    title='Nubes de categorías y carreras plano 2-3',
    xaxis=dict(dtick=0.25, range=[-1.5, 1.25], title='Componente 2 (15.6%)', **axis_fonts),
    yaxis=dict(dtick=0.25, range=[-1.25, 1.5], title='Componente 3 (14.1%)', **axis_fonts),
    title_font=dict(size=25),
    showlegend=False,
)

# Render the figure
fig.show()

## Retorno a los datos

### Tablas de contingencia de las carreras con respecto a las variables activas

In [50]:
# Contingency table of career ('carr') by stratum ('estr') from admi,
# with row/column totals labelled 'Suma'
K_carr_est = pd.crosstab(admi['carr'], admi['estr'], margins=True, margins_name='Suma')

# Put the stratum columns in the order bajo, medio, alto, followed by the total
estr_order = ['bajo', 'medio', 'alto', 'Suma']
K_carr_est = K_carr_est.loc[:, estr_order]

# Display the table
K_carr_est

estr,bajo,medio,alto,Suma
carr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Biol,23,26,14,63
Esta,29,29,8,66
Farm,30,36,7,73
Fisi,27,36,19,82
Geol,18,9,18,45
Mate,21,25,7,53
Quim,31,24,8,63
Suma,179,185,81,445


In [51]:
# Row profiles of K_carr_est: drop the total row and total column, then divide
# every row by its own sum (values are proportions between 0 and 1)
est_counts = K_carr_est.iloc[:-1, :-1]
K_carr_est_perc = est_counts.apply(lambda row: row / row.sum(), axis=1)

# Stacked horizontal bars: one bar per career, one segment per stratum
fig = go.Figure()
for level, shares in K_carr_est_perc.items():
    fig.add_trace(go.Bar(
        x=shares,
        y=K_carr_est_perc.index,
        name=level,
        orientation='h',
        # Segment label: the proportion with two decimals
        text=[f'{val:.2f}' for val in shares],
        textposition='inside',
        textfont=dict(color='black'),
    ))

# Layout: stacked bars with a tick every 0.10 on the x axis
fig.update_layout(
    title='Estrato',
    xaxis={'dtick': 0.10},
    yaxis_title='Carrera',
    barmode='stack',
    width=700,
    height=400,
)

# Render the figure
fig.show()

In [52]:
# Contingency table of career ('carr') by origin ('orig') from admi,
# with row/column totals labelled 'Suma'
K_carr_orig = pd.crosstab(
    admi['carr'],
    admi['orig'],
    margins_name='Suma',
    margins=True,
)

# Display the table
K_carr_orig

orig,Bogo,Cund,Otro,Suma
carr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Biol,39,7,17,63
Esta,51,5,10,66
Farm,56,4,13,73
Fisi,58,5,19,82
Geol,31,5,9,45
Mate,38,4,11,53
Quim,38,8,17,63
Suma,311,38,96,445


In [53]:
# Row profiles of K_carr_orig: drop the total row and total column, then divide
# every row by its own sum (values are proportions between 0 and 1)
orig_counts = K_carr_orig.iloc[:-1, :-1]
K_carr_orig_perc = orig_counts.apply(lambda row: row / row.sum(), axis=1)

# Stacked horizontal bars: one bar per career, one segment per origin
fig = go.Figure()
for level, shares in K_carr_orig_perc.items():
    fig.add_trace(go.Bar(
        x=shares,
        y=K_carr_orig_perc.index,
        name=level,
        orientation='h',
        # Segment label: the proportion with two decimals
        text=[f'{val:.2f}' for val in shares],
        textposition='inside',
        textfont=dict(color='black'),
    ))

# Layout: stacked bars with a tick every 0.10 on the x axis
fig.update_layout(
    title='Origen',
    xaxis={'dtick': 0.10},
    yaxis_title='Carrera',
    barmode='stack',
    width=700,
    height=400,
)

# Render the figure
fig.show()

In [54]:
# Contingency table of career ('carr') by gender ('gene') from admi,
# with row/column totals labelled 'Suma'
K_carr_gene = pd.crosstab(
    admi['carr'],
    admi['gene'],
    margins_name='Suma',
    margins=True,
)

# Display the table
K_carr_gene

gene,F,M,Suma
carr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Biol,23,40,63
Esta,17,49,66
Farm,40,33,73
Fisi,13,69,82
Geol,8,37,45
Mate,6,47,53
Quim,21,42,63
Suma,128,317,445


In [55]:
# Row profiles of K_carr_gene: drop the total row and total column, then divide
# every row by its own sum (values are proportions between 0 and 1)
gene_counts = K_carr_gene.iloc[:-1, :-1]
K_carr_gene_perc = gene_counts.apply(lambda row: row / row.sum(), axis=1)

# Stacked horizontal bars: one bar per career, one segment per gender
fig = go.Figure()
for level, shares in K_carr_gene_perc.items():
    fig.add_trace(go.Bar(
        x=shares,
        y=K_carr_gene_perc.index,
        name=level,
        orientation='h',
        # Segment label: the proportion with two decimals
        text=[f'{val:.2f}' for val in shares],
        textposition='inside',
        textfont=dict(color='black'),
    ))

# Layout: stacked bars with a tick every 0.10 on the x axis
fig.update_layout(
    title='Género',
    xaxis={'dtick': 0.10},
    yaxis_title='Carrera',
    barmode='stack',
    width=700,
    height=400,
)

# Render the figure
fig.show()

In [56]:
# Contingency table of career ('carr') by age group ('edad') from admi,
# with row/column totals labelled 'Suma'
K_carr_edad = pd.crosstab(
    admi['carr'],
    admi['edad'],
    margins_name='Suma',
    margins=True,
)

# Display the table
K_carr_edad

edad,a16m,a17,a18,a19M,Suma
carr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Biol,15,27,9,12,63
Esta,18,28,5,15,66
Farm,18,26,15,14,73
Fisi,21,34,12,15,82
Geol,11,25,2,7,45
Mate,11,14,5,23,53
Quim,24,17,8,14,63
Suma,118,171,56,100,445


In [57]:
# Row profiles of K_carr_edad: drop the total row and total column, then divide
# every row by its own sum (values are proportions between 0 and 1)
edad_counts = K_carr_edad.iloc[:-1, :-1]
K_carr_edad_perc = edad_counts.apply(lambda row: row / row.sum(), axis=1)

# Stacked horizontal bars: one bar per career, one segment per age group
fig = go.Figure()
for level, shares in K_carr_edad_perc.items():
    fig.add_trace(go.Bar(
        x=shares,
        y=K_carr_edad_perc.index,
        name=level,
        orientation='h',
        # Segment label: the proportion with two decimals
        text=[f'{val:.2f}' for val in shares],
        textposition='inside',
        textfont=dict(color='black'),
    ))

# Layout: stacked bars with a tick every 0.10 on the x axis
fig.update_layout(
    title='Edad',
    xaxis={'dtick': 0.10},
    yaxis_title='Carrera',
    barmode='stack',
    width=700,
    height=400,
)

# Render the figure
fig.show()