# Master Thesis: Family Firms
### EHG

In [106]:
!pip install pandas openpyxl nltk matplotlib unidecode



## Data Cleaning Processs

In [107]:
import pandas as pd
%cd
df = pd.read_excel("Thesis/data_Master_thesisFMBR2_2025-07-22_12-26.xlsx", engine="openpyxl")
df.head()


/home/jovyan


Unnamed: 0,CASE,SERIAL,REF,QUESTNNR,MODE,STARTED,AL01,AL01x01,AL01x02,AL01x03,...,MAILSENT,LASTDATA,STATUS,FINISHED,Q_VIEWER,LASTPAGE,MAXPAGE,MISSING,MISSREL,TIME_RSI
0,Interview number (sequential),Serial number (if provided),Reference (if provided in link),Questionnaire that has been used in the interview,Interview mode,Time the interview has started (Europe/Berlin),Tarea de libre asociacion: Number of mentions,Tarea de libre asociacion: Mention 1,Tarea de libre asociacion: Mention 2,Tarea de libre asociacion: Mention 3,...,Time when the invitation mailing was sent (per...,Time when the data was most recently updated,Interview status marker,Has the interview been finished (reached last ...,Did the respondent only view the questionnaire...,Last page that the participant has handled in ...,Hindmost page handled by the participant,Missing answers in percent,Missing answers (weighted by relevance),Completion Speed (relative)
1,23,,,base,interview,2025-06-12 10:22:51,5,Familia,Tradicion,Oportunidades,...,,2025-06-12 10:24:41,,1,0,3,3,0,0,1.71
2,27,,,base,interview,2025-06-13 20:27:35,5,Patrimonio familiar,Hermanos,Prosperidad familiar,...,,2025-06-13 20:29:58,,1,0,3,3,0,0,0.99
3,32,,,base,interview,2025-06-18 19:46:54,5,Equipo,Problemas,Compromiso,...,,2025-06-21 12:21:30,,1,0,3,3,0,0,0.4
4,35,,,base,interview,2025-06-21 10:17:57,5,Dinero,Bienestar,Familiar,...,,2025-06-21 10:21:27,,1,0,3,3,0,0,0.94


In [108]:
rows, cols = df.shape
print(f"This dataset has {rows} rows and {cols} columns.")

This dataset has 293 rows and 37 columns.


In [109]:
df = df.drop(index=0).reset_index(drop=True)

In [110]:
rows, cols = df.shape
print(f"This dataset has {rows} rows and {cols} columns.")

This dataset has 292 rows and 37 columns.


In [111]:
# Standardize country column
df["GI03"] = df["GI03"].astype(str).str.lower().str.strip()

# Step 1: Keep only Mexican respondents
is_mexican = ~df["GI03"].str.startswith("otro")
non_mexican_count = (~is_mexican).sum()

# Step 2: Prepare association columns and clean meaningless entries
assoc_cols = [f"AL01x0{i}" for i in range(1, 8)]
df[assoc_cols] = df[assoc_cols].fillna('').astype(str)

# Define what counts as meaningless (adjustable)
def is_meaningless(entry):
    stripped = entry.strip().lower()
    meaningless_entries = {"", ".", "..", "...", "-", "_", "na", "n/a", "none", "ninguno", "ninguna", "hola", "solo", "lugar"}
    return stripped in meaningless_entries or stripped.isdigit()

# Clean associations by replacing meaningless entries with empty string
df[assoc_cols] = df[assoc_cols].applymap(lambda x: "" if is_meaningless(x) else x)

# Step 3: Count remaining meaningful associations
valid_assoc_count = df[assoc_cols].apply(lambda row: sum(cell.strip() != '' for cell in row), axis=1)
has_5_or_more = valid_assoc_count >= 5
less_than_5_count = (~has_5_or_more).sum()

# Step 4: Combine filters and apply
df_filtered = df[is_mexican & has_5_or_more].copy()
total_excluded = (~(is_mexican & has_5_or_more)).sum()

# Output summary
print(f"Exclusion Summary:")
print(f"‚Ä¢ Non-Mexican respondents: {non_mexican_count}")
print(f"‚Ä¢ Respondents with <5 meaningful associations: {less_than_5_count}")
print(f"‚Ä¢ Total excluded: {total_excluded}")
print(f"Remaining respondents: {df_filtered.shape[0]}")



Exclusion Summary:
‚Ä¢ Non-Mexican respondents: 11
‚Ä¢ Respondents with <5 meaningful associations: 41
‚Ä¢ Total excluded: 49
Remaining respondents: 243


In [112]:
# Respondents who were excluded due to not being identified as Mexican
df[~is_mexican][["GI03", "GI03_02"]].drop_duplicates()

Unnamed: 0,GI03,GI03_02
8,otro (por favor especifica):,Usa
53,otro (por favor especifica):,Colombia
63,otro (por favor especifica):,Espa√±a
81,otro (por favor especifica):,Austria
82,otro (por favor especifica):,Inglaterra
137,otro (por favor especifica):,Dinamarca
227,otro (por favor especifica):,Culopenelolandia
243,otro (por favor especifica):,Namekusei
249,otro (por favor especifica):,Pa√≠ses Bajos


In [113]:
# Normalize casing and spacing for all associations

# List of association columns
assoc_cols = [f"AL01x0{i}" for i in range(1, 8)]

# Apply lowercase and strip whitespace for each column
for col in assoc_cols:
    df_filtered[col] = df_filtered[col].astype(str).str.lower().str.strip()

# Preview the first 3 rows of cleaned associations
print("‚úÖ Sample normalized associations:")
display(df_filtered[assoc_cols].head(3))



‚úÖ Sample normalized associations:


Unnamed: 0,AL01x01,AL01x02,AL01x03,AL01x04,AL01x05,AL01x06,AL01x07
0,familia,tradicion,oportunidades,trabajar,independencia,,
1,patrimonio familiar,hermanos,prosperidad familiar,conflicto de intereses,inconformidad,,
2,equipo,problemas,compromiso,apoyo,buena organizaci√≥n,,


In [114]:
import re

# Function to remove punctuation, digits, and extra spaces
def remove_punct_and_digits(text):
    text = re.sub(r"[^\w\s]", " ", text)  # Replace punctuation with space
    text = re.sub(r"\d+", "", text)       # Remove digits
    text = re.sub(r"\s+", " ", text)      # Collapse multiple spaces
    return text.strip()

# Apply cleaning function to each association column
for col in assoc_cols:
    df_filtered[col] = df_filtered[col].apply(remove_punct_and_digits)

# Preview cleaned data
print("Sample after cleaning punctuation and digits:")
display(df_filtered[assoc_cols].head(3))



Sample after cleaning punctuation and digits:


Unnamed: 0,AL01x01,AL01x02,AL01x03,AL01x04,AL01x05,AL01x06,AL01x07
0,familia,tradicion,oportunidades,trabajar,independencia,,
1,patrimonio familiar,hermanos,prosperidad familiar,conflicto de intereses,inconformidad,,
2,equipo,problemas,compromiso,apoyo,buena organizaci√≥n,,


In [115]:
import nltk
from nltk.corpus import stopwords
from unidecode import unidecode

nltk.download('stopwords')

# Load and normalize Spanish stopwords
spanish_stopwords = set(unidecode(w.lower()) for w in stopwords.words('spanish'))

# Custom stopwords
custom_stopwords = set([
    "al", "algo", "algunas", "algun", "algunos", "cada", "cosa", "cosas", "dela", 
    "de", "del", "dia", "dias", "el", "en", "etc", "la", "las", "lugar", "los", 
    "mmm", "nada", "ninguno", "ojala", "otro", "otras", "otros", "ser", "sirva", 
    "solo", "tener", "toda", "todo", "x", "y"
])

# Combine Spanish and custom stopwords
all_stopwords = spanish_stopwords.union(custom_stopwords)

# Clean and filter stopwords from associations
def clean_words(row):
    return [unidecode(word.strip()) for word in row if word and word not in all_stopwords and len(word) > 2]

# Merge all 7 association columns into a list and clean words
df_filtered["cleaned_words"] = df_filtered[assoc_cols].values.tolist()
df_filtered["cleaned_words"] = df_filtered["cleaned_words"].apply(clean_words)

# Preview cleaned data
print("Cleaned word lists (stopwords removed):")
display(df_filtered[["cleaned_words"]].head(3))


Cleaned word lists (stopwords removed):


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,cleaned_words
0,"[familia, tradicion, oportunidades, trabajar, ..."
1,"[patrimonio familiar, hermanos, prosperidad fa..."
2,"[equipo, problemas, compromiso, apoyo, buena o..."


In [116]:
# Custom Dictionaries for Spelling Corrections and Translations

# Spelling corrections and English-to-Spanish translations
combined_corrections = {
    # English-to-Spanish translations
    "communication": "comunicacion",
    "entrepreneur": "emprendedor",
    "small business": "pequeno_negocio",
    "support": "apoyo",
    "teamwork": "trabajo_en_equipo",
    "values": "valores",

    # Spelling corrections
    "abilidades": "habilidades",
    "auto empleo": "autoempleo",
    "comunos": "comun",
    "diner": "dinero",
    "enyre": "entre",
    "entusiamo": "entusiasmo",
    "equpo": "equipo",
    "famialia": "familia",
    "familiates": "familiares",
    "famolia": "familia",
    "integrridad": "integridad",
    "jrs": "jr",
    "juniors": "jr",
    "legaso": "legado",
    "mayos": "mayor",
    "negicio": "negocio",
    "plaso": "plazo",
    "tewponsibility": "responsabilidad",
    "trbajo": "trabajo",
    "tinedas": "tiendas"
}

# Apply corrections to each word in a list
def apply_corrections(word_list, correction_dict):
    return [correction_dict.get(w, w) for w in word_list]

# Apply corrections to the cleaned words column
df_filtered["cleaned_words"] = df_filtered["cleaned_words"].apply(
    lambda words: apply_corrections(words, combined_corrections)
)

# Preview the result
print("‚úÖ After applying spelling and translation corrections:")
display(df_filtered[["cleaned_words"]].head(3))


‚úÖ After applying spelling and translation corrections:


Unnamed: 0,cleaned_words
0,"[familia, tradicion, oportunidades, trabajar, ..."
1,"[patrimonio familiar, hermanos, prosperidad fa..."
2,"[equipo, problemas, compromiso, apoyo, buena o..."


In [117]:
collocations = dict(sorted({
    "abuso poder": "abuso_poder",
    "alto ingreso": "ingreso_alto",
    "aprender a separar los negocios": "separar_personal_negocio",
    "auto empleo": "autoempleo",
    "baja rotacion": "baja_rotacion",
    "buena comunicacion": "buena_comunicacion",
    "buena inversion": "buena_inversion",
    "buena organizacion": "buena_organizacion",
    "buen negocio": "buen_negocio",
    "buen proposito": "buen_proposito",
    "buen sustento economico": "buen_sustento",
    "buen sustento econ√≥mico": "buen_sustento",
    "buen trato": "buen_trato",
    "capital social": "capital_social",
    "clase baja": "clase_baja",
    "clase media": "clase_media",
    "colaboradores unidos": "colaboracion",
    "comunicacion asertiva": "comunicacion_asertiva",
    "confianza mutua": "confianza_mutua",
    "conflicto intereses": "conflicto_intereses",
    "control total": "control_total",
    "creacion empleo": "creacion_empleo",
    "crecimento economico": "crecimento_economico",
    "dificil las relaciones hay que separar la familiar de la laborar": "dificultad_separar_personal_negocio",
    "dificultades para separar papeles familiares y laborales": "dificultad_separar_personal_negocio",
    "empresa familiar": "empresa_familiar",
    "es muy buen negocio": "buen_negocio",
    "familia nuclear": "familia_nuclear",
    "familia trabajando": "trabajo_familiar",
    "falta control": "falta_control",
    "falta de apoyo": "falta_apoyo",
    "falta de apoyo a familias emprendedoras": "falta_apoyo_emprendimiento",
    "falta de estructura": "falta_estructura",
    "falta de estructura corporativa": "falta_estructura",
    "falta de procesos": "falta_procesos",
    "falta de procedimientos": "falta_procesos",
    "fuente ingresos": "fuente_ingresos",
    "futuro seguro":"futuro_seguro",
    "generacion empleo": "generacion_empleo",
    "gente de nivel socioeconomico alto": "nivel_socioeconomico_alto",
    "gobierno corporativo": "gobierno_corporativo",
    "habilidades limitadas": "habilidades_limitadas",
    "herencia familiar": "herencia_familiar",
    "ideas nuevas": "ideas_nuevas",
    "ingreso alto": "ingreso_alto",
    "ingreso seguro":"ingreso_seguro",
    "inteligencia emocional": "inteligencia_emocional",
    "intereses personales": "intereses_personales",
    "largo plazo": "largo_plazo",
    "largo plaso": "largo_plazo",
    "mal negocio": "mal_negocio",
    "mayos atenci√≥n al detalle": "atencion_detalle",
    "mayor control": "mayor_control",
    "mayor paciencia": "mayor_paciencia",
    "mayor compromiso": "mayor_compromiso",
    "mayor enfoque": "mayor_enfoque",
    "mayor tiempo": "mayor_tiempo",
    "hermano mayor": "hermano_mayor",
    "mediana empresa": "mediana_empresa",
    "mediano negocio": "mediano_negocio",
    "mejor futuro": "mejor_futuro",
    "mejor trato": "mejor_trato",
    "mismo fin": "mismo_fin",
    "mucha inversion": "mucha_inversion",
    "muy buen negocio": "buen_negocio",
    "negocio familiar": "negocio_familiar",
    "negocio propio": "negocio_propio",
    "nivel alto": "nivel_alto",
    "nivel socioeconomico alto": "nivel_socioeconomico_alto",
    "patrimonio familiar": "patrimonio_familiar",
    "personas de confianza": "personas_confianza",
    "personas de confianza maxima": "personas_confianza",
    "apoyo de personas de confianza": "personas_confianza",
    "personas de la familia nuclear": "familia_nuclear",
    "pequeno negocio": "pequeno_negocio",
    "poca duracion": "poca_duracion",
    "poca inversion": "poca_inversion",
    "poca organizacion": "poca_organizacion",
    "problemas familiares": "problemas_familiares",
    "propio jefe": "propio_jefe",
    "el padre": "padre",
    "mi padre": "padre",
    "padre/hijo": "relacion_padre_hijo",
    "padre e hijo": "relacion_padre_hijo",
    "padre hijo": "relacion_padre_hijo",
    "pieza clave": "pieza_clave",  
    "paternalista": "paternalismo",  
    "salir adelante": "salir_adelante",
    "servicio reparacion padre hijo": "negocio_padre_hijo",
    "servicio reparacion (padre/hijo)": "negocio_padre_hijo",
    "separar el trabajo con lo familiar": "separar_personal_negocio",
    "separar lo familiar con el trabajo": "separar_personal_negocio",
    "separar los temas personales del negocio": "separar_personal_negocio",
    "separar problemas": "separar_personal_negocio",
    "sueldos altos": "ingreso_alto",
    "sueldo alto": "ingreso_alto",
    "trabajo duro": "trabajo_duro",
    "trabajo en equipo": "trabajo_en_equipo",
    "trabajo independiente": "trabajo_independiente",
    "un buen proposito": "buen_proposito",
    "trabajan hijos": "trabajo_familiar",
    "trabajan nietos": "trabajo_familiar",
    "trabajan yernos": "trabajo_familiar",
    "trabajan nueras": "trabajo_familiar",
    "trabajan hijos nietos yernos nueras": "trabajo_familiar",
    "solo trabajan familiares en el negocio": "trabajo_familiar",
    "trabajan miembros de una familia en una empresa": "trabajo_familiar",
    "trabajan miembros de una familia en una microempresa": "trabajo_familiar",
    "union familiar": "union_familiar",
    "valores familiares": "valores_familiares",
    "el emprendimiento de mi padre": "negocio_padre"
}.items()))

In [118]:
import re

# Function to merge collocations by replacing multi-word expressions
def merge_collocations(text, colloc_dict):
    # Ensure the input is a string
    if not isinstance(text, str):
        return text

    # Replace multi-word expressions with underscored versions
    for phrase, replacement in colloc_dict.items():
        # Use word boundaries to avoid partial matches
        pattern = r'\b' + re.escape(phrase) + r'\b'
        text = re.sub(pattern, replacement, text)
    
    return text

In [119]:
# Convert cleaned words back to sentence to apply collocations
df_filtered["temp_sentence"] = df_filtered["cleaned_words"].apply(lambda words: " ".join(words))
df_filtered["temp_sentence"] = df_filtered["temp_sentence"].apply(lambda txt: merge_collocations(txt, collocations))
df_filtered["cleaned_words"] = df_filtered["temp_sentence"].apply(lambda txt: txt.split())
df_filtered.drop("temp_sentence", axis=1, inplace=True)

In [120]:
#Clean merged words (again)
df_filtered["cleaned_words"] = df_filtered["cleaned_words"].apply(clean_words)

# Apply spelling + translation corrections again
df_filtered["cleaned_words"] = df_filtered["cleaned_words"].apply(
    lambda words: apply_corrections(words, combined_corrections)
)


In [121]:
# Backup original associations for diagnostics before filtering
df_filtered["original_assoc"] = df_filtered[assoc_cols].values.tolist()


In [122]:
print("Long associations BEFORE filtering (more than 4 words after stopword removal):\n")

long_count = 0
for idx, assoc_list in df_filtered["original_assoc"].items():
    for assoc in assoc_list:
        tokenized = tokenize_and_filter(str(assoc))
        if len(tokenized) > MAX_LEN:
            print(f" Respondent {idx}: {assoc} ‚Üí {tokenized}")
            long_count += 1
            if long_count >= 5:
                break
    if long_count >= 5:
        break


Long associations BEFORE filtering (more than 4 words after stopword removal):

 Respondent 25: iniciativa de emprender negocio por tus propios medios e intereses ‚Üí ['iniciativa', 'emprender', 'negocio', 'propios', 'medios', 'intereses']
 Respondent 42: puestos dentro de la empresa para familiares vs eficiencia y o preparaci√≥n ‚Üí ['puestos', 'dentro', 'empresa', 'familiares', 'eficiencia', 'preparacion']
 Respondent 51: dificultades para separar papeles familiares y laborales ‚Üí ['dificultades', 'separar', 'papeles', 'familiares', 'laborales']
 Respondent 51: sue√±os de alg√∫n os integrantes de la familia hechos realidad ‚Üí ['suenos', 'integrantes', 'familia', 'hechos', 'realidad']
 Respondent 75: peque√±o negocio mediana empresa algo chico ‚Üí ['pequeno', 'negocio', 'mediana', 'empresa', 'chico']


In [123]:
# Cell 15 ‚Äî Remove rows where ANY individual association is too long

assoc_cols = [f"AL01x0{i}" for i in range(1, 8)]

# Check columns exist
missing = [col for col in assoc_cols if col not in df_filtered.columns]
if missing:
    raise ValueError(f"‚ùå These expected association columns are missing: {missing}")

# Define max length of words allowed in a single association (AFTER cleaning)
MAX_LEN = 4

# Define tokenization + stopword filtering
def tokenize_and_filter(text):
    words = [
        unidecode(w.strip().lower())
        for w in text.split()
        if w and len(w) > 2
    ]
    return [w for w in words if w not in all_stopwords]

# Diagnostic print: Show long associations (AFTER stopword filtering)
print("üîç Long associations BEFORE filtering (more than 4 words in one cell):\n")
long_assoc_rows = set()
long_count = 0

for idx, row in df_filtered.iterrows():
    for assoc in row[assoc_cols]:
        tokenized = tokenize_and_filter(str(assoc))
        if len(tokenized) > MAX_LEN:
            print(f"üßæ Respondent {idx}: {assoc} ‚Üí {tokenized}")
            long_assoc_rows.add(idx)
            long_count += 1
            if long_count >= 5:
                break
    if long_count >= 5:
        break

# Drop rows where ANY association is too long
df_filtered = df_filtered[~df_filtered.index.isin(long_assoc_rows)].reset_index(drop=True)

# Rebuild assoc_sentence from already-cleaned `cleaned_words` (just in case)
df_filtered["assoc_sentence"] = df_filtered["cleaned_words"].apply(lambda words: " ".join(words))

# Re-tokenize (in case of small reintroductions)
df_filtered["cleaned_words"] = df_filtered["assoc_sentence"].apply(tokenize_and_filter)

# Keep only rows with at least 1 word left
df_filtered = df_filtered[df_filtered["cleaned_words"].apply(len) > 0].reset_index(drop=True)

# ‚úÖ Final preview
print(f"\n‚úÖ Remaining respondents after dropping rows with long associations: {len(df_filtered)}")
display(df_filtered[["cleaned_words"]].head(3))


üîç Long associations BEFORE filtering (more than 4 words in one cell):

üßæ Respondent 25: iniciativa de emprender negocio por tus propios medios e intereses ‚Üí ['iniciativa', 'emprender', 'negocio', 'propios', 'medios', 'intereses']
üßæ Respondent 42: puestos dentro de la empresa para familiares vs eficiencia y o preparaci√≥n ‚Üí ['puestos', 'dentro', 'empresa', 'familiares', 'eficiencia', 'preparacion']
üßæ Respondent 51: dificultades para separar papeles familiares y laborales ‚Üí ['dificultades', 'separar', 'papeles', 'familiares', 'laborales']
üßæ Respondent 51: sue√±os de alg√∫n os integrantes de la familia hechos realidad ‚Üí ['suenos', 'integrantes', 'familia', 'hechos', 'realidad']
üßæ Respondent 75: peque√±o negocio mediana empresa algo chico ‚Üí ['pequeno', 'negocio', 'mediana', 'empresa', 'chico']

‚úÖ Remaining respondents after dropping rows with long associations: 239


Unnamed: 0,cleaned_words
0,"[familia, tradicion, oportunidades, trabajar, ..."
1,"[patrimonio_familiar, hermanos, prosperidad, f..."
2,"[equipo, problemas, compromiso, apoyo, buena_o..."


In [124]:
for idx, row in df_filtered.iterrows():
    for col in assoc_cols:
        tokens = tokenize_and_filter(str(row[col]))
        if len(tokens) > MAX_LEN:
            print(f"Row {idx}, Col {col}: {tokens}")


Row 59, Col AL01x03: ['personas', 'mayores', 'familia', 'creen', 'edad', 'derecho', 'opinion', 'importante']
Row 59, Col AL01x05: ['necesitan', 'definir', 'respetar', 'roles', 'familiar']
Row 59, Col AL01x06: ['disque', 'dificil', 'sobreviva', 'transicion', 'segunda', 'tercera', 'generacion']
Row 66, Col AL01x03: ['crecimiento', 'estanca', 'falta', 'especialistas', 'familiares']
Row 66, Col AL01x04: ['falta', 'criterio', 'juzgar', 'situaciones', 'familia']
Row 67, Col AL01x01: ['crecimiento', 'empresa', 'confianza', 'trabajar', 'familia', 'ideas', 'ayudan', 'empresa', 'familias', 'ayudamos']
Row 85, Col AL01x03: ['usar', 'termino', 'familia', 'hablar', 'valor', 'empresa']
Row 91, Col AL01x01: ['trabajo', 'familia', 'confort', 'confianza', 'seguridad']
Row 95, Col AL01x01: ['respeto', 'claridad', 'honradez', 'confianza', 'claridad']
Row 95, Col AL01x02: ['compromiso', 'comunicacion', 'lealtad', 'principios', 'morales']
Row 107, Col AL01x01: ['negocio', 'personas', 'toman', 'decisiones',

In [125]:
# Reapply corrections after long-association filtering

def apply_corrections(word_list, correction_dict):
    return [correction_dict.get(w, w) for w in word_list]

# Re-apply correction to cleaned_words AFTER filtering
df_filtered["cleaned_words"] = df_filtered["cleaned_words"].apply(
    lambda words: apply_corrections(words, combined_corrections)
)

# Preview
print("‚úÖ Corrections reapplied after long-association filtering:")
display(df_filtered["cleaned_words"].head(3))


‚úÖ Corrections reapplied after long-association filtering:


0    [familia, tradicion, oportunidades, trabajar, ...
1    [patrimonio_familiar, hermanos, prosperidad, f...
2    [equipo, problemas, compromiso, apoyo, buena_o...
Name: cleaned_words, dtype: object

In [126]:
# Manual lemmatization dictionary: unify plural/synonym concepts
lemmatization_dict = dict(sorted({
    "abuelos": "abuelo",
    "abusos": "abuso",
    "actividades": "actividad",
    "actualidad": "actual",
    "administra": "administracion",
    "altos": "alto",
    "apoyarce": "apoyo",
    "apoyarse": "apoyo",
    "aprender": "aprender",
    "aprendizaje": "aprender",
    "armonioso": "armonia",
    "asignado": "asignacion",
    "colaboran": "colaboracion",
    "colaborar": "colaboracion",
    "colaboracion": "colaboracion",
    "conocimientos": "conocimiento",
    "conservadoras": "conservador",
    "conservadores": "conservador",
    "conflictos": "conflicto",
    "decisiones": "decision",
    "dirigir": "direccion",
    "disputas": "conflicto",
    "emocionales": "emocional",
    "emprendedoras": "emprendedor",
    "emprendedores": "emprendedor",
    "esfuerzos": "esfuerzo",
    "exitoso": "exito",
    "familia_nuclear": "nucleo_familiar",
    "familiares": "familia",
    "familias": "familia",
    "falta_apoyo": "falta_apoyo",
    "falta_apoyo_emprendimiento": "falta_apoyo",
    "falta_control": "falta_gestion",
    "falta_estructura": "falta_gestion",
    "falta_procesos": "falta_gestion",
    "ganancias": "ganancia",
    "generacional": "generacion",
    "generaciones": "generacion",
    "herencia": "heredado",
    "hermano_mayor": "figura_paterna",
    "ideas": "idea",
    "importancia": "importante",
    "informalidad": "informal",
    "ingreso_alto": "ingresos_altos",
    "inversiones": "inversion",
    "interes": "intereses",
    "jefes": "jefe",
    "jerarquia": "jerarquia",
    "jerarquico": "jerarquia",
    "jerarquicos": "jerarquia",
    "laborales": "laboral",
    "liderar": "liderazgo",
    "mejores": "mejor",
    "miembros": "miembro",
    "negocio_padre": "figura_paterna",
    "negocio_padre_hijo": "figura_paterna",
    "negocios": "negocio",
    "nivel_socioeconomico_alto": "ingresos_altos",
    "no formal": "informal",
    "objetivos": "objetivo",
    "obligaciones": "obligacion",
    "oportunidades": "oportunidad",
    "padre": "figura_paterna",
    "papa_mayor": "figura_paterna",
    "paternalismo": "figura_paterna",
    "paternalista": "paternalismo",
    "peleas": "conflicto",
    "pequena": "pequeno",
    "personales": "personal",
    "personas_confianza": "confianza_personal",
    "potenciales": "potencial",
    "problemas": "problema",
    "propias": "propio",
    "propios": "propio",
    "puestos": "puesto",
    "relacion_padre_hijo": "figura_paterna",
    "relaciones": "relacion",
    "responsabilidades": "responsabilidad",
    "retos": "reto",
    "rentas": "renta",
    "sociedades": "sociedad",
    "soluciones": "solucion",
    "sueldo_alto": "ingreso_alto",
    "tradicional": "tradicion",
    "tradiciones": "tradicion",
    "utilidades": "utilidad",
    "valores": "valor"
}.items()))



# Apply the lemmatization
def apply_lemmatization(word_list, lemma_dict):
    return [lemma_dict.get(word, word) for word in word_list]

# Show before/after for the first few rows
print("Sample before and after lemmatization:")
for i in range(min(3, len(df_filtered))):
    original = df_filtered.iloc[i]["cleaned_words"]
    lemmatized = apply_lemmatization(original, lemmatization_dict)
    print(f"\nOriginal:   {original}")
    print(f"Lemmatized: {lemmatized}")

# Apply to full column
df_filtered["cleaned_words"] = df_filtered["cleaned_words"].apply(
    lambda words: apply_lemmatization(words, lemmatization_dict)
)

print("\nLemmatization applied to entire dataset.")


Sample before and after lemmatization:

Original:   ['familia', 'tradicion', 'oportunidades', 'trabajar', 'independencia']
Lemmatized: ['familia', 'tradicion', 'oportunidad', 'trabajar', 'independencia']

Original:   ['patrimonio_familiar', 'hermanos', 'prosperidad', 'familiar', 'conflicto', 'intereses', 'inconformidad']
Lemmatized: ['patrimonio_familiar', 'hermanos', 'prosperidad', 'familiar', 'conflicto', 'intereses', 'inconformidad']

Original:   ['equipo', 'problemas', 'compromiso', 'apoyo', 'buena_organizacion']
Lemmatized: ['equipo', 'problema', 'compromiso', 'apoyo', 'buena_organizacion']

Lemmatization applied to entire dataset.


## Building Co-ocurrence Matrix

In [127]:
# Co-ocurrence matrix

from itertools import combinations
from collections import Counter

# Create unordered co-occurrence pairs per respondent
pair_counter = Counter()

for word_list in df_filtered["cleaned_words"]:
    unique_words = sorted(set(word_list))  # Avoid duplicates within a respondent
    if len(unique_words) >= 2:
        for pair in combinations(unique_words, 2):  # Generate all 2-combinations
            pair_counter[pair] += 1

# Convert counter to DataFrame for easy manipulation
edges_df = pd.DataFrame([
    {"source": w1, "target": w2, "weight": count}
    for (w1, w2), count in pair_counter.items()
])

# Preview result
print(f"Co-occurrence matrix created with {len(edges_df)} edges.")
display(edges_df.sort_values(by="weight", ascending=False).head(10))



Co-occurrence matrix created with 6585 edges.


Unnamed: 0,source,target,weight
1845,familia,negocio,17
268,familia,miembro,9
1432,familia,familiar,9
586,apoyo,trabajo,8
411,confianza,union,8
1901,confianza,lealtad,8
3,familia,tradicion,8
213,legado,tradicion,8
191,dinero,familia,8
351,empresa,familia,7


In [128]:
#Create file ready to be exported to Gephi. This file excludes words that appear less than 3 times

# Minimum frequency
min_freq = 3

# Get word frequencies
all_words = [word for words in df_filtered["cleaned_words"] for word in words]
word_freq = Counter(all_words)

# Keep only words that appear at least min_freq times
valid_words = {word for word, freq in word_freq.items() if freq >= min_freq}

# Build co-occurrence pairs only with valid words
pair_counter = Counter()
for word_list in df_filtered["cleaned_words"]:
    unique_words = sorted(set(w for w in word_list if w in valid_words))
    if len(unique_words) >= 2:
        for pair in combinations(unique_words, 2):
            pair_counter[pair] += 1

# Convert to DataFrame and save to CSV
edges_excluded_df = pd.DataFrame([
    {"source": w1, "target": w2, "weight": count}
    for (w1, w2), count in pair_counter.items()
])

edges_excluded_df.to_csv("2excludededge.csv", index=False)
print(f"Saved: 2excludededge.csv with {len(edges_excluded_df)} edges (min {min_freq} appearances).")


Saved: 2excludededge.csv with 1615 edges (min 3 appearances).


## General demographics

In [129]:
# General demographics

import pandas as pd

# Column names for demographic data 
GENDER_COL        = "GI02"    
AGE_COL           = "GI01"    
EDU_COL           = "GI04"  
OWN_BUS_COL       = "GI05"
WORK_BUS_COL      = "GI06"    


demo = df_filtered.copy()

# Normalize text 
for col in [GENDER_COL, AGE_COL, EDU_COL, OWN_BUS_COL, WORK_BUS_COL]:
    demo[col] = demo[col].astype(str).str.lower().str.strip()

total = len(demo)
print(f"Total respondents after initial cleaning: {total}\n")

def summarize(col, label=None):
    """Generate summary statistics for a given column."""
    label = label or col
    counts = demo[col].value_counts(dropna=False)
    perc = (counts / total * 100).round(1)
    summary = pd.DataFrame({"count": counts, "percent (%)": perc})
    print(f"‚Äî {label} ‚Äî")
    display(summary)
    print()

# Summaries for each demographic category
summarize(GENDER_COL,   label="Gender")
summarize(AGE_COL,      label="Age group")
summarize(EDU_COL,      label="Education level")
summarize(OWN_BUS_COL,  label="Own family business?")
summarize(WORK_BUS_COL, label="Work in family business?")

Total respondents after initial cleaning: 239

‚Äî Gender ‚Äî


Unnamed: 0,count,percent (%)
femenino,159,66.5
masculino,79,33.1
otro (por favor especifica):,1,0.4



‚Äî Age group ‚Äî


Unnamed: 0,count,percent (%)
25‚Äì34 a√±os,105,43.9
55‚Äì64 a√±os,38,15.9
18‚Äì24 a√±os,33,13.8
35‚Äì44 a√±os,32,13.4
45‚Äì54 a√±os,20,8.4
65 a√±os o m√°s,6,2.5
13‚Äì17 a√±os,5,2.1



‚Äî Education level ‚Äî


Unnamed: 0,count,percent (%)
licenciatura o ingenier√≠a,128,53.6
maestr√≠a o posgrado,55,23.0
bachillerato o preparatoria,32,13.4
doctorado,16,6.7
secundaria,5,2.1
otro (por favor especifica):,2,0.8
prefiero no decirlo,1,0.4



‚Äî Own family business? ‚Äî


Unnamed: 0,count,percent (%)
si,170,71.1
no,62,25.9
no estoy seguro/a,7,2.9



‚Äî Work in family business? ‚Äî


Unnamed: 0,count,percent (%)
si,149,62.3
no,89,37.2
no estoy seguro/a,1,0.4





## Word frequency results in general and by group

In [130]:
# Word frequency results

from collections import Counter

# Flatten all word lists into one
all_words = [word for words in df_filtered["cleaned_words"] for word in words]

# Count occurrences
word_freq = Counter(all_words)

# Display top 10
top_words = word_freq.most_common(10)
print("Top 10 most frequently mentioned words:")
for word, freq in top_words:
    print(f"‚Ä¢ {word}: {freq} mentions")



Top 10 most frequently mentioned words:
‚Ä¢ familia: 83 mentions
‚Ä¢ negocio: 39 mentions
‚Ä¢ trabajo: 33 mentions
‚Ä¢ confianza: 32 mentions
‚Ä¢ tradicion: 31 mentions
‚Ä¢ apoyo: 31 mentions
‚Ä¢ compromiso: 30 mentions
‚Ä¢ dinero: 29 mentions
‚Ä¢ familiar: 27 mentions
‚Ä¢ heredado: 27 mentions


In [131]:
# Top 10 words for owners vs. non-owners 


df_filtered["GI05"] = df_filtered["GI05"].astype(str).str.lower().str.strip()


owners = df_filtered[df_filtered["GI05"] == "si"]
non_owners = df_filtered[df_filtered["GI05"] == "no"]


owners_words = [word for words in owners["cleaned_words"] for word in words]
non_owners_words = [word for words in non_owners["cleaned_words"] for word in words]

owners_freq = Counter(owners_words).most_common(10)
non_owners_freq = Counter(non_owners_words).most_common(10)

print("Top 10 words used by family business owners (GI05 = 'si'):")
for word, freq in owners_freq:
    print(f"‚Ä¢ {word}: {freq} mentions")

print("\nTop 10 words used by non-owners (GI05 = 'no'):")
for word, freq in non_owners_freq:
    print(f"‚Ä¢ {word}: {freq} mentions")




Top 10 words used by family business owners (GI05 = 'si'):
‚Ä¢ familia: 56 mentions
‚Ä¢ negocio: 27 mentions
‚Ä¢ trabajo: 26 mentions
‚Ä¢ apoyo: 25 mentions
‚Ä¢ compromiso: 23 mentions
‚Ä¢ confianza: 22 mentions
‚Ä¢ familiar: 20 mentions
‚Ä¢ conflicto: 20 mentions
‚Ä¢ tradicion: 19 mentions
‚Ä¢ crecimiento: 19 mentions

Top 10 words used by non-owners (GI05 = 'no'):
‚Ä¢ familia: 24 mentions
‚Ä¢ esfuerzo: 12 mentions
‚Ä¢ tradicion: 11 mentions
‚Ä¢ pequeno: 10 mentions
‚Ä¢ dinero: 9 mentions
‚Ä¢ negocio: 9 mentions
‚Ä¢ confianza: 8 mentions
‚Ä¢ responsabilidad: 7 mentions
‚Ä¢ heredado: 6 mentions
‚Ä¢ union: 6 mentions


In [132]:
# Top 10 words by ownership/work status combinations

from collections import Counter

df_filtered["GI05"] = df_filtered["GI05"].astype(str).str.lower().str.strip()
df_filtered["GI06"] = df_filtered["GI06"].astype(str).str.lower().str.strip()

def top_words(mask, label):
    words = [w for words in df_filtered.loc[mask, "cleaned_words"] for w in words]
    freq = Counter(words).most_common(10)
    print(f"Top 10 words for {label}:")
    for word, count in freq:
        print(f"  ‚Ä¢ {word}: {count}")
    print()

owners       = df_filtered["GI05"] == "si"
non_owners   = df_filtered["GI05"] == "no"
workers      = df_filtered["GI06"] == "si"
non_workers  = df_filtered["GI06"] == "no"
own_work     = owners & workers
nown_work    = non_owners & workers

# Summary
top_words(owners,      "owners (GI05 = 'si')")
top_words(non_owners,  "non-owners (GI05 = 'no')")
top_words(workers,     "workers in family business (GI06 = 'si')")
top_words(non_workers, "non-workers (GI06 = 'no')")
top_words(own_work,    "owners & workers (GI05 = 'si', GI06 = 'si')")
top_words(nown_work,   "non-owners & workers (GI05 = 'no', GI06 = 'si')")


Top 10 words for owners (GI05 = 'si'):
  ‚Ä¢ familia: 56
  ‚Ä¢ negocio: 27
  ‚Ä¢ trabajo: 26
  ‚Ä¢ apoyo: 25
  ‚Ä¢ compromiso: 23
  ‚Ä¢ confianza: 22
  ‚Ä¢ familiar: 20
  ‚Ä¢ conflicto: 20
  ‚Ä¢ tradicion: 19
  ‚Ä¢ crecimiento: 19

Top 10 words for non-owners (GI05 = 'no'):
  ‚Ä¢ familia: 24
  ‚Ä¢ esfuerzo: 12
  ‚Ä¢ tradicion: 11
  ‚Ä¢ pequeno: 10
  ‚Ä¢ dinero: 9
  ‚Ä¢ negocio: 9
  ‚Ä¢ confianza: 8
  ‚Ä¢ responsabilidad: 7
  ‚Ä¢ heredado: 6
  ‚Ä¢ union: 6

Top 10 words for workers in family business (GI06 = 'si'):
  ‚Ä¢ familia: 52
  ‚Ä¢ negocio: 22
  ‚Ä¢ trabajo: 22
  ‚Ä¢ confianza: 20
  ‚Ä¢ apoyo: 20
  ‚Ä¢ crecimiento: 19
  ‚Ä¢ familiar: 18
  ‚Ä¢ compromiso: 18
  ‚Ä¢ dinero: 16
  ‚Ä¢ responsabilidad: 16

Top 10 words for non-workers (GI06 = 'no'):
  ‚Ä¢ familia: 30
  ‚Ä¢ negocio: 17
  ‚Ä¢ tradicion: 16
  ‚Ä¢ heredado: 15
  ‚Ä¢ dinero: 13
  ‚Ä¢ compromiso: 12
  ‚Ä¢ confianza: 12
  ‚Ä¢ conflicto: 11
  ‚Ä¢ apoyo: 11
  ‚Ä¢ trabajo: 11

Top 10 words for owners & workers (GI05 = 'si', GI06

In [133]:
# Top 5 words by Gender and by Age Group

import pandas as pd
from collections import Counter

df = df_filtered.copy()

df["GI02"] = df["GI02"].astype(str).str.lower().str.strip()  # Gender
df["GI01"] = df["GI01"].astype(str).str.lower().str.strip()  # Age group

def top_n_words(df, group_col, n=5):
    """
    Returns a DataFrame of the top n words for each category in group_col.
    """
    records = []
    for cat in df[group_col].unique():
        # Flatten all cleaned_words lists for this category
        words = [w for words_list in df[df[group_col] == cat]["cleaned_words"] for w in words_list]
        for word, cnt in Counter(words).most_common(n):
            records.append({group_col: cat, "word": word, "count": cnt})
    return pd.DataFrame(records)

# Compute top 5 by Gender
top_by_gender = top_n_words(df, "GI02", n=5).rename(columns={"GI02": "Gender"})

# Compute top 5 by Age Group
top_by_age = top_n_words(df, "GI01", n=5).rename(columns={"GI01": "Age_Group"})

# Display results
print("Top 5 Words by Gender")
display(top_by_gender)

print("\nTop 5 Words by Age Group")
display(top_by_age)


Top 5 Words by Gender


Unnamed: 0,Gender,word,count
0,femenino,familia,50
1,femenino,compromiso,26
2,femenino,confianza,24
3,femenino,negocio,24
4,femenino,trabajo,21
5,masculino,familia,32
6,masculino,negocio,15
7,masculino,familiar,12
8,masculino,emprendimiento,12
9,masculino,trabajo,12



Top 5 Words by Age Group


Unnamed: 0,Age_Group,word,count
0,25‚Äì34 a√±os,familia,30
1,25‚Äì34 a√±os,negocio,17
2,25‚Äì34 a√±os,heredado,17
3,25‚Äì34 a√±os,tradicion,14
4,25‚Äì34 a√±os,crecimiento,14
5,55‚Äì64 a√±os,familia,17
6,55‚Äì64 a√±os,trabajo,6
7,55‚Äì64 a√±os,confianza,6
8,55‚Äì64 a√±os,apoyo,6
9,55‚Äì64 a√±os,compromiso,6
