Limpieza de Datos

In [10]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## 1. Carga del dataset

In [11]:
raw = pd.read_stata('Mexico-2023-full-data.dta', convert_categoricals=False)

## 2. Selección de variables

In [12]:
var_map = {
    'd2'               : 'sales',
    'l1'               : 'employees',
    'b7'               : 'age',
    'h1'               : 'training',
    'e1'               : 'exporter',
    'k7'               : 'financing',
    'competition_select': 'competition',
    'n2k'              : 'labor_cost',
}

df = raw[list(var_map.keys())].rename(columns=var_map).copy()

## 3. Limpieza de datos

In [13]:

df.replace(-9, np.nan, inplace=True)

df['training']  = df['training'].map({1: 1, 2: 0})
df['exporter']  = df['exporter'].map({1: 1, 2: 0, 3: 0}) 
df['financing'] = df['financing'].map({1: 1, 2: 0})

df.dropna(inplace=True)

df = df[(df['sales'] > 0) & (df['employees'] > 0) & (df['age'] > 0)].copy()
df['labor_prod'] = df['sales'] / df['employees']

sales_mean = df['sales'].mean()
df['successful'] = (df['sales'] > sales_mean).astype(int)


## 5. Dataset final documentado

In [14]:
print('=== Estadísticos descriptivos ===')
df.describe().round(2)

=== Estadísticos descriptivos ===


Unnamed: 0,sales,employees,age,training,exporter,financing,competition,labor_cost,labor_prod,successful
count,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0,1275.0
mean,89683070.0,83.48,10.9,0.16,0.56,0.13,0.52,279083.81,1007165.92,0.3
std,138362300.0,107.74,7.41,0.37,0.5,0.34,0.5,588878.53,629997.16,0.46
min,380000.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,76000.0,0.0
25%,9000000.0,12.0,5.0,0.0,0.0,0.0,0.0,25200.0,605767.46,0.0
50%,28000000.0,30.0,10.0,0.0,1.0,0.0,1.0,70000.0,869565.22,0.0
75%,123200000.0,132.0,15.0,0.0,1.0,0.0,1.0,310000.0,1250000.0,1.0
max,1400000000.0,1680.0,45.0,1.0,1.0,1.0,1.0,12000000.0,5111111.11,1.0


In [15]:
print('=== Tipos de datos ===')
print(df.dtypes)
print(f'\nNulos restantes: {df.isnull().sum().sum()}')

=== Tipos de datos ===
sales            int32
employees        int16
age            float64
training       float64
exporter         int64
financing      float64
competition       int8
labor_cost     float64
labor_prod     float64
successful       int64
dtype: object

Nulos restantes: 0


In [16]:
# Guardar dataset limpio para uso en los demás notebooks
df.to_csv('mexico_wbes_clean.csv', index=False)
print('Dataset guardado como mexico_wbes_clean.csv')
df.head()

Dataset guardado como mexico_wbes_clean.csv


Unnamed: 0,sales,employees,age,training,exporter,financing,competition,labor_cost,labor_prod,successful
0,30700000,53,3.0,0.0,0,0.0,0,320000.0,579245.3,0
1,29000000,22,1.0,0.0,1,0.0,0,33500.0,1318182.0,0
2,43000000,107,12.0,0.0,0,0.0,1,120000.0,401869.2,0
3,135000000,270,25.0,0.0,0,0.0,0,711000.0,500000.0,1
4,6000000,15,5.0,0.0,1,0.0,1,8000.0,400000.0,0
