In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the raw RAIS employment table (one row per municipality/class record).
df = pd.read_csv('Data/cnae/2020/RAIS_vinculos_2020.csv')

# Municipality x activity matrix of worker counts. Keep the pivot as a
# DataFrame: pivot_table sorts its row and column labels, so the labels must
# be read back from the pivot itself rather than rebuilt from the raw table.
pivot = df.pivot_table(index='Municipality ID',
                       columns='Class ID',
                       values='Workers',
                       aggfunc='sum',  # sum workers over duplicate (municipality, class) rows
                       fill_value=0)
m = pivot.values

# Totals
X = np.sum(m)                           # grand total
X_c = np.sum(m, axis=0, keepdims=True)  # per-activity total (summed over locations)
X_p = np.sum(m, axis=1, keepdims=True)  # per-location total (summed over activities)

# RCA[p, i] = (m[p, i] / X_p[p]) / (X_c[i] / X)
#           = (m[p, i] * X) / (X_p[p] * X_c[i])
with np.errstate(divide='ignore', invalid='ignore'):
    R = (m * X) / (X_p * X_c)
    # Zero totals give 0/0 or x/0; treat those cells as "no advantage".
    R[~np.isfinite(R)] = 0

# Row labels: taken from the pivot so they line up with the rows of R.
# (df['Municipality ID'].unique() is in first-appearance order, which only
# matches the pivot's sorted order if the CSV happens to be sorted.)
municipalities = pivot.index.to_numpy()

# Column labels: translate each pivot column (a Class ID) to its class name,
# in pivot column order. df['Class'].unique() is neither keyed by Class ID nor
# guaranteed to be in the pivot's column order.
class_name_by_id = df.drop_duplicates('Class ID').set_index('Class ID')['Class']
classes = class_name_by_id.reindex(pivot.columns).to_numpy()

# Convert back to a labelled DataFrame
Rcp_df = pd.DataFrame(R,
                      index=municipalities,
                      columns=classes)

In [4]:
# Total workers recorded in the raw table for municipality 3550308.
df.loc[df['Municipality ID'] == 3550308, 'Workers'].sum()

np.int64(6919484)

In [42]:
# Total workers recorded in the raw table for municipality 2922656.
df.loc[df['Municipality ID'] == 2922656, 'Workers'].sum()

np.int64(898)

In [41]:
# Sum of RCA values across all activities for municipality 2922656.
Rcp_df.loc[2922656, :].sum()


np.float64(19454.968263624134)

In [38]:
# Sum of RCA values across all activities for municipality 3550308.
Rcp_df.loc[3550308, :].sum()

np.float64(654.6754603000617)

In [40]:
# Per-municipality sum of RCA values, computed once and reused below.
rca_row_sums = Rcp_df.sum(axis=1)
max_municipality_id = rca_row_sums.idxmax()  # label of the row with the largest sum
max_sum_value = rca_row_sums.max()           # the largest sum itself

print(f"Municipality ID: {max_municipality_id}, Sum: {max_sum_value:.4f}")

Municipality ID: 2922656, Sum: 19454.9683


In [5]:
# Display the first few rows to verify structure.
# Deliberately do NOT rebind `df` to the RCA frame here: `df` is the raw RAIS
# table that other cells still filter by 'Municipality ID' / 'Workers', and
# overwriting it makes those cells fail depending on execution order.
print("RCA data:")
print(Rcp_df.head())
print(f"Shape: {Rcp_df.shape}")
print(f"Locality IDs: {Rcp_df.index.tolist()[:5]}...")
print(f"Activity IDs: {Rcp_df.columns.tolist()[:5]}...")

# Create binary matrix (1 if RCA >= 1, else 0)
binary_matrix = (Rcp_df >= 1.0).astype(int)

print("\nBinary matrix:")
print(binary_matrix.head())
print(f"Binary matrix shape: {binary_matrix.shape}")

# Check some statistics (count the ones once and reuse it)
total_entries = binary_matrix.size
ones_count = binary_matrix.sum().sum()
print("\nStatistics:")
print(f"Total entries: {total_entries}")
print(f"Ones (RCA >= 1): {ones_count}")
print(f"Zeros (RCA < 1): {total_entries - ones_count}")
print(f"Percentage with RCA >= 1: {ones_count/total_entries*100:.2f}%")

# Calculate diversity and ubiquity (key metrics for economic complexity)
diversity = binary_matrix.sum(axis=1)  # Number of activities per location
ubiquity = binary_matrix.sum(axis=0)   # Number of locations per activity

print("\nDiversity (activities per location):")
print(diversity.head())

print("\nUbiquity (locations per activity):")
print(ubiquity.head())



RCA data:
         Cultivo de cereais  Cultivo de soja  Cultivo de café  \
1100015            0.183781         0.000000              0.0   
1100023            0.552860         0.841929              0.0   
1100031            4.189756         0.000000              0.0   
1100049            0.125720         0.000000              0.0   
1100056            2.489547         0.000000              0.0   

         Criação de bovinos  \
1100015                 0.0   
1100023                 0.0   
1100031                 0.0   
1100049                 0.0   
1100056                 0.0   

         Criação de animais não especificados anteriormente  \
1100015                                           1.536928    
1100023                                           0.482266    
1100031                                          39.487452    
1100049                                           0.000000    
1100056                                          18.341096    

         Atividades de apoio à ag

In [25]:
# Number of activities with RCA >= 1 for municipality 3550308.
binary_matrix.loc[3550308, :].sum()

np.int64(242)