In [2]:
import numpy as np
import pandas as pd

In [10]:
# Load the RAIS 2020 employment records (one row per municipality/class pair
# is not assumed: duplicates are summed by the pivot below).
df = pd.read_csv('Data/cnae/2020/RAIS_vinculos_2020.csv')

# Municipality x class matrix of summed workers. The pivoted frame is kept so
# that its own index/columns can label the result: pivot_table sorts its keys,
# whereas Series.unique() returns values in order of first appearance, so
# labelling the matrix with .unique() can attach labels to the wrong rows/columns.
workers_pivot = df.pivot_table(index='Municipality ID',
                               columns='Class ID',
                               values='Workers',
                               aggfunc='sum',  # Ensure summing workers
                               fill_value=0)
m = workers_pivot.values

# Totals used by the ratio below.
X = np.sum(m)                            # Grand total over the whole matrix
X_c = np.sum(m, axis=1, keepdims=True)   # Per-row totals (one per municipality), shape (rows, 1)
X_p = np.sum(m, axis=0, keepdims=True)   # Per-column totals (one per class), shape (1, cols)

# R[c, p] = (m[c, p] * X) / (X_c[c] * X_p[p]), computed by broadcasting.
# Cells where the denominator is zero yield inf/nan and are reset to 0.
with np.errstate(divide='ignore', invalid='ignore'):
    R = (m * X) / (X_c * X_p)
    R[~np.isfinite(R)] = 0  # Replace inf/nan with 0

# Labels taken from the pivot itself so they are aligned with R.
# Columns were pivoted on 'Class ID'; map each ID to its 'Class' name
# (first name seen for an ID wins if the file lists more than one).
class_names = (df[['Class ID', 'Class']]
               .drop_duplicates(subset='Class ID')
               .set_index('Class ID')['Class'])
municipalities = workers_pivot.index.to_numpy()
classes = class_names.reindex(workers_pivot.columns).to_numpy()

# Convert back to DataFrame: rows = municipality IDs, columns = class names.
Rcp_df = pd.DataFrame(R,
                      index=municipalities,
                      columns=classes)

In [30]:
# Preview the RCA table to verify its structure.
# NOTE(review): this rebinds `df`, which previously held the raw CSV rows.
df = Rcp_df
print("RCA data:", df.head(), sep="\n")
print(f"Shape: {df.shape}")
print(f"Locality IDs: {df.index[:5].tolist()}...")
print(f"Activity IDs: {df.columns[:5].tolist()}...")

# Binarise: 1 where RCA >= 1, otherwise 0.
binary_matrix = df.ge(1.0).astype(int)

print("\nBinary matrix:", binary_matrix.head(), sep="\n")
print(f"Binary matrix shape: {binary_matrix.shape}")

# Summary counts; the number of ones is computed once and reused.
total_entries = binary_matrix.size
ones_count = binary_matrix.sum().sum()

print("\nStatistics:")
print(f"Total entries: {total_entries}")
print(f"Ones (RCA >= 1): {ones_count}")
print(f"Zeros (RCA < 1): {total_entries - ones_count}")
print(f"Percentage with RCA >= 1: {ones_count/total_entries*100:.2f}%")

# Diversity = row sums of the binary matrix (activities per location);
# ubiquity = column sums (locations per activity).
diversity = binary_matrix.sum(axis=1)
ubiquity = binary_matrix.sum(axis=0)

print("\nDiversity (activities per location):", diversity.head(), sep="\n")

print("\nUbiquity (locations per activity):", ubiquity.head(), sep="\n")



RCA data:
         Cultivo de cereais  \
1100015            0.175294   
1100023            0.660811   
1100031            3.834745   
1100049            0.124683   
1100056            2.565574   

         Cultivo de algodão herbáceo e de outras fibras de lavoura temporária  \
1100015                                           0.000000                      
1100023                                           0.650133                      
1100031                                           0.000000                      
1100049                                           0.000000                      
1100056                                           0.000000                      

         Cultivo de cana-de-açúcar  Cultivo de fumo  Cultivo de soja  \
1100015                        0.0              0.0         0.741554   
1100023                        0.0              0.0         0.381079   
1100031                        0.0              0.0        38.062616   
1100049                     

In [39]:
# Sum of the binary row for one locality, i.e. how many activities are
# flagged 1 for it.
# NOTE(review): 3550308 is presumably a municipality ID from the index — confirm which one.
target_municipality = 3550308
binary_matrix.loc[target_municipality].sum()

43