In [1]:
import pandas as pd 


# Load the processed sanitary-registry medicines table from CSV and
# preview its first rows.
df = pd.read_csv('../data/processed/sanitary_registry_medicines.csv')
print(df.head(n=5))
def filter_outliers(frequencies: pd.Series, threshold: float = 1.5) -> pd.Series:
    """Return the entries of ``frequencies`` that are IQR outliers.

    Despite the name, the outliers are what is *kept*: the result contains
    only the values lying strictly outside the interval
    ``[Q1 - threshold * IQR, Q3 + threshold * IQR]``, where ``Q1`` and ``Q3``
    are the 25th and 75th percentiles of ``frequencies`` and
    ``IQR = Q3 - Q1``. Both unusually low and unusually high values are
    returned.

    Parameters
    ----------
    frequencies : pd.Series
        Numeric values to screen (e.g. the result of ``value_counts()``).
    threshold : float, default 1.5
        Multiplier applied to the IQR to position the lower and upper fences.

    Returns
    -------
    pd.Series
        The subset of ``frequencies`` outside the fences, with the original
        index labels and ordering preserved.
    """
    q1 = frequencies.quantile(0.25)
    q3 = frequencies.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - threshold * iqr
    upper_fence = q3 + threshold * iqr

    # Strict comparisons: a value sitting exactly on a fence is not an outlier.
    is_outlier = (frequencies < lower_fence) | (frequencies > upper_fence)
    return frequencies[is_outlier]

     product_name                                   manufacturer  \
0     cardilat 50           laboratorios de aplicaciones medicas   
1  alicon complex                       laboratorios ameripharma   
2         trental          sanofi-aventis de mexico s.a. de c.v.   
3    tensolisin-d                             acromax dominicana   
4        biovagin  gefarca industria  farmaceutica perezespinosa   

  pharmaceutical_form expiration_date  register_year  
0          COMPRIMIDO      2025-06-28           2000  
1              JARABE      2025-11-08           2000  
2            SOLUCION      2020-05-23           2000  
3             TABLETA      2025-12-18           2000  
4               CREMA      2025-12-18           2000  


In [2]:
# Count how many registry rows use each pharmaceutical form.
frequencies = df['pharmaceutical_form'].value_counts()

# Retain the forms whose counts fall outside the IQR fences (threshold 1.5);
# these are the forms treated as relevant below.
pharmaceutical_forms = filter_outliers(frequencies, threshold=1.5)

# Restrict the dataframe to rows whose form is one of the retained forms.
df = df.loc[df['pharmaceutical_form'].isin(pharmaceutical_forms.index)]

# Show summary statistics of the retained form counts under a dashed rule.
print('-'*50, pharmaceutical_forms.describe(), sep='\n')


--------------------------------------------------
count      12.000000
mean     1099.083333
std      1132.556678
min        77.000000
25%       328.000000
50%       687.000000
75%      1318.000000
max      3520.000000
Name: count, dtype: float64


In [3]:
# Count how many of the remaining registry rows belong to each manufacturer.
frequencies = df['manufacturer'].value_counts()

# Retain the manufacturers whose counts fall outside the IQR fences
# (default threshold); these are the manufacturers treated as relevant.
manufacturers = filter_outliers(frequencies)

# Restrict the dataframe to rows from the retained manufacturers.
df = df.loc[df['manufacturer'].isin(manufacturers.index)]


In [4]:
# Report summary statistics for the filtered dataframe and for the two
# retained frequency tables; each section starts with a dashed rule and a title.
print('-'*50, 'Sanitary Registry Statistics', df.describe(), sep='\n')
print('-'*50, 'Pharmaceutical Forms', pharmaceutical_forms.describe(), sep='\n')
print('-'*50, 'Manufacturers', manufacturers.describe(), sep='\n')
print('-'*50)

# Write the filtered dataframe to CSV, omitting the row index.
df.to_csv('../data/final/sanitary_registry_medicines.csv', index=False)





--------------------------------------------------
Sanitary Registry Statistics
       register_year
count    9621.000000
mean     2012.128677
std         5.116773
min      2000.000000
25%      2009.000000
50%      2014.000000
75%      2016.000000
max      2019.000000
--------------------------------------------------
Pharmaceutical Forms
count      12.000000
mean     1099.083333
std      1132.556678
min        77.000000
25%       328.000000
50%       687.000000
75%      1318.000000
max      3520.000000
Name: count, dtype: float64
--------------------------------------------------
Manufacturers
count    248.000000
mean      38.794355
std       75.531116
min        9.000000
25%       11.000000
50%       17.000000
75%       28.000000
max      570.000000
Name: count, dtype: float64
--------------------------------------------------
