In [87]:
# Data exploration

import pandas as pd

# Read data

df = pd.read_csv('../DataSources/HNP_Stats_CSV/HNP_StatsData.csv')

In [88]:
df.head(10)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Africa Eastern and Southern,AFE,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,140.180526,140.810248,141.664168,142.324951,143.456933,144.33115,...,107.697715,105.501833,103.222825,100.963526,99.491703,99.085572,97.413585,96.181442,,
1,Africa Eastern and Southern,AFE,Adults (ages 15+) and children (0-14 years) li...,SH.HIV.TOTL,,,,,,,...,,,,,,,,,,
2,Africa Eastern and Southern,AFE,Adults (ages 15+) and children (ages 0-14) new...,SH.HIV.INCD.TL,,,,,,,...,,,,,,,,,,
3,Africa Eastern and Southern,AFE,Adults (ages 15+) living with HIV,SH.DYN.AIDS,,,,,,,...,,,,,,,,,,
4,Africa Eastern and Southern,AFE,Adults (ages 15-49) newly infected with HIV,SH.HIV.INCD,,,,,,,...,,,,,,,,,,
5,Africa Eastern and Southern,AFE,"Age at first marriage, female",SP.DYN.SMAM.FE,,,,,,,...,,,,,,,,,,
6,Africa Eastern and Southern,AFE,"Age at first marriage, male",SP.DYN.SMAM.MA,,,,,,,...,,,,,,,,,,
7,Africa Eastern and Southern,AFE,Age dependency ratio (% of working-age populat...,SP.POP.DPND,89.594604,89.87337,90.191721,90.574678,90.952719,91.346821,...,85.499375,84.957551,84.439468,83.93098,83.342388,82.692059,81.968963,81.189988,80.386304,
8,Africa Eastern and Southern,AFE,"Age dependency ratio, old",SP.POP.DPND.OL,5.626944,5.598776,5.571718,5.549702,5.534236,5.526348,...,5.509918,5.544147,5.5846,5.631777,5.677017,5.718424,5.733111,5.706199,5.662143,
9,Africa Eastern and Southern,AFE,"Age dependency ratio, young",SP.POP.DPND.YG,83.668705,83.936992,84.243994,84.61314,84.994512,85.398509,...,78.874913,78.311969,77.798386,77.297024,76.690679,76.028564,75.310862,74.565149,73.801353,


In [89]:
# The data is organized by country, indicador and a series of years. The first 4 columns are metadata, the rest are the years 1960-2022. First we will keep only >2020 years.

df = df.drop(df.columns[4:64], axis=1)

df.head(10)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,2020,2021,2022,Unnamed: 67
0,Africa Eastern and Southern,AFE,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,97.413585,96.181442,,
1,Africa Eastern and Southern,AFE,Adults (ages 15+) and children (0-14 years) li...,SH.HIV.TOTL,,,,
2,Africa Eastern and Southern,AFE,Adults (ages 15+) and children (ages 0-14) new...,SH.HIV.INCD.TL,,,,
3,Africa Eastern and Southern,AFE,Adults (ages 15+) living with HIV,SH.DYN.AIDS,,,,
4,Africa Eastern and Southern,AFE,Adults (ages 15-49) newly infected with HIV,SH.HIV.INCD,,,,
5,Africa Eastern and Southern,AFE,"Age at first marriage, female",SP.DYN.SMAM.FE,,,,
6,Africa Eastern and Southern,AFE,"Age at first marriage, male",SP.DYN.SMAM.MA,,,,
7,Africa Eastern and Southern,AFE,Age dependency ratio (% of working-age populat...,SP.POP.DPND,81.968963,81.189988,80.386304,
8,Africa Eastern and Southern,AFE,"Age dependency ratio, old",SP.POP.DPND.OL,5.733111,5.706199,5.662143,
9,Africa Eastern and Southern,AFE,"Age dependency ratio, young",SP.POP.DPND.YG,75.310862,74.565149,73.801353,


In [90]:
# Now we will create 3 dataframes, one for each year (2020, 2021, 2022) with the first 4 columns and the corresponding year.

df_2020 = df[['Country Name','Country Code', 'Indicator Name', 'Indicator Code', '2020']]
df_2021 = df[['Country Name','Country Code', 'Indicator Name', 'Indicator Code', '2021']]
df_2022 = df[['Country Name','Country Code', 'Indicator Name', 'Indicator Code', '2022']]

# For each data frame, we will reorganize the data to join all the indicators of the same country as columns, and the values as rows. 

df_2020 = df_2020.pivot_table(index=['Country Name','Country Code'], columns='Indicator Name', values='2020').reset_index()

df_2021 = df_2021.pivot_table(index=['Country Name','Country Code'], columns='Indicator Name', values='2021').reset_index()

df_2022 = df_2022.pivot_table(index=['Country Name','Country Code'], columns='Indicator Name', values='2022').reset_index()



In [91]:
# List of indicators

(df_2020.columns).tolist()


['Country Name',
 'Country Code',
 'AIDS estimated deaths (UNAIDS estimates)',
 'ARI treatment (% of children under 5 taken to a health provider)',
 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Adults (ages 15+) and children (0-14 years) living with HIV',
 'Adults (ages 15+) and children (ages 0-14) newly infected with HIV',
 'Adults (ages 15+) living with HIV',
 'Adults (ages 15-49) newly infected with HIV',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, old',
 'Age dependency ratio, young',
 'Age population, age 00, female, interpolated',
 'Age population, age 00, male, interpolated',
 'Age population, age 01, female, interpolated',
 'Age population, age 01, male, interpolated',
 'Age population, age 02, female, interpolated',
 'Age population, age 02, male, interpolated',
 'Age population, age 03, female, interpolated',
 'Age population, age 03, male, interpolated',
 'Age population, age 04, female, interpolated',
 'Age populatio

In [92]:
# Now we will explore which indicators are more available for each year, drop all with >25% of missing values and keep only the ones with >75% of values.

# 2020
df_2020 = df_2020.dropna(thresh=0.75*len(df_2020), axis=1)

# List of Top missing indicators

(df_2020.isnull().sum()).sort_values(ascending=False).head(50)

# Drop all Age Population interpolated indicators (Columns that ends on ' interpolated')

df_2020 = df_2020.loc[:,~df_2020.columns.str.contains(' interpolated')]



In [94]:
# List of most problematic countries-> Country Name and Total missing values

(df_2020.isnull().sum(axis=1)).sort_values(ascending=False).head(50)
# Add a column with the total missing values, and the country name

df_2020['Total Missing Values'] = df_2020.isnull().sum(axis=1)

# Drop all rows with >50 missing values, list of countries with >50 missing values

dropped_countries = (df_2020[df_2020['Total Missing Values'] > 50])['Country Name'].tolist()

df_2020 = df_2020[df_2020['Total Missing Values'] < 50]

print(dropped_countries)

# List of countries kept

# Drop "Fragile and conflict affected situations" and "Not classified" countries
df_2020 = df_2020[df_2020['Country Name'] != 'Fragile and conflict affected situations']
df_2020 = df_2020[df_2020['Country Name'] != 'Not classified']

# Drop Taiwan using the country code
df_2020 = df_2020[df_2020['Country Code'] != 'TWN']


print((df_2020['Country Name']).tolist())


[]
['Afghanistan', 'Africa Eastern and Southern', 'Africa Western and Central', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Arab World', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas, The', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Caribbean small states', 'Central African Republic', 'Central Europe and the Baltics', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Early-demographic dividend', 'East Asia & Pacific', 'East Asia & Pacific (IDA & IBRD countries)', 'East Asia & Pacific (excluding high income)', 'Ecuador', 'Egypt, Arab Rep.', 'El Salvador', 'Equatorial Gui

In [131]:
import re
import urllib.request

# Descargar el texto de "Romeo and Juliet" desde el enlace proporcionado
url = "https://www.gutenberg.org/cache/epub/1513/pg1513.txt"
response = urllib.request.urlopen(url)
data = response.read().decode("utf-8")



# Definir una expresión regular para identificar las intervenciones de los personajes donde el personaje es una palabra en mayusculas seguida de un punto y aparte.
# La intervención es todo el texto entre el nombre del personaje y el siguiente personaje o el final del texto.
# En el ejemplo \r\n\r\nSAMPSON.\r\nGregory, on my word, we’ll not carry coals.\r\n\r\nGREGORY.\r\nNo, for then we should be colliers.\r\n\r\nSAMPSON.\r\nI mean, if we be in choler, we’ll draw.\r\n\r\nGREGORY.
# La expresión regular debe identificar: 
# \r\n\r\nSAMPSON.\r\n
# Gregory, on my word, we’ll not carry coals.\r\n\r\n
# GREGORY.\r\n
# No, for then we should be colliers.\r\n\r\n
# SAMPSON.\r\n
# I mean, if we be in choler, we’ll draw.\r\n\r\n
# GREGORY.

patron = r'(\r\n.+?\.\r\n)(?=(?:[A-Z][A-Z\s]+\.\r\n|$))'
intervenciones = re.findall(patron, data, re.DOTALL)

# Imprimir las intervenciones de personajes
for intervencion in intervenciones:
    print(intervencion.strip())

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Romeo and Juliet


Author: William Shakespeare

Release date: November 1, 1998 [eBook #1513]
                Most recently updated: June 27, 2023

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***



THE TRAGEDY OF ROMEO AND JULIET

by William Shakespeare




Contents

THE PROLOGUE.

ACT I
Scene I. A public place.
Scene II. A Street.
Scene III. Room in Capulet’s House.
Scene IV. A Street.
Scene V. A Hall in Capulet’s House.

ACT II
CHORUS.
Scene I. An open place adjoining Capulet’s Garden.
Scene II. Capulet’s G

In [132]:
patron = r'\r\n([A-Z\s]+\.\r\n.+?)(?=(?:[A-Z\s]+\.\r\n|$))'
intervenciones = re.findall(patron, data, re.DOTALL)

# Imprimir las intervenciones de personajes
for intervencion in intervenciones:
    print(intervencion)
    print("PLEX")


THE PROLOGUE.

ACT I
Scene I. A public place.
Scene II. A Street.
Scene III. Room in Capulet’s House.
Scene IV. A Street.
Scene V. A Hall in Capulet’s House.
PLEX

ACT II
CHORUS.
Scene I. An open place adjoining Capulet’s Garden.
Scene II. Capulet’s Garden.
Scene III. Friar Lawrence’s Cell.
Scene IV. A Street.
Scene V. Capulet’s Garden.
Scene VI. Friar Lawrence’s Cell.

ACT III
Scene I. A public Place.
Scene II. A Room in Capulet’s House.
Scene III. Friar Lawrence’s cell.
Scene IV. A Room in Capulet’s House.
Scene V. An open Gallery to Juliet’s Chamber, overlooking the Garden.

ACT IV
Scene I. Friar Lawrence’s Cell.
Scene II. Hall in Capulet’s House.
Scene III. Juliet’s Chamber.
Scene IV. Hall in Capulet’s House.
Scene V. Juliet’s Chamber; Juliet on the bed.

ACT V
Scene I. Mantua. A Street.
Scene II. Friar Lawrence’s Cell.
Scene III. A churchyard; in it a Monument belonging to the Capulets.




 Dramatis Personæ

ESCALUS, Prince of Verona.
MERCUTIO, kinsman to the Prince, and friend 

In [121]:
intervenciones

[]

In [128]:
print(data)

The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Romeo and Juliet


Author: William Shakespeare

Release date: November 1, 1998 [eBook #1513]
                Most recently updated: June 27, 2023

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***



THE TRAGEDY OF ROMEO AND JULIET

by William Shakespeare




Contents

THE PROLOGUE.

ACT I
Scene I. A public place.
Scene II. A Street.
Scene III. Room in Capulet’s House.
Scene IV. A Street.
Scene V. A Hall in Capulet’s House.

ACT II
CHORUS.
Scene I. An open pl

In [166]:
patron = r'(\r\n[A-Z\s]+\.\r\n.+?)(?=(?:\r\n[A-Z\s]+\.\r\n|$))'
intervenciones = re.findall(patron, data, re.DOTALL)

def limpiar_intervencion(intervencion):
    lineas = intervencion.split('\r\n')
    personaje = lineas[2].strip()
    lineas_limpias = [linea.strip() for linea in lineas[3:] if not (linea.startswith(' ') or '[' in linea or ']' in linea or re.match(r'^[A-Z\s]+\.$', linea) or not linea.strip())]
    if lineas_limpias and lineas_limpias[-1].endswith('?'):
        return "{}\r\n{}".format(personaje, '\r\n'.join(lineas_limpias))
    else:
        return ''

intervenciones_limpias = [limpiar_intervencion(intervencion) for intervencion in intervenciones]
intervenciones_limpias = [intervencion for intervencion in intervenciones_limpias if intervencion]

for intervencion in intervenciones_limpias:
    print(intervencion)


GREGORY.
The heads of the maids?
GREGORY.
How? Turn thy back and run?
ABRAM.
Do you bite your thumb at us, sir?
ABRAM.
Do you bite your thumb at us, sir?
SAMPSON.
Is the law of our side if I say ay?
GREGORY.
Do you quarrel, sir?
LADY CAPULET.
A crutch, a crutch! Why call you for a sword?
MONTAGUE.
Who set this ancient quarrel new abroach?
Speak, nephew, were you by when it began?
BENVOLIO.
My noble uncle, do you know the cause?
BENVOLIO.
Have you importun’d him by any means?
ROMEO.
Is the day so young?
ROMEO.
Ay me, sad hours seem long.
Was that my father that went hence so fast?
BENVOLIO.
It was. What sadness lengthens Romeo’s hours?
BENVOLIO.
In love?
BENVOLIO.
Of love?
ROMEO.
Alas that love, whose view is muffled still,
Should, without eyes, see pathways to his will!
Where shall we dine? O me! What fray was here?
Yet tell me not, for I have heard it all.
Here’s much to do with hate, but more with love:
Why, then, O brawling love! O loving hate!
O anything, of nothing first create!
O

In [170]:
import re
import urllib.request

# Descargar el texto de "Romeo and Juliet" desde el enlace proporcionado
url = "https://www.gutenberg.org/cache/epub/1513/pg1513.txt"
response = urllib.request.urlopen(url)
data = response.read().decode("utf-8")

patron = r'\r\n([A-Z\s]+\.)\r\n(.+?)(?=(?:\r\n[A-Z\s]+\.\r\n|$))'
intervenciones = re.findall(patron, data, re.DOTALL)

def limpiar_intervencion(intervencion):
    personaje, dialogo = intervencion
    lineas_limpias = re.findall(r'^(?![\s\[\]])[^\r\n]+(?=\r\n[A-Z\s]+\.\r\n|\Z)', dialogo, re.MULTILINE)
    
    if lineas_limpias and lineas_limpias[-1].endswith('?'):
        return "{}\r\n{}".format(personaje, '\r\n'.join(lineas_limpias))
    else:
        return ''


intervenciones_limpias = [limpiar_intervencion(intervencion) for intervencion in intervenciones]
intervenciones_limpias = [intervencion for intervencion in intervenciones_limpias if intervencion]

for intervencion in intervenciones_limpias:
    print(intervencion)

print(intervenciones_limpias)
    


GREGORY.
The heads of the maids?

GREGORY.
How? Turn thy back and run?

ABRAM.
Do you bite your thumb at us, sir?

ABRAM.
Do you bite your thumb at us, sir?

SAMPSON.
Is the law of our side if I say ay?

GREGORY.
Do you quarrel, sir?

LADY CAPULET.
A crutch, a crutch! Why call you for a sword?

MONTAGUE.
Speak, nephew, were you by when it began?

BENVOLIO.
My noble uncle, do you know the cause?

BENVOLIO.
Have you importun’d him by any means?

ROMEO.
Is the day so young?

ROMEO.
Was that my father that went hence so fast?

BENVOLIO.
It was. What sadness lengthens Romeo’s hours?

BENVOLIO.
In love?

BENVOLIO.
Of love?

ROMEO.
Dost thou not laugh?

ROMEO.
Good heart, at what?

BENVOLIO.
Tell me in sadness who is that you love?

ROMEO.
What, shall I groan and tell thee?

BENVOLIO.
Then she hath sworn that she will still live chaste?

PARIS.
But now my lord, what say you to my suit?

BENVOLIO.
For what, I pray thee?

BENVOLIO.
Why, Romeo, art thou mad?

SERVANT.
God gi’ go-den. I pray, si

In [172]:
conteo_personajes = {}

# Expresión regular para extraer el nombre del personaje
patron_nombre = r'\r\n([A-Z\s]+)\.'

for intervencion in intervenciones_limpias:
    # Usar expresión regular para extraer el nombre del personaje
    match = re.search(patron_nombre, intervencion)
    
    if match:
        nombre_personaje = match.group(1).strip()
        
        # Si el personaje ya está en el diccionario, aumenta su contador
        if nombre_personaje in conteo_personajes:
            conteo_personajes[nombre_personaje] += 1
        else:
            # Si el personaje no está en el diccionario, agrégalo con un contador de 1
            conteo_personajes[nombre_personaje] = 1

# El diccionario conteo_personajes ahora contiene el recuento de intervenciones por personaje
print(conteo_personajes)
print(interveciones_limpias.length())

{'GREGORY': 3, 'ABRAM': 2, 'SAMPSON': 1, 'LADY CAPULET': 8, 'MONTAGUE': 3, 'BENVOLIO': 14, 'ROMEO': 27, 'PARIS': 3, 'SERVANT': 2, 'JULIET': 28, 'NURSE': 17, 'MERCUTIO': 9, 'CAPULET': 11, 'FRIAR LAWRENCE': 8, 'TYBALT': 1, 'FIRST CITIZEN': 1, 'PRINCE': 6, 'FIRST MUSICIAN': 2, 'PETER': 5, 'APOTHECARY': 1}


NameError: name 'interveciones_limpias' is not defined

In [110]:
data



In [136]:
print(data)

The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: Romeo and Juliet


Author: William Shakespeare

Release date: November 1, 1998 [eBook #1513]
                Most recently updated: June 27, 2023

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***



THE TRAGEDY OF ROMEO AND JULIET

by William Shakespeare




Contents

THE PROLOGUE.

ACT I
Scene I. A public place.
Scene II. A Street.
Scene III. Room in Capulet’s House.
Scene IV. A Street.
Scene V. A Hall in Capulet’s House.

ACT II
CHORUS.
Scene I. An open pl