In [1]:
#pip install selenium

## Importando as bibliotecas

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

##  Configurando o Selenium para usar o Chrome como navegador

In [3]:
service = Service()

options = webdriver.ChromeOptions()

driver = webdriver.Chrome(service=service, options=options)

## Definindo a URL que irá ser acessada

In [4]:
url = 'https://www.mortalkombatwarehouse.com/mk12/dialogue/'

driver.get(url)

## Buscando elementos por tag

In [5]:
elementos = driver.find_elements(By.TAG_NAME, 'p')

Alguns elementos desnecessários são pegos 

In [6]:
len(elementos)

3802

In [7]:
driver.find_elements(By.TAG_NAME, 'p')[0].text

''

Somente a partir do terceiro é necessário

In [8]:
driver.find_elements(By.TAG_NAME, 'p')[2].text

'Ashrah: What hell spawned you?\nAshrah: You will never find out.'

##  Armazenando os textos extraídos dos elementos da tag p

In [9]:
lista_dialogos = []
for paragrafo in elementos[2:]:
        dialogo = paragrafo.text
        lista_dialogos.append(dialogo)

In [10]:
lista_dialogos

['Ashrah: What hell spawned you?\nAshrah: You will never find out.',
 'Ashrah: You are all that remains of my sins.\nAshrah: The evil within you cannot be destroyed.',
 'Ashrah: Quan Chi is to blame for this.\nAshrah: Who else wields such dark magic?',
 'Ashrah: You also desire absolution?\nAshrah: I seek it with every swipe of my blade.',
 'Ashrah: How is your soul already redeemed?\nAshrah: I am an unrelenting foe of evil.',
 "Ashrah: Together we can rid all realms of evil.\nAshrah: It's a task I must face alone.",
 'Ashrah: You must live in unbearable pain.\nBaraka: The suffering makes me stronger.',
 "Ashrah: How did a merchant learn to fight?\nBaraka: I was once in Outworld's military.",
 "Ashrah: Kombat won't ease your symptoms.\nBaraka: But for a while, I will forget them.",
 'Ashrah: Your kin have forsaken you?\nBaraka: Down to the last member of my family.',
 'Ashrah: Scorpion has offered you forgiveness?\nCyrax: Yes. But only if I complete my penance.',
 'Ashrah: If I had bee

## Fecha navegador que esta sendo controlado pelo WebDriver

In [11]:
driver.quit()

## Função para dividir cada string da lista em quatro partes

In [12]:
def dividir_texto(texto):
    partes = texto.split(':', 1)
    if len(partes) == 2:
        parte1 = partes[0].strip()
        restante = partes[1]
        subpartes = restante.split('\n', 1)
        if len(subpartes) == 2:
            parte2, restante2 = subpartes[0], subpartes[1]
            parte2 = parte2.strip()
            subpartes2 = restante2.split(':', 1)
            if len(subpartes2) == 2:
                parte3, parte4 = subpartes2
                parte3 = parte3.strip()
                parte4 = parte4.strip()
                return parte1, parte2, parte3, parte4

## Chama a função que divide as strings, armazena em dicionários e organiza tudo em um DataFrame

In [13]:
dados = []

for texto in lista_dialogos:
    parte1, parte2, parte3, parte4 = dividir_texto(texto)
    dados.append({'speaker': parte1, 'dialogue_speaker': parte2, 'replier': parte3, 'dialogue_replier': parte4})

# Criar DataFrame
df = pd.DataFrame(dados)

## Ordenando o DataFrame 

In [14]:
df = df.sort_values(by=['speaker', 'replier'])
df.head()

Unnamed: 0,speaker,dialogue_speaker,replier,dialogue_replier
0,Ashrah,What hell spawned you?,Ashrah,You will never find out.
1,Ashrah,You are all that remains of my sins.,Ashrah,The evil within you cannot be destroyed.
2,Ashrah,Quan Chi is to blame for this.,Ashrah,Who else wields such dark magic?
3,Ashrah,You also desire absolution?,Ashrah,I seek it with every swipe of my blade.
4,Ashrah,How is your soul already redeemed?,Ashrah,I am an unrelenting foe of evil.


## Conferindo os valores únicos

In [15]:
df['speaker'].unique()

array(['Ashrah', 'Baraka', 'Cassie Cage', 'Cyrax', 'Ermac',
       'General Shao', 'Geras', 'Havik', 'Homelander', 'Jade',
       'Johnny Cage', 'Johnny Cage JCVD', 'Kenshi', 'Kitana', 'Kung Jin',
       'Kung Lao', 'Li Mei', 'Liu Kang', 'Mileena', 'Nitara',
       'Noob Saibot', 'Omni-Man', 'Peacemaker', 'Quan Chi', 'Raiden',
       'Rain', 'Reiko', 'Reptile', 'Scorpion', 'Sektor', 'Shang Tsung',
       'Sindel', 'Smoke', 'Sub-Zero', 'Takeda', 'Tanya'], dtype=object)

## Renomeando Valores

In [16]:
df['speaker'] = df['speaker'].replace(['Johnny Cage JCVD'], 'Jean-Claude Van Damme')
df['replier'] = df['replier'].replace(['Johnny Cage JCVD'], 'Jean-Claude Van Damme')

print(df['speaker'].unique())
print(len(df['speaker'].unique()))

['Ashrah' 'Baraka' 'Cassie Cage' 'Cyrax' 'Ermac' 'General Shao' 'Geras'
 'Havik' 'Homelander' 'Jade' 'Johnny Cage' 'Jean-Claude Van Damme'
 'Kenshi' 'Kitana' 'Kung Jin' 'Kung Lao' 'Li Mei' 'Liu Kang' 'Mileena'
 'Nitara' 'Noob Saibot' 'Omni-Man' 'Peacemaker' 'Quan Chi' 'Raiden' 'Rain'
 'Reiko' 'Reptile' 'Scorpion' 'Sektor' 'Shang Tsung' 'Sindel' 'Smoke'
 'Sub-Zero' 'Takeda' 'Tanya']
36


In [17]:
siglas = {
    'Ashrah': 'AS',
    'Baraka': 'BA',
    'Cassie Cage' : 'CG',
    'Cyrax' : 'CY',
    'Ermac': 'ER',
    'General Shao': 'GS',
    'Geras': 'GE',
    'Havik': 'HA',
    'Homelander': 'HO',
    'Jade' : 'JA',
    'Jean-Claude Van Damme': 'JCVD',
    'Johnny Cage': 'JC',
    'Kenshi': 'KE',
    'Kitana': 'KI',
    'Kung Jin' : 'KJ',
    'Kung Lao': 'KL',
    'Li Mei': 'LM',
    'Liu Kang': 'LK',
    'Mileena': 'MI',
    'Nitara': 'NI',
    'Noob Saibot' : 'NS',
    'Omni-Man': 'OM',
    'Peacemaker': 'PE',
    'Quan Chi': 'QC',
    'Raiden': 'RA',
    'Rain': 'RI',
    'Reiko': 'RE',
    'Reptile': 'RP',
    'Scorpion': 'SC',
    'Sektor' : 'SE',
    'Shang Tsung': 'ST',
    'Sindel': 'SI',
    'Smoke': 'SM',
    'Sub-Zero': 'SZ',
    'Takeda' : 'TK',
    'Tanya': 'TA'
}

len(siglas)

36

In [18]:
combination_count = {}

def combinar_e_adicionar_sequencia(row):
    sigla1 = siglas.get(row['speaker'], '')
    sigla2 = siglas.get(row['replier'], '')
    combination = f"{sigla1}-{sigla2}"
    
    if combination not in combination_count:
        combination_count[combination] = 1
    else:
        combination_count[combination] += 1
        
    return f"{combination}-{combination_count[combination]}"

In [19]:
df['ID'] = df.apply(combinar_e_adicionar_sequencia, axis=1)

df.head(50)

Unnamed: 0,speaker,dialogue_speaker,replier,dialogue_replier,ID
0,Ashrah,What hell spawned you?,Ashrah,You will never find out.,AS-AS-1
1,Ashrah,You are all that remains of my sins.,Ashrah,The evil within you cannot be destroyed.,AS-AS-2
2,Ashrah,Quan Chi is to blame for this.,Ashrah,Who else wields such dark magic?,AS-AS-3
3,Ashrah,You also desire absolution?,Ashrah,I seek it with every swipe of my blade.,AS-AS-4
4,Ashrah,How is your soul already redeemed?,Ashrah,I am an unrelenting foe of evil.,AS-AS-5
5,Ashrah,Together we can rid all realms of evil.,Ashrah,It's a task I must face alone.,AS-AS-6
6,Ashrah,You must live in unbearable pain.,Baraka,The suffering makes me stronger.,AS-BA-1
7,Ashrah,How did a merchant learn to fight?,Baraka,I was once in Outworld's military.,AS-BA-2
8,Ashrah,Kombat won't ease your symptoms.,Baraka,"But for a while, I will forget them.",AS-BA-3
9,Ashrah,Your kin have forsaken you?,Baraka,Down to the last member of my family.,AS-BA-4
