# Analisi dati

### Analisi dei personaggi

Confronto fra i personaggi della web api e dello scraping, confrontare il nome in modo da porter collegare i dati dei personaggi ottenuti dallo scriping ai dati della web api


In [2]:
import pandas as pd
import re
from tqdm import tqdm
import json
import plotly.express as px


In [14]:
api_characters_dataset = pd.read_csv("./Datasets/api_character_list.csv")
scraping_characters_dataset = pd.read_csv("./Datasets/scraping_personaggi_completo_filtered_relations.csv")
film_dataset = pd.read_csv("./Datasets/film_data_scraping.csv")
tv_series_dataset = pd.read_csv("./Datasets/tv_series.csv")
comics = pd.read_csv("./Datasets/merged.csv")


In [6]:
def matching_characters(first_name, second_name):
  first_name_without_paranthesys = re.sub("[\(\[].*?[\)\]]", "", first_name.strip()).strip()
  first_name_without_paranthesys = first_name_without_paranthesys.replace("'", " ").replace('"', " ").replace("-"," ").strip()
  second_name_without_paranthesys = re.sub(
      "[\(\[].*?[\)\]]", "", second_name.strip()).strip()
  secondt_name_without_paranthesys = second_name_without_paranthesys.replace(
      "'", " ").replace('"', " ").replace("-", " ").strip()
  return first_name_without_paranthesys.lower() == second_name_without_paranthesys.lower()


In [4]:
for idx, row in tqdm(scraping_characters_dataset.iterrows(), total=scraping_characters_dataset.shape[0]):
  proc_name = re.sub(
      "[\(\[].*?[\)\]]", "", row["Processed_Name"].strip()).strip()
  scraping_characters_dataset.at[idx, "Processed_Name"] = proc_name.lower()
scraping_characters_dataset.head(10)
scraping_characters_dataset.to_csv(
    "./Datasets/scraping_personaggi_completo_filtered_relations.csv")


100%|██████████| 3746/3746 [00:00<00:00, 22160.72it/s]


In [5]:
num_link = 0
for idx, row in tqdm(api_characters_dataset.iterrows(), total=api_characters_dataset.shape[0]):
  formatted_name = re.sub(
      "[\(\[].*?[\)\]]", "", row["name"].strip()).strip()
  querystring = "Processed_Name == '%s'" % (
      formatted_name.replace("'", " ").replace('"', " ").replace(" \u2020", "").lower())
  if len(scraping_characters_dataset.query(querystring)) != 0:
    num_link += 1
print("Numero di corrispondenze: %s"%(num_link))
dataframe = pd.DataFrame([["Personaggi collegati", num_link], [
                         "Personaggi non collegate", api_characters_dataset.shape[0] - num_link]], columns=["type", "count"])
fig = px.bar(dataframe, x='type', y='count', width=700)
fig.update_traces(width=0.25)
fig.show()


100%|██████████| 1559/1559 [00:01<00:00, 1266.60it/s]


Numero di corrispondenze: 325


In [16]:
api_characters_dataset.duplicated(subset="name").sum()

1

In [15]:
api_characters_dataset.loc[api_characters_dataset["name"] == "Captain Marvel"]

Unnamed: 0,id,name,description,modified,thumbnail,resourceURI,comics,series,stories,events,urls


323 personaggi ottenuti dalla web api sono collegabili con i personaggi ottenuti tramite scraping

# Filtering relazioni

Identificazione preliminare delle relazioni che non trovano corrispondenza nella lista di personaggi

In [24]:
character_dataset_scraping = pd.read_csv("./Datasets/scraping_personaggi_completo_filtered_name.csv")
linked = 0
totale = 0
for idx, row in tqdm(character_dataset_scraping.iterrows(), total=character_dataset_scraping.shape[0]):
  try:
    relations = json.loads(row["Relazioni"])
    filtered_relations = []
    for relation in relations:
        totale += 1
        querystring = "Processed_Name == '%s'" % (
            relation[0].replace("'", " ").replace('"', " ").replace(" \u2020", ""))
        if len(character_dataset_scraping.query(querystring)) != 0:
          linked +=1 
  except Exception as e:
   None
print("Tenute %s relazioni su %s"%(linked, totale))
dataframe = pd.DataFrame([["Relazioni tenute", linked],["Relazioni scartate", totale - linked]], columns=["type", "count"])
fig = px.bar(dataframe, x='type', y='count', width=700)
fig.update_traces(width=0.25)
fig.show()


100%|██████████| 3746/3746 [00:43<00:00, 86.55it/s] 

Tenute 45309 relazioni su 61114





Analisi dati biografie

In [8]:
character_dataset_scraping = pd.read_csv(
    "./Datasets/scraping_personaggi_completo_filtered_name.csv")
linked = 0
totale = 0
for idx, row in tqdm(character_dataset_scraping.iterrows(), total=character_dataset_scraping.shape[0]):
  try:
    if len(row["Processed_Biography"]) < 10:
      print(row["Processed_Biography"])
    if row["Processed_Biography"].isdigit():
      print(row["Processed_Biography"])
  except Exception as e:
   None
print("Tenute %s relazioni su %s" % (linked, totale))


100%|██████████| 3746/3746 [00:00<00:00, 26561.26it/s]

Tenute 0 relazioni su 0





# Analisi fumetti

Analisi descrizioni fumetti


In [21]:
from neo4j import GraphDatabase
db_host = "bolt://localhost:7687"
password = "1234qwer"
driver = GraphDatabase.driver(db_host, auth=("neo4j", password))
with driver.session() as session:
  try:
    comics = session.run("MATCH (n:comic) WHERE n.comic_description = 'nan' WITH count(n) AS numero return numero").single()[0]
    
    total = session.run("MATCH (n:comic) WITH count(n) as totale RETURN totale").single()[0]
    # print("Ci sono %s fumetti senza descrizione su %s fumetti" % (comics, total))
    import plotly.express as px
    import pandas as pd
    dataframe = pd.DataFrame([["Fumetti senza descrizione", comics], [
                             "Fumetti con descrizione", total - comics]], columns=["type", "count"])
    dataframe.head()
    fig = px.bar(dataframe, x='type', y='count', width=700)
    fig.update_traces(width = 0.25)
    fig.show()
  except Exception as e:
    print(e)

# Redundancy

Personaggi API


In [10]:
tot = api_characters_dataset.shape[0]
dups_color = api_characters_dataset.pivot_table(columns=['name'], aggfunc='size')
dups_color.shape
print(tot - dups_color.shape[0] )

1


Personaggi Scraping

In [13]:
tot = scraping_characters_dataset.shape[0]
dups_color = scraping_characters_dataset.pivot_table(columns=['Nome'], aggfunc='size')
dups_color.shape
print(tot - dups_color.shape[0] )

0


Comics

In [18]:
tot = comics.shape[0]
dups_color = comics.pivot_table(columns=['title'], aggfunc='size')
dups_color.shape
print(tot - dups_color.shape[0] )

2216


Film

In [20]:
tot = film_dataset.shape[0]
dups_color = film_dataset.pivot_table(columns=['Title'], aggfunc='size')
dups_color.shape
print(tot - dups_color.shape[0] )

0


Serie Tv


In [22]:
tot = tv_series_dataset.shape[0]
dups_color = tv_series_dataset.pivot_table(columns=['Title'], aggfunc='size')
dups_color.shape
print(tot - dups_color.shape[0] )

0


# Completezza

personaggi api

In [44]:
tot = 0
for item in api_characters_dataset.isna().sum():
  tot += item
completezza_tab =1 -(tot /(api_characters_dataset.shape[0]*api_characters_dataset.shape[1])) 
print(completezza_tab)

0.9269928275701207


personaggi Scraping

In [45]:
tot = 0
for item in scraping_characters_dataset.isna().sum():
  tot += item
completezza_tab =1 -(tot /(scraping_characters_dataset.shape[0]*scraping_characters_dataset.shape[1])) 
print(completezza_tab)

0.9530650876086006


Comics

In [46]:
tot = 0
for item in comics.isna().sum():
  tot += item
completezza_tab =1-(tot /(comics.shape[0]*comics.shape[1])) 
print(completezza_tab)

0.8396189159838089


Film

In [47]:
tot = 0
for item in film_dataset.isna().sum():
  tot += item
completezza_tab = 1 - (tot /(film_dataset.shape[0]*film_dataset.shape[1]))
print(completezza_tab)

0.8915343915343915


Serie TV

In [43]:
tot = 0
for item in tv_series_dataset.isna().sum():
  tot += item
completezza_tab = 1 -(tot /(tv_series_dataset.shape[0]*tv_series_dataset.shape[1]))
print(completezza_tab)

0.9671052631578947
