In [None]:
%cd ../..
%load_ext autoreload

%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import json
import folium
from collections import Counter
import os
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval

from emv.client.get_content import get_features
from emv.utils import dataframe_from_hdf5
from emv.settings import DRIVE_PATH

from emv.features.wikidata import get_wikidata_id, get_property, get_wikidata_label
from emv.features.wikidata import process_batch

# Load from API

In [None]:
features = get_features(feature_type='transcript+ner', max_features=None)

In [None]:
df = pd.DataFrame(features)

In [None]:
metadata = dataframe_from_hdf5(DRIVE_PATH + "rts/metadata", "rts_metadata")
metadata.reset_index(inplace=True)
metadata.rename(columns = {"mediaId": "rts_id"}, inplace=True)

In [None]:
df["rts_id"] = df["media_id"].apply(lambda x: x.split("-")[1])
df = df.merge(metadata, on='rts_id', how='left')

In [None]:
sample_rts = pd.read_csv(DRIVE_PATH + "rts/aibox-vectors/videos.csv")
sample_rts_ids = sample_rts.umid.tolist()
df = df[df.rts_id.isin(sample_rts.umid)].reset_index(drop=True)
print(f"Processed {len(df.rts_id.unique())} videos out of {len(sample_rts_ids)} - {len(df.rts_id.unique())/len(sample_rts_ids)*100:.2f}%")

In [None]:
df["publishedDate"] = pd.to_datetime(df["publishedDate"])
df["year"] = df["publishedDate"].dt.year

## Extracting entities

In [None]:
def get_entities(data):
    """Return the flat list of named entities stored in one feature payload.

    Two layouts are handled:
    - a top-level ``"entities"`` key, whose value is returned (``None`` becomes
      an empty list);
    - a ``"transcript"`` list of segment dicts, each optionally carrying its own
      ``"entities"`` list; these are concatenated in segment order.

    Anything else (a missing payload, a transcript that is not a list of
    segments, a segment without entities) contributes no entities instead of
    raising, so the function can be applied to a whole column safely.
    """
    if not isinstance(data, dict):
        return []
    if "entities" in data:
        return data["entities"] or []

    transcript = data.get("transcript", [])
    if not isinstance(transcript, list):
        # A transcript that is not split into segments has no per-segment entities.
        return []

    return [
        entity
        for segment in transcript
        if isinstance(segment, dict)
        for entity in (segment.get("entities") or [])
    ]

In [None]:
df["entities"] = df["data"].apply(get_entities)

In [None]:
entity_types = list(set([e[1] for sublist in df.entities for e in sublist]))
entity_types

In [None]:
df["locations"] = df["entities"].apply(lambda x: [e[0] for e in x if e[1] == "LOC" and len(e[0]) > 2])
df["people"] = df["entities"].apply(lambda x: [e[0] for e in x if e[1] == "PER" and len(e[0]) > 2])
df["orgs"] = df["entities"].apply(lambda x: [e[0] for e in x if e[1] == "ORG" and len(e[0]) > 1])
df["misc"] = df["entities"].apply(lambda x: [e[0] for e in x if e[1] == "MISC" and len(e[0]) > 2])

In [None]:
df = df[["media_id", "rts_id", "year", "data", "categoryName", "contentType", "title", "resume", "mediaDuration", "locations", "people", "orgs", "misc"]]

In [None]:
df.to_csv("data/rts_sample.csv", index=False, sep = "\t")

# Load presaved data

In [None]:
df = pd.read_csv("data/rts_sample.csv", 
                 sep = "\t", 
                 converters = {
                     "data": literal_eval,
                     "locations": literal_eval,
                     "people": literal_eval,
                     "orgs": literal_eval,
                     "misc": literal_eval
                })

In [None]:
df.head(2)

In [None]:
df["transcript"] = df["data"].map(lambda x: x.get("transcript", []))
df = df[df.transcript.map(lambda x: type(x) == list)].reset_index(drop=True) # Get full videos with speaker diarization info
print(f"Processed {len(df)} videos.")

In [None]:
df.transcript.map(lambda x: len(x)).mean(), df.transcript.map(lambda x: len(x)).sem()

In [None]:
df.transcript.map(lambda x: len(x)).hist(bins=100, grid = False, edgecolor = 'black')
plt.xscale("log")

In [None]:
# Total number of words in the corpus. Words are counted per segment and the
# integers are added up: summing the joined strings would glue the last word of
# one video to the first word of the next (no separator) and under-count.
df.transcript.map(lambda segments: sum(len(s.get("t", "").split()) for s in segments)).sum()

In [None]:
df.mediaDuration.sum() / 3600

In [None]:
print(f"Locations extracted: {df.locations.explode().nunique()}")   
print(f"People extracted: {df.people.explode().nunique()}")
print(f"Organizations extracted: {df.orgs.explode().nunique()}")
print(f"Miscellaneous entities extracted: {df.misc.explode().nunique()}")

In [None]:
labels = ["Locations", "People", "Organizations", "Miscellaneous"]
sizes = [df.locations.explode().nunique(), df.people.explode().nunique(), df.orgs.explode().nunique(), df.misc.explode().nunique()]

plt.figure(figsize=(6, 4))
plt.bar(labels, sizes, edgecolor = 'black')
plt.ylabel("Number of entities", fontweight = "bold")
plt.xlabel("Entity type", fontweight = "bold")
plt.show()

In [None]:
print(f"Entities extracted: {sum(sizes)}")

In [None]:
# Distribution of the videos over publication years, saved as a PNG.
os.makedirs("data/outputs", exist_ok=True)  # savefig does not create missing folders
plt.figure(figsize=(6, 4))
df.year.hist(bins=50, grid = False, edgecolor='black')
plt.xlabel("Year", fontweight='bold')
plt.ylabel("Number of videos", fontweight='bold')
plt.savefig("data/outputs/year_distribution.png", dpi=300)
plt.show()

# Locations

## Map of Switzerland

In [None]:
# One row per location string, with the per-year mention counts and the total
# number of mentions, most mentioned first.
locations = df[["locations", "year"]].explode("locations").dropna().groupby("locations").agg(list)
locations["count"] = locations["year"].apply(len)
locations["year"] = locations.year.map(lambda x: Counter(x))
locations = locations.sort_values("count", ascending=False).reset_index()

print(f"Found {len(locations)} locations in the dataset.")

# Kept as the last expression so the preview is actually rendered by the cell.
locations.head(10)

### Manual matching

In [None]:
with open("emv/features/cities.json", "r") as f:
    cities = json.load(f)

In [None]:
cities = pd.DataFrame([{"locations":k, "lon":float(v[0]), "lat":float(v[1])} for k,v in cities.items() if len(v) == 2])

In [None]:
cities.head()

In [None]:
found_cities = pd.merge(locations, cities, on="locations", how="left").dropna(subset = ["lat", "lon"]).reset_index()
found_cities["first_mention"] = found_cities["year"].map(lambda x: min(x.keys()))
found_cities = found_cities[found_cities["count"] > 25]
found_cities.head()

In [None]:
found_cities["count"].describe()

In [None]:
import branca.colormap as cm

colormap = cm.linear.plasma.scale(found_cities["first_mention"].min(), found_cities["first_mention"].max()).to_step(10)

# Create a base map centered around Switzerland
m = folium.Map(location=[46.8182, 8.2275], zoom_start=8)
size_multiplier = 1
# Add city points to the map
for index, row in found_cities.iterrows():
    color = colormap(row['first_mention'])
    folium.CircleMarker(
        location=(row['lon'], row['lat']),
        radius = np.sqrt(row['count'] / np.pi) * size_multiplier,
        color="black",
        weight = 1,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        tooltip=row['locations'] + ': ' + str(row['count']) + ' occurrences - First mentioned in ' + str(row['first_mention']),
    ).add_to(m)

m.add_child(colormap)

m

### Matching with Wikidata

In [None]:
# Keep the 1000 most mentioned locations. The copy makes the result an independent
# frame, so the columns added in the following cells are not written to a slice
# of the full table.
locations = locations.head(1000).copy()

In [None]:
locations["wikidata_id"] = locations["locations"].apply(get_wikidata_id)

In [None]:
locations = locations[locations.wikidata_id.map(lambda x: len(x) > 0)].reset_index(drop=True)

In [None]:
locations["wikidata_label"] = locations.wikidata_id.map(lambda x: x[0].get("label", "MISSING_LABEL") if len(x) > 0 else None)
locations["wikidata_description"] = locations.wikidata_id.map(lambda x: x[0].get("description", "MISSING_DESCRIPTION") if len(x) > 0 else None)
locations["wikidata_id"] = locations.wikidata_id.map(lambda x: x[0].get("id", "MISSING_ID") if len(x) > 0 else None)

In [None]:
aggregation_functions = {
    'year': 'sum',
    'count': 'sum',
    'wikidata_id': 'first',
    'wikidata_label': 'first',
    'wikidata_description': 'first',
}

In [None]:
len_locs = len(locations)
locations = locations.groupby("wikidata_id").agg(aggregation_functions).reset_index(drop = True)
print(f"Found {len_locs - len(locations)} duplicate locations.")

In [None]:
# Keep frequently mentioned locations only. The copy detaches the result from the
# unfiltered frame so that later column assignments do not target a slice.
locations = locations[locations["count"] > 25].copy()
print(f"Found {len(locations)} locations with more than 25 occurrences.")

In [None]:
locations["instance_of"] = locations.wikidata_id.map(lambda x: get_property(x, "P31", delay = 2))

In [None]:
instances = locations.instance_of.explode().dropna().unique()
instances = {id_: get_wikidata_label(id_, delay = 2) for id_ in instances}

In [None]:
# Translate every "instance of" id into its English label. An id with no fetched
# record or no English label yields None instead of raising, matching the
# tolerant `.get("en", {})` lookups used for persons further down.
locations["instance_of_label"] = locations.instance_of.map(
    lambda x: [(instances.get(id_) or {}).get("en", {}).get("value") for id_ in x]
)

In [None]:
locations.to_csv("data/rts_sample_locations_filtered.csv", index=False, sep = "\t")

In [None]:
locations.instance_of_label.explode().value_counts()[:30].plot(kind = "barh", figsize=(8, 6))
plt.ylabel("")
plt.xlabel("Number of occurrences", fontweight = "bold")
plt.show()

In [None]:
# Reload the filtered locations. `instance_of_label` is written to the CSV as the
# repr of a list, so it is parsed back into a real list here; left as text,
# `explode` does nothing and membership tests silently become substring matches.
# `year` stays as text on purpose: it is parsed in its own cell further down.
parse_list = lambda cell: literal_eval(cell) if cell else []
locations = pd.read_csv(
    "data/rts_sample_locations_filtered.csv",
    sep = "\t",
    converters={"count": literal_eval, "instance_of_label": parse_list},
)

In [None]:
[x for x in locations.instance_of_label.explode().dropna().unique() if "Switzerland" in x or "Swiss" in x]

In [None]:
def is_swiss_location(instances):
    """Tell whether `instances` contains one of the Swiss locality labels.

    Membership is tested with `in`, so `instances` may be any container of
    labels; if it is a string the check is a substring match.
    """
    swiss_labels = (
        "municipality of Switzerland",
        "city of Switzerland",
        "cantonal capital of Switzerland",
    )
    return any(swiss_label in instances for swiss_label in swiss_labels)

# Select the Swiss locations. The copy is needed because coordinate and year
# columns are assigned to `found_cities` in the next cells; without it those
# assignments would target a slice of `locations`.
found_cities = locations[locations.instance_of_label.map(is_swiss_location)].copy()
print(f"Found {len(found_cities)} locations in Switzerland.")

In [None]:
# Fetch property P625 for every Swiss location and split the "Point(a b)" text of
# the first returned value into its two numbers: the first is stored as `lat`,
# the second as `lon`.
# NOTE(review): Wikidata point literals are normally "Point(longitude latitude)",
# which would mean these two column names are swapped - TODO confirm. The map cell
# passes (row['lon'], row['lat']) as the marker location, so if the names are
# corrected here that tuple must be flipped at the same time.
found_cities["coords"] = found_cities.wikidata_id.map(lambda x: get_property(x, "P625"))
found_cities["lat"] = found_cities.coords.map(lambda x: float(x[0].replace("Point(", "").split(" ")[0]))
found_cities["lon"] = found_cities.coords.map(lambda x: float(x[0].replace("Point(", "").split(" ")[1].replace(")", "")))

In [None]:
# `year` comes back from the CSV as the text "Counter({...})": strip the wrapper
# and parse the remaining dict literal. Values that are already parsed are kept
# unchanged, so re-running this cell does not fail.
found_cities["year"] = found_cities["year"].map(
    lambda x: literal_eval(x.replace("Counter(", "").replace(")", "")) if isinstance(x, str) else x
)

In [None]:
found_cities["first_mention"] = found_cities["year"].map(lambda x: min(x.keys()))

In [None]:
import branca.colormap as cm

colormap = cm.linear.plasma.scale(found_cities["first_mention"].min(), found_cities["first_mention"].max()).to_step(10)

# Create a base map centered around Switzerland
m = folium.Map(location=[46.8182, 8.2275], zoom_start=8.3)
size_multiplier = 1.5
# Add city points to the map
for index, row in found_cities.iterrows():
    color = colormap(row['first_mention'])
    folium.CircleMarker(
        location=(row['lon'], row['lat']),
        radius = np.sqrt(row['count'] / np.pi) * size_multiplier,
        color="black",
        weight = 1,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        tooltip=row['wikidata_label'] + ': ' + str(row['count']) + ' occurrences - First mentioned in ' + str(row['first_mention']),
    ).add_to(m)

m.add_child(colormap)

m

## Evolution over time

In [None]:
# Total mentions of Swiss locations per year, over all locations.
# The per-location values are accumulated in a Counter: after the CSV round trip
# they are plain dicts, which cannot be added together by `Series.sum()`.
mentions_counter = Counter()
for yearly_counts in found_cities["year"]:
    mentions_counter.update(yearly_counts)
n_mentions_per_year = pd.DataFrame(list(mentions_counter.items()), columns=["year", "count"])
n_mentions_per_year = n_mentions_per_year.sort_values("year")

plt.figure(figsize=(6, 4))
# Plot the yearly totals computed above rather than the raw per-location dicts.
sns.lineplot(data=n_mentions_per_year, x="year", y="count")
plt.title("Number of mentions of locations in Switzerland in the RTS videos")
plt.show()

In [None]:
first_year = found_cities.year.map(lambda x: min(x.keys())).min()
last_year = found_cities.year.map(lambda x: max(x.keys())).max()

n_mentions_per_year = found_cities[["wikidata_label", "year", "count"]].set_index("wikidata_label")
n_mentions_per_year["year"] = n_mentions_per_year["year"].map(lambda x: [x.get(year, 0) for year in range(first_year, last_year + 1)])
n_mentions_per_year.sort_values("count", ascending=False, inplace=True)
n_mentions_per_year.head()

In [None]:
plt.figure(figsize=(16, 6))
sns.heatmap(n_mentions_per_year["year"].values.tolist(), cmap = "viridis")
plt.xticks(ticks = np.array(list(range(last_year - first_year + 1))[::5]) + 0.5, labels = range(first_year, last_year + 1, 5), rotation = 0)
plt.yticks(ticks = np.arange(len(n_mentions_per_year)) + 0.5, labels = n_mentions_per_year.index, rotation = 0)
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
for i, row in n_mentions_per_year[:10].iterrows():
    plt.plot(range(first_year, last_year + 1), row["year"], label = i)
plt.legend()
plt.show()    

In [None]:
# Share of each year's Swiss-location mentions that goes to every location.
# The year -> total mapping is rebuilt from `found_cities` here because the name
# `n_mentions_per_year` is rebound above to a per-location table (indexed by
# label, with list-valued `year`), which cannot provide it.
n_mentions_per_year_dict = Counter()
for yearly_counts in found_cities["year"]:
    n_mentions_per_year_dict.update(yearly_counts)
found_cities["relative_counts"] = found_cities.year.map(lambda x: {k:v / n_mentions_per_year_dict[k] for k,v in x.items()})

In [None]:
# Bar chart of the 20 most mentioned locations.
# The table built from Wikidata has a `wikidata_label` column and no `locations`
# column, so pick whichever name column is present.
name_col = "locations" if "locations" in found_cities.columns else "wikidata_label"
# Rank explicitly: after grouping by Wikidata id the rows are not ordered by count.
top_20 = found_cities.sort_values("count", ascending=False).head(20)

plt.figure(figsize=(6, 4))
top_20.sort_values("count").set_index(name_col)["count"].plot(kind="barh")
plt.xlabel("Number of mentions")
plt.title("Top 20 locations mentioned in the RTS videos")
plt.show()

In [None]:
top_N = 20
skip_first_n = 1  # leave out the most mentioned location so it does not dominate the colour scale

# The Wikidata-matched table has `wikidata_label` instead of `locations`.
name_col = "locations" if "locations" in found_cities.columns else "wikidata_label"
# Rank explicitly: after grouping by Wikidata id the rows are not ordered by count.
ranked_cities = found_cities.sort_values("count", ascending=False)
top_cities = ranked_cities[skip_first_n:top_N + skip_first_n]

# Rows are locations, columns are years (sorted); years without mention count as 0.
counts_per_year = pd.DataFrame(top_cities.year.tolist(), index=top_cities[name_col]).fillna(0)
counts_per_year = counts_per_year.T.sort_index().T

plt.figure(figsize=(12, 6))
sns.heatmap(counts_per_year, cmap="Blues", cbar_kws={'label': 'Number of mentions'})
plt.title("Number of mentions of locations in Switzerland in the RTS videos")
plt.xticks(rotation=45)
plt.show()

plt.figure(figsize=(12, 6))
for loc,mentions in counts_per_year.iterrows():
    plt.plot(mentions.index, mentions.values, label=loc)
plt.legend()
plt.title(f"Number of mentions of the top {len(counts_per_year)} locations in Switzerland in the RTS videos")
plt.show()

In [None]:
top_N = 20
skip_first_n = 1  # leave out the most mentioned location so it does not dominate the colour scale

# The Wikidata-matched table has `wikidata_label` instead of `locations`.
name_col = "locations" if "locations" in found_cities.columns else "wikidata_label"
# Rank explicitly: after grouping by Wikidata id the rows are not ordered by count.
ranked_cities = found_cities.sort_values("count", ascending=False)
top_cities = ranked_cities[skip_first_n:top_N + skip_first_n]

# Rows are locations, columns are years (sorted); years without mention count as 0.
counts_per_year = pd.DataFrame(top_cities.relative_counts.tolist(), index=top_cities[name_col]).fillna(0)
counts_per_year = counts_per_year.T.sort_index().T

# Rescale every location so that its values sum to 1 over the years.
counts_per_year = counts_per_year.div(counts_per_year.sum(axis=1), axis=0)

# The values plotted below are normalised shares, not raw counts, hence the labels.
plt.figure(figsize=(12, 6))
sns.heatmap(counts_per_year, cmap="Blues", cbar_kws={'label': 'Relative share of mentions (normalised per location)'})
plt.title("Relative mentions of locations in Switzerland in the RTS videos")
plt.xticks(rotation=45)
plt.show()

plt.figure(figsize=(12, 6))
for loc,mentions in counts_per_year.iterrows():
    plt.plot(mentions.index, mentions.values, label=loc)
plt.legend()
plt.title(f"Relative mentions of the top {len(counts_per_year)} locations in Switzerland in the RTS videos")
plt.show()

# People

In [None]:
filter_people = ["messieurs", "monsieur", "madame", "mesdames"]

df["people"] = df["people"].apply(lambda x: [p for p in x if p.lower() not in filter_people])   

In [None]:
persons = df["transcript"].to_frame()
persons["entities"] = df.transcript.map(lambda x: [t.get("entities", None) for t in x])
persons["context"] = df.transcript.map(lambda x: [t["t"] for t in x])
persons = persons.explode(["entities", "context"]).explode("entities").reset_index(drop=True).dropna(subset = ["entities"])
persons["entities"] = persons.entities.map(lambda x: x[0] if x[1] == "PER" else None)
persons = persons.dropna(subset = ["entities"]).reset_index(drop=True)
persons.head()

In [None]:
top_persons = df[["people", "year"]].explode("people").dropna().groupby("people").agg(list).reset_index()
top_persons["count"] = top_persons["year"].apply(len)
top_persons["year"] = top_persons.year.map(lambda x: Counter(x))
top_persons = top_persons.sort_values("count", ascending=False)
print(f"Found {len(top_persons)} persons in the dataset.")
print(f"Mean number of mentions per person: {top_persons['count'].mean():.2f} +/- {top_persons['count'].sem():.2f}")

In [None]:
# Keep persons mentioned more than `min_count` times. The copy detaches the result
# from the full table, because Wikidata columns are assigned to it in the next cells.
min_count = 50
top_persons = top_persons[top_persons["count"] > min_count].copy()
print(f"Found {len(top_persons)} persons with more than {min_count} occurrences.")
top_persons.head(10)

In [None]:
top_persons["wikidata_search"] = top_persons["people"].map(lambda x: get_wikidata_id(x, top_n = 10, delay = 1))
top_persons = top_persons[top_persons["wikidata_search"].map(lambda x: len(x) > 0)]

In [None]:
top_persons = top_persons[top_persons["wikidata_search"].map(lambda x: len(x) > 0)]

In [None]:
top_persons["wikidata_candidates"] = top_persons["wikidata_search"].map(lambda x: [(c.get("id"), c.get("description")) for c in x])

In [None]:
persons_candidates = top_persons[["people", "wikidata_candidates"]].to_dict("records")
persons_candidates[:2]

In [None]:
persons[persons.entities == "Roger"].head().context.tolist()

In [None]:
top_persons["wikidata_id"] = top_persons["wikidata_search"].apply(lambda x: x[0].get("id", "MISSING_ID") if len(x) > 0 else None)
top_persons["wikidata_label"] = top_persons["wikidata_search"].apply(lambda x: x[0].get("label", "MISSING_LABEL") if len(x) > 0 else None)
top_persons["wikidata_description"] = top_persons["wikidata_search"].apply(lambda x: x[0].get("description", "MISSING_DESCRIPTION") if len(x) > 0 else None)

In [None]:
aggregation_functions = {
    'year': 'sum',
    'count': 'sum',
    'wikidata_id': 'first',
    'wikidata_label': 'first',
    'wikidata_description': 'first',
}

len_persons = len(top_persons)
top_persons = top_persons.groupby("wikidata_id").agg(aggregation_functions).reset_index(drop = True)
print(f"Found {len_persons - len(top_persons)} duplicate persons.")

In [None]:
top_persons

In [None]:
instance_of = process_batch(top_persons["wikidata_id"].dropna().tolist(), "P31", BATCH_SIZE=20)
top_persons["instance_of"] = top_persons["wikidata_id"].map(instance_of)

In [None]:
instances = top_persons.instance_of.dropna().unique().tolist()
instances = {i:get_wikidata_label(i.split("/")[-1]) for i in instances}

In [None]:
top_persons["instance_of"] = top_persons["instance_of"].map(instances)
top_persons["instance_of"] = top_persons["instance_of"].fillna("MISSING").map(lambda x: x.get("en", {}).get("value", "MISSING") if x != "MISSING" else "MISSING")

In [None]:
top_persons["instance_of"].value_counts()

## Focus on humans (instance of Q5)

In [None]:
top_persons.dropna(subset=["wikidata_id", "instance_of"], inplace=True)

In [None]:
# Keep entries whose "instance of" label is "human". The copy is needed because
# citizenship and occupation columns are assigned to the result afterwards.
top_persons = top_persons[top_persons.instance_of == "human"].copy()

In [None]:
top_persons.shape

In [None]:
ids = top_persons["wikidata_id"].tolist()
citizenship = process_batch(ids, "P27", BATCH_SIZE=20)
top_persons["citizenship"] = top_persons["wikidata_id"].map(citizenship)

occupation = process_batch(ids, "P106", BATCH_SIZE=20)
top_persons["occupation"] = top_persons["wikidata_id"].map(occupation)

In [None]:
citizenship_labels = {k:get_wikidata_label(k.split("/")[-1]) for k in top_persons["citizenship"].dropna().unique()}
top_persons["citizenship"] = top_persons["citizenship"].map(citizenship_labels)

occupation_labels = {k:get_wikidata_label(k.split("/")[-1]) for k in top_persons["occupation"].dropna().unique()}
top_persons["occupation"] = top_persons["occupation"].map(occupation_labels)

In [None]:
top_persons["citizenship"] = top_persons["citizenship"].fillna("MISSING").map(lambda x: x.get("en", {}).get("value", "MISSING") if x != "MISSING" else "MISSING")
top_persons["occupation"] = top_persons["occupation"].fillna("MISSING").map(lambda x: x.get("en", {}).get("value", "MISSING") if x != "MISSING" else "MISSING")

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
top_persons.occupation.value_counts()[:20].plot(kind="barh", ax=axs[0])
axs[0].set_title("Occupations of most occurring persons")
top_persons.citizenship.value_counts()[:20].plot(kind="barh", ax=axs[1])
axs[1].set_title("Citizenships of most occurring persons")
plt.tight_layout()
plt.show()

# Sentiment Analysis

In [None]:
from transformers import pipeline

In [None]:
sentences = df[["media_id", "rts_id", "year", "mediaDuration", "transcript"]].explode("transcript")
sentences.dropna(subset="transcript", inplace=True)
sentences["transcript"] = sentences.transcript.map(lambda x: x.get("t", "")).tolist()
print(f"Extracted {len(sentences)} sentences")

In [None]:
sentences["sentence_length"] = sentences.transcript.map(lambda x: len(x.split()))
sentences.sentence_length.describe()

In [None]:
sentences_per_year = sentences.year.value_counts().sort_index()

plt.figure(figsize=(8, 4))
sentences_per_year.plot(kind="bar")
# Bars sit at positions 0..n-1; keep every fifth tick and give it its own year.
# The labels must be passed explicitly: the bar plot installs a fixed list of
# labels that is otherwise consumed in order, which would put the 2nd year under
# the 6th bar, the 3rd year under the 11th bar, and so on.
tick_positions = range(0, len(sentences_per_year), 5)
plt.xticks(tick_positions, sentences_per_year.index[::5], rotation=0)
plt.ylabel("Number of sentences")
plt.title("Number of sentences per year")
plt.show()

In [None]:
# Sample N sentences per year
N = 100
SEED = 42  # fixed so the sample, and every score derived from it, is reproducible
sampled_groups = []
for year, group in sentences.groupby("year"):
    if N > len(group):
        # Fewer than N sentences available: keep them all.
        sampled_group = group
        print(f"Only sampled {len(group)} sentences for year {year}")
    else:
        sampled_group = group.sample(N, random_state=SEED)
    sampled_groups.append(sampled_group)
# Concatenate once at the end instead of growing a frame inside the loop.
sampled_sentences = pd.concat(sampled_groups) if sampled_groups else pd.DataFrame()
sampled_sentences.reset_index(drop=True, inplace=True)
print(f"\nSampled {len(sampled_sentences)} sentences")

In [None]:
sampled_sentences.transcript.map(lambda x: type(x)).value_counts()

## Sentiment Score

In [None]:
sentiment_classifier = pipeline(
    model = "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    top_k = None
)

In [None]:
sampled_sentences["sentiment_scores"] = sampled_sentences.transcript.map(lambda x: sentiment_classifier(x)[0])

In [None]:
def calculate_sentiment_score(positive_score, neutral_score, negative_score):
    """Return the polarity of a sentence: positive score minus negative score.

    `neutral_score` is accepted so that callers can pass all three class scores,
    but it does not enter the result.
    """
    return positive_score - negative_score



In [None]:
sampled_sentences.dropna(subset=["sentiment_scores"], inplace=True)

# Turn each list of {"label": ..., "score": ...} dicts into a label -> score
# mapping, then spread it over one column per class plus the winning class.
sampled_sentences["sentiment_scores"] = sampled_sentences["sentiment_scores"].map(lambda x: {s["label"]:s["score"] for s in x})
sampled_sentences["positive_score"] = sampled_sentences["sentiment_scores"].map(lambda x: x.get("positive", 0))
sampled_sentences["negative_score"] = sampled_sentences["sentiment_scores"].map(lambda x: x.get("negative", 0))
sampled_sentences["neutral_score"] = sampled_sentences["sentiment_scores"].map(lambda x: x.get("neutral", 0))
sampled_sentences["top_sentiment"] = sampled_sentences["sentiment_scores"].map(lambda x: max(x, key=x.get))
sampled_sentences["top_sentiment_score"] = sampled_sentences["sentiment_scores"].map(lambda x: max(x.values()))

# `calculate_sentiment_score` is defined once, in its own cell above; it only
# subtracts, so it is applied to whole columns instead of row by row.
sampled_sentences["sentiment_score"] = calculate_sentiment_score(
    sampled_sentences["positive_score"],
    sampled_sentences["neutral_score"],
    sampled_sentences["negative_score"],
)
sampled_sentences.drop(columns = ["sentiment_scores"], inplace=True)

In [None]:
sampled_sentences.head(2)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
sampled_sentences.top_sentiment.value_counts().reindex(["positive", "negative", "neutral"]).plot(kind="bar", ax=axs[0])
axs[0].set_title("Sentiment distribution")
axs[0].set_xticks(range(3), ["positive", "negative", "neutral"], rotation=0)

sampled_sentences.sentiment_score.hist(ax=axs[1], bins=100, grid = False)
axs[1].set_title("Sentiment score distribution")

plt.show()

In [None]:
# Top k extreme sentences, by sentiment
k = 50
print(f"Top {k} positive sentences")
_ = [print(s) for s in sampled_sentences[sampled_sentences.top_sentiment == "positive"].sort_values("top_sentiment_score", ascending=False).transcript.tolist()[:k]]
print()
print(f"Top {k} negative sentences")
_ = [print(s) for s in sampled_sentences[sampled_sentences.top_sentiment == "negative"].sort_values("top_sentiment_score", ascending=False).transcript.tolist()[:k]]

In [None]:
# Per-clip sentiment: mean score and standard deviation of the sampled sentences
# of each clip (a missing deviation is replaced by 0).
per_clip_scores = sampled_sentences.groupby("media_id").sentiment_score
clips = per_clip_scores.mean().reset_index()
clips["sentiment_std"] = per_clip_scores.std().fillna(0).to_numpy()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].hist(clips.sentiment_score, bins = 100)
axs[0].set_title("Distribution of the mean sentiment score")

axs[1].hist(clips.sentiment_std, bins = 50)
axs[1].set_title("Distribution of the standard deviation of the sentiment score")

plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sentiment_scores_per_year = sampled_sentences.groupby("year").sentiment_score.mean().reset_index()
sentiment_scores_per_year["sentiment_std"] = sampled_sentences.groupby("year").sentiment_score.std().fillna(0).tolist() / np.sqrt(sampled_sentences.groupby("year").sentiment_score.count().tolist())
sentiment_scores_per_year.sort_values("year", inplace=True)

plt.errorbar(sentiment_scores_per_year.year, sentiment_scores_per_year.sentiment_score, yerr=sentiment_scores_per_year.sentiment_std, fmt='o')
plt.hlines(0, 1949, 2022, color="black", linestyle="--")
plt.xlim(1949, 2022)  # Limit the x-axis view
plt.ylabel("Mean sentiment score")
plt.title("Mean sentiment score of sentences per year")
plt.show()

In [None]:
sentiment_scores_per_year

In [None]:
sampled_sentences[sampled_sentences.year == 1957][["transcript", "sentiment_score"]].values

## Emotions

In [None]:
# `top_k=None` asks for the score of every label, as the sentiment pipeline above
# does; it replaces the deprecated `return_all_scores=True`.
# NOTE(review): the model name suggests an English, uncased emotion classifier -
# confirm that it is suitable for the language of the transcripts.
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', top_k=None)

In [None]:
sampled_sentences["emotions"] = sampled_sentences.transcript.apply(lambda x: classifier(x)[0])

In [None]:
sampled_sentences["emotions"] = sampled_sentences["emotions"].map(lambda x: {s["label"]:s["score"] for s in x})

In [None]:
sampled_sentences["top_emotion"] = sampled_sentences["emotions"].map(lambda x: max(x, key=x.get))

In [None]:
sampled_sentences["top_emotion"].value_counts()