# Tekstanalyse-demo med kunstig intelligens

**Global setup**

In [None]:
# Run the shared project setup script in this notebook's namespace.
# A missing "../global_setup.py" is reported as the setup having already run
# (presumably the script changes the working directory so the relative path no
# longer resolves on a second execution -- TODO confirm against global_setup.py).
try:
    with open("../global_setup.py") as setupfile:
        setup_source = setupfile.read()
except FileNotFoundError:
    print('Setup already completed')
else:
    # Executed outside the try block so that a FileNotFoundError raised by the
    # setup script itself is not mistaken for "setup already completed".
    exec(setup_source)

**Importér alle moduler**

In [None]:
import glob # Read the prime minister's speech files in a directory
import io # Read the prime-minister's speech file with UTF-8 encoding
import os # Derive speech names from the speech file paths
import urllib.parse # Used to URL-encode the query strings

import matplotlib # Plotting
import matplotlib.pyplot as plt # Plotting
import numpy as np # Plotting
import pandas as pd # Displaying results in a data-frame
import requests # Used to search wikipedia for the articles

from scipy.interpolate import spline # Smoothing matplotlib graphs
from afinn import Afinn # Sentiment analysis package
from IPython.core.display import display#, HTML # HTML displayer
from ipywidgets.widgets import Accordion, HTML
from notebooks.exercises.src.text.rsspedia import Rsspedia # Searching in Wiki for text matches using Okapi BM25
from notebooks.exercises.src.text.news_sentiment_1 import RSSDashboard
from notebooks.exercises.src.text.news_sentiment_2 import PrimeMinisterSpeechDashboard
from src.text.document_retrieval.wikipedia import Wikipedia # Generic Wikipedia class

## Nyhedsanalyse 1: Sentiment

Du kan vælge mellem forskellige danske nyhedskilder og se de seneste nyheder med deres sentiment-scores.

In [None]:
# Create the RSS news dashboard; its `data_titles` attribute is read by the
# Wikipedia-search cells further down, so this cell must run before them.
RSSdashboard = RSSDashboard()
# Render the dashboard's widget container in the notebook output.
display(RSSdashboard.widget_box)

## Nyhedsanalyse 2: relevante Wikipedia-sider (Okapi BM25)

In [None]:
# Set up the Wikipedia helper and (down)load the vocabulary
wikipedia = Wikipedia(language="danish", cache_directory_url=None)

# Search Wikipedia for every RSS headline currently held by the dashboard
rsspedia = Rsspedia(wikipedia)
rsspedia.search_wikipedia(RSSdashboard.data_titles)

# One HTML panel per headline, holding the search result stored at the same index
list_labels = [
    HTML(value=rsspedia.search_results[index])
    for index in range(len(RSSdashboard.data_titles))
]

accordion = Accordion(children=list_labels)

# Caption each accordion section with its 1-based position and the headline
for index, headline in enumerate(RSSdashboard.data_titles):
    accordion.set_title(index, "{}. {}".format(index + 1, headline))

display(accordion)

## Nyhedsanalyse 3: relevante Wikipedia-sider (Explicit Semantic Analysis)

In [None]:
from dasem.wikipedia import ExplicitSemanticAnalysis

# NOTE(review): this cell used to carry a commented-out `nltk.download('punkt')`
# hint; presumably the NLTK "punkt" data must be installed once before the ESA
# model can be used -- confirm against the dasem documentation.
esa = ExplicitSemanticAnalysis()

n_wiki_results = 3 # Number of related Wikipedia articles requested per headline
request_timeout = 10 # Seconds to wait for the Wikipedia API before giving up
content_items = []

for headline in RSSdashboard.data_titles:
    urls = []
    titles = []
    abstracts = []
    # Only one phrase is queried, so only the first entry of the result is used.
    related_titles = esa.related(headline.lower(), n = n_wiki_results)[0]
    # Iterate over what was actually returned (capped at n_wiki_results) so a
    # shorter result list does not raise an IndexError.
    for related_title in related_titles[:n_wiki_results]:
        url = "https://da.wikipedia.org/w/api.php?action=query&prop=extracts&exintro&titles={}&format=json&redirects".format(urllib.parse.quote_plus(related_title.replace(" ","_")))
        # A timeout keeps an unresponsive API from hanging the notebook forever.
        json_content = requests.get(url, timeout = request_timeout).json()
        # The response maps page ids to page records; a single title is queried,
        # so the first (only) record is taken.
        content_item = next(iter(json_content["query"]["pages"].values()))
        urls.append(url)
        # Fall back gracefully when the record lacks a title or an intro extract
        # instead of aborting the whole cell with a KeyError.
        titles.append(content_item.get("title", related_title))
        abstracts.append(content_item.get("extract", ""))
    content_items.append(HTML(value = "{}{}".format(related_titles, rsspedia.display_beautifully(titles, abstracts, urls))))

accordion = Accordion(children = content_items)

# Caption each accordion section with its 1-based position and the headline
for index, headline in enumerate(RSSdashboard.data_titles):
    accordion.set_title(index, "{}. {}".format(index + 1, headline))

display(accordion)

# Sentimentanalyse af statsministerens tale
Statsministerens tale ved Folketingets åbning gennem årene

In [None]:
# Danish AFINN sentiment scorer. Created in this cell because the scores below
# need it; relying on a definition in a later cell breaks "Restart & Run All".
afinn = Afinn(language = "da")

# speeches:            file name without ".txt" -> list of the file's lines
# speeches_sentiments: file name without ".txt" -> sum of the AFINN scores of its lines
speeches = {}
speeches_sentiments = {}
for filepath in glob.iglob('data/statsminister/*.txt'):
    speech_name = os.path.splitext(os.path.basename(filepath))[0]
    # The context manager closes each file as soon as it has been read.
    with open(filepath, mode="r", encoding="utf-8") as speech_file:
        speech_lines = [line.rstrip('\n') for line in speech_file]
    speeches[speech_name] = speech_lines
    speeches_sentiments[speech_name] = sum(afinn.score(line) for line in speech_lines)

if not speeches_sentiments:
    # Without this check the unpacking below fails with an unhelpful ValueError.
    raise FileNotFoundError(
        "No speech files matched 'data/statsminister/*.txt' relative to {}".format(os.getcwd())
    )

lists = sorted(speeches_sentiments.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(x, y, color="black")
ax.set(xlabel='Tid', ylabel='Sentiment', title="Statsministeren's tale sentiment")

ax.grid()
plt.show()

In [None]:
current_speech = "2018" # Key into `speeches` selecting the speech shown line by line
afinn = Afinn(language = "da")

# Score every line of the selected speech
speech_lines = speeches[current_speech]
scores = [afinn.score(line) for line in speech_lines]

# Dataframe
# NOTE(review): newer pandas releases deprecate -1 here in favour of None --
# confirm against the installed pandas version before changing it.
pd.set_option('display.max_colwidth', -1) # Used to display whole title (non-truncated)
speech_df = pd.DataFrame({"Line": speech_lines, "Score": scores}) # One row per line of the speech

# Highlight the positive and negative sentiments
def highlight(s):
    """Return one CSS background rule per cell of row `s`, chosen by the sign of `s.Score`.

    Green for a positive score, red for a negative score, white for zero.
    """
    if s.Score > 0:
        colour = '#AAFFAA'
    elif s.Score < 0:
        colour = '#FFAAAA'
    else:
        colour = '#FFFFFF'
    # One entry per column so the whole row gets the same background.
    return ['background-color: {}'.format(colour)] * len(s)

# `df` is the styled (Styler) view of `speech_df`, applied row by row.
df = speech_df.style.apply(highlight, axis=1)
display(df)

In [None]:
smoothing_constant = 0.3 # Weight of the current line's score
number_of_averaged_scores = 10 # Window: the current line plus up to 9 preceding lines
# NOTE(review): this evaluates to 1 - 0.03 = 0.97 for every preceding line. If
# (1 - smoothing_constant) / number_of_averaged_scores was intended, add the
# parentheses -- left unchanged here because it rescales the whole plot.
previous_weight = 1 - smoothing_constant / number_of_averaged_scores

# The first line has no history and is kept as-is. Every following line i is
# combined with the lines before it. Starting at index 1 keeps smoothed_scores[i]
# aligned with scores[i]; looping over range(len(scores) - 1) instead would count
# the first line twice and never include the last one.
smoothed_scores = [scores[0]]
for i in range(1, len(scores)):
    s = scores[i] * smoothing_constant
    for j in range(1, number_of_averaged_scores):
        if i - j >= 0: # Skip window positions that fall before the first line
            s = s + scores[i - j] * previous_weight
    smoothed_scores.append(s)

# Data for plotting
y = np.array(smoothed_scores)
x = np.array(range(1, len(smoothed_scores) + 1)) # 1-based line numbers
x_s = np.linspace(x.min(), x.max(), 1800) # 1800 evenly spaced points between the first and last line
# NOTE(review): scipy.interpolate.spline was removed in SciPy 1.3; on newer SciPy
# the import at the top of the notebook fails and make_interp_spline is the
# documented replacement -- confirm the pinned SciPy version.
y_s = spline(x, y, x_s)

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(x_s, y_s, color="black")
ax.set(xlabel='Tid', ylabel='Sentiment', title="Statsministeren's tale sentiment")

# Fill the area between the curve and zero: red where negative, green where positive
ax.fill_between(x_s, 0, y_s, where=y_s < 0, color='red')
ax.fill_between(x_s, 0, y_s, where=y_s > 0, color='green')

ax.grid()
plt.show()

In [None]:
# Create a dashboard instance. Binding the bare class (without the call
# parentheses) would make load_speeches() run without an object to operate on;
# this mirrors how RSSDashboard() is instantiated earlier in the notebook.
primeMinisterSpeechDashboard = PrimeMinisterSpeechDashboard()
primeMinisterSpeechDashboard.load_speeches()
# Render the dashboard's widget container in the notebook output.
display(primeMinisterSpeechDashboard.widget_box)