In [None]:
# Check that the required libraries are installed, and install any that are missing.

import importlib
import subprocess
import sys

# One row per library:
#   (module name to import, package name for pip or None, name used in the messages)
# urllib ships with Python itself, so it has no pip package to fall back on.
REQUIRED_LIBRARIES = [
    ("requests", "requests", "requests"),
    ("pandas", "pandas", "Pandas"),
    ("bs4", "beautifulsoup4", "BeautifulSoup"),
    ("urllib", None, "Urllib"),
    ("wikipediaapi", "wikipedia-api", "wikipedia api"),
    ("networkx", "networkx", "networkx"),
]


def _import_or_install(module_name, pip_name, label):
    """Import ``module_name``, installing ``pip_name`` first if the import fails.

    Args:
        module_name: Name passed to ``importlib.import_module``.
        pip_name: Package name handed to ``pip install``, or ``None`` when the
            module belongs to the standard library and cannot be installed.
        label: Human-readable library name used in the printed messages.

    Returns:
        The imported module, or ``None`` if it could not be imported.
    """
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        # Only a failed import triggers an install; any other error is allowed
        # to surface instead of being hidden by a blanket ``except``.
        module = None

    if module is not None:
        print(f"{label} library has been imported")
        return module

    if pip_name is None:
        print(f"{label} library not found. It is part of the Python standard library and cannot be installed with pip. Please check your Python installation")
        return None

    print(f"{label} library not found. Installing...")
    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook is actually running in.
    pip_run = subprocess.run(
        [sys.executable, "-m", "pip", "install", pip_name],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    # Let the import system notice packages installed after the kernel started.
    importlib.invalidate_caches()

    try:
        return importlib.import_module(module_name)
    except ImportError:
        print(f"Something went wrong in the installation of the {label} library. Please check your internet connection and consult output from the installation below")
        print(pip_run.stdout)
        return None


for _module_name, _pip_name, _label in REQUIRED_LIBRARIES:
    _module = _import_or_install(_module_name, _pip_name, _label)
    if _module is not None:
        # Bind the module under its own name, exactly as ``import <name>`` would.
        globals()[_module_name] = _module


## Konstruer et corpus af tekster 

Før vi kan øve os i forskellige teknikker til tekstanalyse, skal vi have et corpus (datasæt) at arbejde på. I nedenstående eksempel genbruger vi scraperen fra lektion 9, der fandt artikler fra infoboxen om machine learning på Wikipedia. Denne gang lader vi scraperen hente selve brødteksten fra siderne.

In [None]:
# Scrape texts from the 'Machine Learning and Data Science' link collection on Wikipedia
import wikipediaapi
import urllib.request
from urllib.parse import unquote
from bs4 import BeautifulSoup

# Read the page inside a context manager so the HTTP response is always closed,
# and name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/Data_science") as html_doc:
    soup = BeautifulSoup(html_doc, 'html.parser')

corpus = []
pages = []

table = soup.find('table', attrs={'class':'vertical-navbox nowraplinks'})
if table is None:
    # Fail with a readable message instead of an AttributeError on the next line.
    raise RuntimeError(
        "Could not find a table with class 'vertical-navbox nowraplinks' on the "
        "Data_science page; the page markup may have changed."
    )
table_body = table.find('tbody')  # not used below; kept so the name stays defined

for link in table.find_all('a'):
    href = link.get('href')
    # Anchors without an href return None, which would make the `in` test raise.
    if href and '/wiki/' in href:
        # hrefs are percent-encoded; decode them so they read as page titles.
        pages.append(unquote(href.split('/wiki/')[1]))


# NOTE(review): newer wikipedia-api releases appear to require a user_agent
# argument here - confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)


print("Collecting text from "+str(len(pages))+" pages...")

# One lower-cased text per collected page title, in the same order as `pages`.
for page in pages:
    p_wiki = wiki_wiki.page(page)
    page_text = p_wiki.text.lower()
    corpus.append(page_text)

print('done')


In [None]:
# corpus is now a list of texts that can be explored one at a time;
# here we display the third document (index 2)
corpus[2]

## Tokeniser teksterne
For at kunne analysere corpus bryder vi først tekstdokumenterne ned i enkelte ord (tokens). Det producerer en oversigt over alle de ord, der findes i corpus, hvor mange gange de findes, og hvor mange gange de findes i hvert enkelt dokument.

In [None]:
# Import the libraries needed for tokenization and analysis

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
try:
    from sklearn.feature_extraction import stop_words
except ImportError:
    # The public `stop_words` module is no longer importable in newer
    # scikit-learn releases; fall back to the private module so that the name
    # `stop_words` stays defined and the cell still runs on a fresh kernel.
    from sklearn.feature_extraction import _stop_words as stop_words
import numpy as np
import json
import warnings
import pandas as pd

# Silence all warnings for the rest of the notebook. Note that this also hides
# deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')




In [None]:
# Run the tokenization with a plain count of the words.

count_vectorizer = CountVectorizer(stop_words= 'english') # use a list of English stop words
# Fit the vectorizer on the corpus and keep the transformed result in X_count
X_count = count_vectorizer.fit_transform(corpus)



In [None]:
# Explore the result: display the fitted vectorizer's vocabulary_ attribute
count_vectorizer.vocabulary_

In [None]:
# Explore the result at document level in a pandas DataFrame
pd.set_option('display.max_columns', 100)

# Newer scikit-learn releases provide get_feature_names_out() and no longer have
# get_feature_names(); use whichever one the installed version offers.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_terms = list(count_vectorizer.get_feature_names_out())
else:
    count_terms = list(count_vectorizer.get_feature_names())

count_matrix = X_count.toarray()

# One row per document, one column per term. Building the frame directly from
# the matrix keeps the counts numeric instead of forcing every column to object
# dtype through a header row. The index starts at 1, as it did before.
df_count = pd.DataFrame(
    count_matrix,
    columns=count_terms,
    index=range(1, count_matrix.shape[0] + 1),
)

df_count.head()

## Byg et co-word netværk
For at kunne undersøge, hvad det er for et sprog, der karakteriserer de forskellige dokumenter, kan vi producere et co-word netværk. Det er et netværk af ord, der er forbundet til hinanden, når de optræder sammen. For at netværket ikke skal blive for stort, sætter vi nogle minimumskriterier for, hvor mange gange et ord skal optræde i et dokument for at komme i betragtning, og hvor mange dokumenter to ord skal samforekomme i, for at deres forbindelse kan komme med.

In [None]:
# Use the networkx library to build a co-word network file from the raw counts.

import networkx as nx
from collections import Counter
from itertools import combinations


min_occ = 5  # minimum number of times a term must occur in a document to be considered
min_edge_weight = 5  # minimum number of documents two terms must co-occur in to be connected

# For every pair of terms, count the documents in which both terms reach min_occ.
# found_terms always follows the column order, so combinations() yields each
# pair in one fixed (source, target) orientation. Keying the counter on that
# tuple keeps a pair's total in a single entry; a nested dict keyed on whichever
# term was inserted first can split the total across two entries.
pair_counts = Counter()
for _, row in df_count.iterrows():
    found_terms = [term for term, value in row.items() if value >= min_occ]
    pair_counts.update(combinations(found_terms, 2))

# Keep only the pairs that co-occur in enough documents.
edge_list = [
    (source, target, {'weight': weight})
    for (source, target), weight in pair_counts.items()
    if weight >= min_edge_weight
]

G = nx.Graph()
G.add_edges_from(edge_list)

nx.write_gexf(G, "CoWordNET_countvectorizer_minocc"+str(min_occ)+"_minedgeweight"+str(min_edge_weight)+".gexf")

#### Lad os prøve at vægte ordenes væsentlighed med TF-IDF i stedet for simpel optælling

In [None]:
# Run the tokenization with TF-IDF weighting

tfidf_vectorizer = TfidfVectorizer(stop_words= 'english')
# Fit the vectorizer on the corpus and keep the transformed result in X_tfidf
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

In [None]:
# Explore the result at document level in a pandas DataFrame
pd.set_option('display.max_columns', 100)

# Newer scikit-learn releases provide get_feature_names_out() and no longer have
# get_feature_names(); use whichever one the installed version offers.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_terms = list(tfidf_vectorizer.get_feature_names())

tfidf_matrix = X_tfidf.toarray()

# One row per document, one column per term. Building the frame directly from
# the matrix keeps the weights as floats instead of forcing every column to
# object dtype through a header row. The index starts at 1, as it did before.
df_tfidf = pd.DataFrame(
    tfidf_matrix,
    columns=tfidf_terms,
    index=range(1, tfidf_matrix.shape[0] + 1),
)

df_tfidf.head()

In [None]:
# Build a co-word network again, this time based on the TF-IDF weighting

import networkx as nx
from collections import Counter
from itertools import combinations


min_occ = 0.03  # minimum TF-IDF weight a term must have in a document to be considered
min_edge_weight = 5  # minimum number of documents two terms must co-occur in to be connected

# For every pair of terms, count the documents in which both terms reach min_occ.
# found_terms always follows the column order, so combinations() yields each
# pair in one fixed (source, target) orientation. Keying the counter on that
# tuple keeps a pair's total in a single entry; a nested dict keyed on whichever
# term was inserted first can split the total across two entries.
pair_counts = Counter()
for _, row in df_tfidf.iterrows():
    found_terms = [term for term, value in row.items() if value >= min_occ]
    pair_counts.update(combinations(found_terms, 2))

# Keep only the pairs that co-occur in enough documents.
edge_list = [
    (source, target, {'weight': weight})
    for (source, target), weight in pair_counts.items()
    if weight >= min_edge_weight
]

G = nx.Graph()
G.add_edges_from(edge_list)

nx.write_gexf(G, "CoWordNET_tfidifvectorizer_minocc"+str(min_occ)+"_minedgeweight"+str(min_edge_weight)+".gexf")


In [None]:
# Same network, but now with manual cleaning of words

import networkx as nx
from collections import Counter
from itertools import combinations

cleaning = ['displaystyle']  # terms to leave out of the network entirely

min_occ = 0.03  # minimum TF-IDF weight a term must have in a document to be considered
min_edge_weight = 5  # minimum number of documents two terms must co-occur in to be connected

# For every pair of terms, count the documents in which both terms reach min_occ,
# skipping the terms listed in `cleaning`. found_terms always follows the column
# order, so combinations() yields each pair in one fixed (source, target)
# orientation. Keying the counter on that tuple keeps a pair's total in a single
# entry; a nested dict keyed on whichever term was inserted first can split the
# total across two entries.
pair_counts = Counter()
for _, row in df_tfidf.iterrows():
    found_terms = [
        term
        for term, value in row.items()
        if term not in cleaning and value >= min_occ
    ]
    pair_counts.update(combinations(found_terms, 2))

# Keep only the pairs that co-occur in enough documents.
edge_list = [
    (source, target, {'weight': weight})
    for (source, target), weight in pair_counts.items()
    if weight >= min_edge_weight
]

G = nx.Graph()
G.add_edges_from(edge_list)

# NOTE: this is the same file name the uncleaned TF-IDF network is written to,
# so with identical thresholds this cell replaces that file.
nx.write_gexf(G, "CoWordNET_tfidifvectorizer_minocc"+str(min_occ)+"_minedgeweight"+str(min_edge_weight)+".gexf")


## Forsøg med tekstanalyse udover tokenization
Hvis vi vil lave mere end basal tokenisering, er vi nødt til at importere et decideret NLP bibliotek. Det giver os mulighed for at lave Part of Speech Tagging, Named Entity Recognition og Sentimentanalyse

In [None]:
# Make sure the stanza NLP library is available, installing it if necessary.
import importlib
import subprocess
import sys

try:
    import stanza
    print("stanza library has been imported")
except ImportError:
    # Only a failed import triggers an install; other errors are left to surface.
    print("stanza library not found. Installing...")
    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook is actually running in.
    pip_run = subprocess.run(
        [sys.executable, "-m", "pip", "install", "stanza"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    # Let the import system notice packages installed after the kernel started.
    importlib.invalidate_caches()
    try:
        import stanza
    except ImportError:
        print("Something went wrong in the installation of the stanza library. Please check your internet connection and consult output from the installation below")
        print(pip_run.stdout)


In [None]:
import stanza

# Model download is disabled here; uncomment the line below to download the
# English resources.
# NOTE(review): presumably needed once per environment before a Pipeline can be
# built - confirm against the installed stanza version.
#stanza.download('en')

In [None]:
# Build an English stanza pipeline with its default processors.
# `nlp` is reassigned with task-specific processors in the cells that follow.
nlp = stanza.Pipeline('en')

In [None]:

# Part-of-speech tagging of the first document: print one line per word with
# its universal POS tag, treebank-specific POS tag and morphological features.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos')
doc = nlp(corpus[0])

pos_lines = []
for sent in doc.sentences:
    for word in sent.words:
        # Words without morphological features are shown with an underscore.
        feats = word.feats or "_"
        pos_lines.append(f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {feats}')

print('\n'.join(pos_lines))

In [None]:

# Named entity recognition on the first document: print one line per entity
# with its text and entity type.
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp(corpus[0])

entity_lines = [f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents]
print('\n'.join(entity_lines))

In [None]:
# Sentence-level sentiment analysis of the first document.

# Labels for the sentiment codes handled in this notebook.
SENTIMENT_LABELS = {0: 'NEGATIVE', 1: 'NEUTRAL', 2: 'POSITIVE'}

nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')
doc = nlp(corpus[0])

for sentence in doc.sentences:
    # Look the label up per sentence so an unexpected code is reported as
    # UNKNOWN rather than reusing the previous sentence's label (or raising a
    # NameError on the first sentence).
    sentiment = SENTIMENT_LABELS.get(sentence.sentiment, 'UNKNOWN')
    print(sentiment, sentence.text)