# Hyperlink Generation
This notebook generates Wikipedia hyperlinks for the keywords identified in the transcript dataset, using the `wikipedia` Python library.

In [1]:
import ast
import os
import re

import pandas as pd
import spacy
import wikipedia

## Input Course URL
The course id is the part of the URL that follows `/learn/`; it appears in the hyperlink of any page in the course.

In [2]:
# URL of any page that belongs to the Coursera course to process.
url = 'https://www.coursera.org/learn/siads697698/lecture/3vwIb/how-to-do-a-standup'

# The course id is the path segment right after "/learn/". A raw string keeps
# the regex escapes intact, the dot is escaped so it only matches a literal
# ".", and "-" is allowed because course slugs may be hyphenated.
course_match = re.search(r'(?<=coursera\.org/learn/)([\w-]+)', url)
if course_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError("Could not find a course id in URL: {}".format(url))
course = course_match.group(0)

In [3]:
# Display the extracted course id to confirm the URL was parsed as expected.
course

'siads697698'

## Load Directory

In [4]:
# Check whether a hyperlink dataset for this course already exists.
# The folder is addressed relative to the notebook, like the cells that read
# from and write to ./intermediate_data, so the check is not tied to one
# user's machine. A missing folder is treated as "nothing saved yet".
intermediate_dir = './intermediate_data'
directory = os.listdir(intermediate_dir) if os.path.isdir(intermediate_dir) else []

# The save cell of this notebook writes "..._summaries_keywords_urls"; the
# "..._hyperlinks" marker is still accepted in case earlier runs used it.
existing_markers = (
    '{}_summaries_keywords_urls'.format(course),
    '{}_summaries_keywords_hyperlinks'.format(course),
)

# `new` stays True when no matching file is found for the course.
new = not any(marker in file_name
              for file_name in directory
              for marker in existing_markers)
if not new:
    print("Course Already In Directory")

## Read in Transcript DataFrame

In [9]:
# Load the per-lecture summaries/keywords table for the selected course and
# discard the 'Unnamed: 0' column that comes along with the CSV.
transcripts_csv = "./intermediate_data/transcripts_{}_summaries_keywords.csv".format(course)
df = pd.read_csv(transcripts_csv).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,course_id,video_title,transcripts,length,summary,keywords
0,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,We'll see if anybody is joining us today. What...,20293,"Because at first time, I was thinking about us...","['git tutorial today', 'terminal', 'analysis s..."
1,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"I'm going to do this, Git log.one line and tha...",20294,"In this branch, let's create a new file and ca...","['git log.one line', 'commits', 'dvc repository']"
2,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"Well, not sure if anybody is joining this morn...",21840,"If that doesn't get you exactly what you want,...","['license file', 'team meeting', 'office hour'..."
3,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"now. Cool. There's lots of stuff here. Wow, l...",21840,If you're not very comfortable doing terminal ...,"['github', 'make dataset file', 'folder struct..."
4,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"Hello, nice to meet you. >> Nice to meet you t...",24317,">> Yeah, you can do that too, so today, I'm go...","['jupiter notebook experience', 'gpu', 'course..."


## Generate URLs for a Single Transcript

In [10]:
# Keyword string (still a stringified list at this point) and summary of one
# sample lecture, taken from row position 10.
keywords = df['keywords'].iloc[10] #7
summary = df['summary'].iloc[10] #7

# Reference texts fetched from Wikipedia. `data_science` is appended to the
# lecture summary when candidate pages are scored further down;
# `online_learning_in_higher_education` is not used in the visible cells.
data_science = wikipedia.summary('Data_science')
online_learning_in_higher_education = wikipedia.summary('Online_learning_in_higher_education')

def clean_alt_list(list_):
    """Turn a stringified Python list of keywords back into a list of strings.

    The keywords column stores values such as ``"['git', 'terminal']"``. The
    text is parsed with ``ast.literal_eval`` rather than by swapping quotes
    for a sentinel character, so keywords that contain a capital "X" or that
    are wrapped in double quotes survive intact.

    Args:
        list_: the stringified list. A real list/tuple is accepted as well;
            any other non-string value (e.g. NaN from pandas) yields ``[]``.

    Returns:
        list of str: the individual keywords, in their original order.
    """
    if isinstance(list_, (list, tuple)):
        return [str(item) for item in list_]
    if not isinstance(list_, str):
        # Missing values arrive as float NaN / None: there is nothing to parse.
        return []

    text = list_.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Fallback for text that is not a valid list literal: drop the brackets
    # and split on commas, removing surrounding quotes and empty fragments.
    pieces = (part.strip().strip('\'"') for part in text.strip('[]').split(','))
    return [piece for piece in pieces if piece]
# Replace the stringified keyword list of the sample lecture with a real list.
keywords = clean_alt_list(keywords)

## Define get_options Function

In [12]:
def get_options(keyword):
    """Return candidate Wikipedia page titles for a keyword.

    Args:
        keyword: the search term, either as a plain string or wrapped in a
            list/tuple (existing callers pass a one-element list). The parts
            of a list are joined with spaces.

    Returns:
        list: ``[term]`` when Wikipedia resolves the term to a page, the
        disambiguation options when the term is ambiguous, the search
        results when no page matches, and ``[]`` for an empty term.
    """
    # Normalise to one string. Calling list() on a bare string would explode
    # it into single characters, so strings are handled explicitly.
    if isinstance(keyword, (list, tuple)):
        term = " ".join(str(part) for part in keyword)
    else:
        term = str(keyword)

    if not term.strip():
        return []

    try:
        # Only the lookup outcome matters here; the page itself is not kept.
        wikipedia.page(term)
    except wikipedia.exceptions.DisambiguationError as e:
        return list(e.options)
    except wikipedia.exceptions.PageError:
        return wikipedia.search(term)
    return [term]


## Define Similarity Function

In [13]:
# Load the spaCy English pipeline once; cosine_similarity() below reuses it.
# NOTE(review): the small model presumably ships without static word vectors,
# which would make .similarity() scores unreliable — confirm, and consider a
# model with vectors (e.g. en_core_web_md) if link quality matters.
nlp=spacy.load('en_core_web_sm')

def cosine_similarity(doc1,doc2):
    """Return the spaCy similarity score between two pieces of text.

    Both texts are run through the notebook-level ``nlp`` pipeline and the
    first document's ``similarity`` to the second is returned.
    """
    first_doc, second_doc = nlp(doc1), nlp(doc2)
    return first_doc.similarity(second_doc)

## Define Option Evaluation Function

In [14]:
def select_option(options, keyword): #options is a list
    """Score candidate Wikipedia titles against a reference text.

    Args:
        options: list of candidate page titles.
        keyword: text each candidate's Wikipedia summary is compared with
            (callers pass a lecture summary plus reference text).

    Returns:
        * the sole candidate itself when ``options`` has exactly one entry;
        * a non-empty list of ``(option, score)`` tuples when several
          candidates were given and at least one could be scored;
        * ``None`` when ``options`` is empty or no candidate could be scored.
          An empty list is never returned, so callers can safely index the
          best-scoring entry.
    """
    if not options:
        print("***OPTION***", options)
        return None
    if len(options) == 1:
        # The only result will be used as searching query.
        return options[0]

    scored = []
    seen = set()
    for option in options:
        # Callers concatenate several candidate lists; skip repeats so the
        # same page is not fetched and scored twice.
        if option in seen:
            continue
        seen.add(option)
        try:
            score = cosine_similarity(wikipedia.summary(option), keyword)
        except Exception:
            # Best effort: a candidate whose summary cannot be fetched or
            # scored is dropped. Exception (not a bare except) keeps
            # KeyboardInterrupt and SystemExit working.
            continue
        scored.append((option, score))
    return scored or None

## Define URL Query Function

In [15]:
def create_url(query):
    """Return the URL of the Wikipedia page that ``query`` resolves to."""
    return wikipedia.page(query).url

## Get Query for Top 5

In [16]:
# Resolve every keyword of the sample lecture to its best Wikipedia title.
# Candidates are scored against the lecture summary plus the data-science
# reference text.
reference_text = summary + data_science

query_list = []
for word in keywords:
    # get_options is called with the keyword wrapped in a list.
    options = get_options([word])
    if len(options) > 1:
        # Several candidates: widen the pool with the underscore variant.
        options = options + get_options([word+'_'])

    selected = select_option(options, reference_text)
    if isinstance(selected, list):
        # A list holds (title, score) pairs: keep the best title. An empty
        # list means nothing could be scored, so there is nothing to pick.
        selected = max(selected, key=lambda x: x[1])[0] if selected else None
    if selected is not None:
        # Unresolved keywords are skipped: a None entry would make the
        # Wikipedia lookups in the next cell fail.
        query_list.append(selected)
query_list

  import sys


['github readme document',
 'Time-based one-time password',
 'PATH Biobank',
 'basic installation notes',
 'Knowledge management software']

## Get URLs for Top 3

In [17]:
# Look up the page URL for every selected query and rank the queries by how
# similar their Wikipedia summary is to the lecture summary + reference text.
url_list = []
for query in query_list:
    if query is None:
        # Unresolved keyword: there is no page to look up.
        continue
    sim = cosine_similarity(wikipedia.summary(query), summary + data_science)
    # Use a dedicated name so the course `url` defined at the top of the
    # notebook is not overwritten.
    page_url = create_url(query)
    url_list.append((query, page_url, sim))

# Keep the three most similar pages and drop the score from the result.
url_list.sort(key = lambda x: x[2], reverse=True)
url_list = [(title, link) for title, link, _score in url_list[:3]]
url_list


  import sys


[('github readme document', 'https://en.wikipedia.org/wiki/README'),
 ('Time-based one-time password',
  'https://en.wikipedia.org/wiki/One-time_password'),
 ('Knowledge management software',
  'https://en.wikipedia.org/wiki/Knowledge_management_software')]

## Apply to All Transcripts

In [18]:
def url_creation(lecture_keywords, lecture_summary, top_n=3):
    """Build the list of Wikipedia links for one lecture.

    Args:
        lecture_keywords: stringified list of keywords for the lecture.
            Values that are neither text nor a list/tuple (e.g. NaN) give ``[]``.
        lecture_summary: summary text of the lecture; a non-string value is
            treated as an empty summary.
        top_n: maximum number of links to return (default 3, the value that
            used to be hard-coded). ``None`` keeps every link.

    Returns:
        list of ``(query, url)`` tuples, ordered from the most to the least
        similar Wikipedia summary.
    """
    if not isinstance(lecture_keywords, (str, list, tuple)):
        return []
    summary_text = lecture_summary if isinstance(lecture_summary, str) else ""
    # Text every candidate page is compared with; built once per lecture.
    reference_text = summary_text + data_science

    query_list = []
    for word in clean_alt_list(lecture_keywords):
        # Gather all options (get_options is called with a one-element list).
        options = get_options([word])
        if len(options) > 1:
            options = options + get_options([word+'_'])
        elif len(options) < 1:
            print(word, options)
        # Select the best option.
        selected = select_option(options, reference_text)
        if isinstance(selected, list):
            # (title, score) pairs: keep the best title. An empty list means
            # nothing could be scored, so the keyword stays unresolved.
            selected = max(selected, key=lambda x: x[1])[0] if selected else None
        # Skip unresolved keywords and titles that were already selected.
        if selected is not None and selected not in query_list:
            query_list.append(selected)

    url_list = []
    for query in query_list:
        try:
            sim = cosine_similarity(wikipedia.summary(query), reference_text)
            url = create_url(query)
        except wikipedia.exceptions.WikipediaException as error:
            # One failing lookup must not abort the whole DataFrame pass.
            print("Skipping {!r}: {}".format(query, error))
            continue
        url_list.append((query, url, sim))

    url_list.sort(key=lambda x: x[2], reverse=True)
    return [(query, url) for query, url, _sim in url_list[:top_n]]

# Run url_creation on every row (keywords + summary) and store what it
# returns in a new 'url' column. This adds the column to `df` in place.
df['url'] = df.apply(lambda x: url_creation(x.keywords, x.summary), axis=1)





  lis = BeautifulSoup(html).find_all('li')
  import sys


biweekly asynchronous video stand []
***OPTION*** []


In [19]:
# Display the table, which now includes the generated 'url' column.
df

Unnamed: 0,course_id,video_title,transcripts,length,summary,keywords,url
0,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,We'll see if anybody is joining us today. What...,20293,"Because at first time, I was thinking about us...","['git tutorial today', 'terminal', 'analysis s...","[(analysis scripts, https://en.wikipedia.org/w..."
1,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"I'm going to do this, Git log.one line and tha...",20294,"In this branch, let's create a new file and ca...","['git log.one line', 'commits', 'dvc repository']","[(Nicotine replacement therapy, https://en.wik..."
2,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"Well, not sure if anybody is joining this morn...",21840,"If that doesn't get you exactly what you want,...","['license file', 'team meeting', 'office hour'...","[(project structure, https://en.wikipedia.org/..."
3,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"now. Cool. There's lots of stuff here. Wow, l...",21840,If you're not very comfortable doing terminal ...,"['github', 'make dataset file', 'folder struct...","[(Image viewer, https://en.wikipedia.org/wiki/..."
4,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,"Hello, nice to meet you. >> Nice to meet you t...",24317,">> Yeah, you can do that too, so today, I'm go...","['jupiter notebook experience', 'gpu', 'course...","[(Lead poisoning, https://en.wikipedia.org/wik..."
5,siads697698,recording-of-elle-o-brien-office-hours-siads-6...,model perhaps we don't have any rules like th...,24317,"And so the ideal model for each one could, it ...","['data science job market analysis', 'deep reg...","[(Study skill, https://en.wikipedia.org/wiki/S..."
6,siads697698,how-to-write-an-effective-blog-post,It's not enough to just do data science on you...,7725,It could be that you've done something that ot...,"['professional data science', 'voice', 'blog',...","[(editing_, https://en.wikipedia.org/wiki/Edit..."
7,siads697698,how-to-do-a-standup,I mentioned to you that we're going to do some...,3253,I mentioned to you that we're going to do some...,"['slack', 'screen recording tool', 'webcam', '...","[(Place identity, https://en.wikipedia.org/wik..."
8,siads697698,how-to-collaborate-with-a-team,One of the most unexpectedly challenging parts...,13279,I don't know what a really reliable and certai...,"['data science collaboration', 'dead code bloc...","[(data science collaboration, https://en.wikip..."
9,siads697698,capstone-overview,"Hi, welcome to the capstone. My name's Dr. Ell...",5438,Office hours are not required or expected of y...,"['other data science instructors', 'office hou...","[(other data science instructors, https://en.w..."


## Save Dataset
We save the transcript dataset as a csv file for further analysis.

In [20]:
# Write the table, including the new 'url' column, to the intermediate data folder.
# NOTE(review): the output name has no ".csv" extension although the input
# file read earlier does — confirm what downstream consumers expect before renaming.
# NOTE(review): the index is written too (to_csv default); presumably that is
# the source of the 'Unnamed: 0' column dropped when such files are read back — verify.
df.to_csv("./intermediate_data/transcripts_{}_summaries_keywords_urls".format(course))

## Next step
Once the dataset has been saved here, the information can then be used as part of a web app for better usability.

---

**Authors:** [Wei Zhou](mailto:weiwzhou@umich.edu), [Nick Capaldini](mailto:nickcaps@umich.edu), University of Michigan, August 21, 2022

---