In [2]:
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import PorterStemmer
from google.cloud import storage
import textrank

import random
import re
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO
import base64

In [3]:
text = '''Hurricane Florence came by while I was working on statquest, dark clouds filled the sky, but that didn't stop statquest stats code. Hello, I'm just armor and welcome, statquest. Today. We're going to be talking about some machine learning, fundamentals, bias, and variance. And they're going to be clearly explained. Imagine we measured the weight and height of a bunch of mice and plotted the data on a graph. Light my skin to be short. And heavier, mice tend to be taller. But after a certain weight mice, don't get any taller just more obese. Given the stator, we would like to predictor Mouse height, given its weight. For example, if you told me your mouse weigh this much. Then we might predict that the mouse is this tall. Ideally, we would know the exact mathematical formula that describes the relationship between weight and height. But in this case, we don't know the formula. So we're going to use to machine learning methods to approximate this relationship. However, I'll leave the true relationship curve in the figure for reference. The first thing we do is put the data into two sets, one for training, the machine learning algorithms and one for testing them. The blue dots are the training set and the green dots are the testing set. Here's just the training set. The first machine learning algorithm that we will use is linearregression AKA least-squares. Linear regression, fits a straight line to the training set. Note, the straight line doesn't have the flexibility to accurately. Replicate The Ark in the true relationship. No matter how we try to fit the line. He will never curve. The straight-line will never capture the true relationship between weight and height. No matter how well we fit it to the training set. The inability for a machine learning method, like linearregression to capture. The true relationship is called by us because the straight line can't be curved like the true relationship. It has a relatively large amount of bias. Another machine learning method, might fit a squiggly line to the training set. The squiggly line is super flexible and hugs. The training set along the archive, the true relationship. Because the squiggly line can handle the Ark in the true relationship between weight and height. It has very little bias. We can compare how well the straight line in the squiggly line. Hit the training set by calculating their sums of squares. In other words, we measure the distances from the fit lines to the data squared them and add them up. They are squared, so that negative distances do not cancel out, positive distances. Notice how the squiggly line fits the data. So well that the distance is between the line and the data are all 0. In the contest to see whether the straight-line fits the training set better than the squiggly line. The squiggly line winds. But remember so far, we've only calculated the sums of squares for the training set. We also have a testing set. Now, let's calculate the sums of squares for the testing set. In the contest to see whether the straight-line fits the testing set better than the squiggly line. The straight line winds. Even though the squiggly line did a great job hitting the training set. It did a terrible job, hitting the testing set. In machine learning lingo, the difference in fits between data sets is called variance. The squiggly line has lobias since it is flexible and can adapt to the curve in the relationship between weight and height. But the squiggly line has high variability because it results in vastly different sums of squares for different data sets. In other words, it's hard to predict how well the squiggly line will perform with future data sets. It might do well sometimes and other times it might do terribly. In contrast, the straight line has relatively High by us since it cannot capture the curve in the relationship between weight and height, but the straight line has relatively low variance because the sums of squares are very similar for different data sets. In other words, the straight-line might only give good predictions and not grape predictions, but they will be consistently. Good predictions, bam. Oh, no, terminology alert, because the squiggly line fits the training set really well, but not the testing set. We say that the squiggly line is over fit in machine learning. The ideal algorithm has Low by us and can accurately model the true relationship. And it has low variability by producing consistent predictions across different data sets. This is down by finding The Sweet Spot between a simple model and a complex model. Oh, no, another terminology Alert 3. Commonly used methods for finding the Sweet Spot between simple and complicated. Models are regular ization boost and a bagging. The stats Quest on random Forest. Show an example of bagging in action and we'll talk about regularization and posting in future. Statquest double bam. Gray, we've made it to the end of another exciting. Statquest. If you like this data Quest and want to see more. Please subscribe. And if you want to support statquest, well, please consider buying one or two of my original songs. All right, until next time Quest on.'''

In [4]:
stopwords_path = '../../text/stop_words_english.txt'
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stop_words = f.readlines()
    stop_words = [word.strip() for word in stop_words]
    
keywords = textrank.get_keywords(text, word_num=20, 
#                                  stopwords_list=stop_words
                                )
pro_keywords = textrank.postprocess_keywords(keywords)
keywords_only = [word for (word, _) in pro_keywords]
print(keywords)
print(pro_keywords)
print(keywords_only)

[('statquest', 0.5071), ('stats', 0.5045), ('regression', 0.483), ('bias', 0.4503), ('variance', 0.4373), ('variability', 0.4289), ('data', 0.3263), ('learning', 0.31), ('mathematical', 0.3032), ('stator', 0.2892), ('algorithms', 0.2772), ('plotted', 0.2731), ('linearregression', 0.2714), ('algorithm', 0.2625), ('predict', 0.2561), ('predictor', 0.2544), ('hurricane', 0.2524), ('calculated', 0.2515), ('random', 0.2487), ('accurately', 0.2475)]
[('statquest', 0.5071), ('stats', 0.5045), ('regression', 0.483), ('bias', 0.4503), ('variance', 0.4373), ('variability', 0.4289), ('data', 0.3263), ('learning', 0.31), ('mathematical', 0.3032), ('stator', 0.2892)]
['statquest', 'stats', 'regression', 'bias', 'variance', 'variability', 'data', 'learning', 'mathematical', 'stator']


In [7]:
stemmer = PorterStemmer()
keywords_stem = [stemmer.stem(word) for word in keywords_only]
print(keywords_stem)

['statquest', 'stat', 'regress', 'bia', 'varianc', 'variabl', 'data', 'learn', 'mathemat', 'stator']


# Keyword Highlighting
script 내에서 keyword와 일치 또는 품사만 다른 단어가 있으면 highlighting 표시

In [None]:
words_list_text = text.split()
for idx, word_in_text in enumerate(words_list_text):
    word_stem = stemmer.stem(word_in_text)
    if word_stem in keywords_stem:
        words_list_text[idx] = '<span>' + word_in_text + '</span>'
        
text = ' '.join(words_list_text)

In [8]:
text

"Hurricane Florence came by while I was working on statquest, dark clouds filled the sky, but that didn't stop <span>statquest</span> <span>stats</span> code. Hello, I'm just armor and welcome, statquest. Today. We're going to be talking about some machine learning, fundamentals, bias, and variance. And they're going to be clearly explained. Imagine we measured the weight and height of a bunch of mice and plotted the <span>data</span> on a graph. Light my skin to be short. And heavier, mice tend to be taller. But after a certain weight mice, don't get any taller just more obese. Given the stator, we would like to predictor Mouse height, given its weight. For example, if you told me your mouse weigh this much. Then we might predict that the mouse is this tall. Ideally, we would know the exact <span>mathematical</span> formula that describes the relationship between weight and height. But in this case, we don't know the formula. So we're going to use to machine <span>learning</span> meth

# Keyword Timestamps
word timestamp 딕셔너리 돌다가 word가 keyword와 일치 또는 품사만 다른 단어라면 start_time 기록

In [26]:
timestamps = [{'word':'that', 'start_time':0, 'end_time':0.1}, 
              {'word':'didn\'t', 'start_time':0.2, 'end_time':0.3}, 
              {'word':'statquest', 'start_time':0.4, 'end_time':0.5}]
keywords_only = ['statquest']
keywords_stem = [stemmer.stem(word) for word in keywords_only]

In [27]:
keyword_timestamps = {keyword:[] for keyword in keywords_only}
for word_dict in timestamps:
    word_stem = stemmer.stem(word_dict['word'])
    if word_stem in keywords_stem:
        keyword_ori = keywords_only[keywords_stem.index(word_stem)]
        keyword_timestamps[keyword_ori].append(word_dict['start_time'])

In [28]:
keyword_timestamps

{'statquest': [0.4]}