In [98]:
import re
import json
import yaml

from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd

import clipboard
import frontmatter

## Some prep work

Defininig some variables

In [50]:
dtc_website_dir = Path('c:/Users/alexe/git/datatalksclub.github.io')
data_dir = Path('./data')

Loading emojis:

In [17]:
df_emojis = pd.read_csv(dtc_website_dir / 'scripts' / 'emojis.csv')
emoji_map = dict(zip(df_emojis.code, df_emojis.emoji))

Fidning books without answers:

In [88]:
dtc_website_dir / '_books'
book_files = list((dtc_website_dir / '_books').glob('202*.md'))

In [166]:
no_archive = []

for book_file in book_files:
    if book_file.parts[-1] == '20210315-database-internals.md':
        continue

    post = frontmatter.load(book_file)

    # looking only at events that finished
    if datetime.today() < post['end']:
        continue

    if 'archive' not in post.keys():
        no_archive.append((book_file, post.to_dict()))

## Some helper functions

We'll need them for parsing slack dump

In [6]:
def repl_user_callback(match):
    user_id = match.group(1)
    user_name = users[user_id]
    return user_name['name']

user_pattern = re.compile(r'<@(.+?)>')
link_pattern_text = re.compile(r'<(http.+?)\|(.+?)>')
link_pattern = re.compile(r'<(http.+?)>')
emoji_pattern = re.compile(r':([-+0-9_a-z]+):(:[-+0-9_a-z]:)?')

def replace_emoji_callback(match):
    code = match.group(1)
    if code in emoji_map:
        return emoji_map[code]
    print('cannot find %s' % code)
    return ":%s:" % code

def prepare_text(text):
    text = text.replace('\xa0', ' ').replace('•', '-').replace('\n\n', '\n')
    text = user_pattern.sub(repl_user_callback, text)
    text = emoji_pattern.sub(replace_emoji_callback, text)
    text = link_pattern_text.sub(r'[\2](\1)', text)
    text = link_pattern.sub(r'[\1](\1)', text)
    return text

In [91]:
def load_docs(files):
    all_docs = []

    for f in files:
        with f.open(encoding="utf-8") as f_in:
            docs = json.load(f_in)
            all_docs.extend(docs)
    
    return all_docs

def clean_user(d):
    p = d['profile']
    name = p['display_name']
    if len(name) == 0:
        name = p['real_name']
    return {
        'name': name,
        'image': p['image_72']
    }

## Loading slack data

First, load the users:

In [32]:
with (data_dir / 'users.json').open(encoding="utf-8") as f_in:
    all_users = json.load(f_in)

In [33]:
users = {d['id']: clean_user(d) for d in all_users}

Now let's find all the json files in the dump of the book-of-the-week channel:

In [92]:
bow_data_dir = data_dir / 'book-of-the-week'
all_files = sorted(bow_data_dir.glob('*.json'))

And load all messages (we'll need them for making threads):

In [93]:
all_messages_docs = load_docs(all_files)

## Processing

Keep running this until run out of books =)

Now let's take one of the books:

In [362]:
book_file, book = no_archive.pop()
print(book_file.parts[-1])

IndexError: pop from empty list

In [356]:
start = book['start'] - timedelta(days=1)
end = start + timedelta(days=7)
print(start)
print(end)

question_files = []

for filename in all_files:
    date = datetime.strptime(filename.parts[-1], '%Y-%m-%d.json')
    if start <= date and date <= end:
        question_files.append(filename)
        
question_files

2021-08-29 00:00:00
2021-09-05 00:00:00


[WindowsPath('data/book-of-the-week/2021-08-29.json'),
 WindowsPath('data/book-of-the-week/2021-08-30.json'),
 WindowsPath('data/book-of-the-week/2021-08-31.json'),
 WindowsPath('data/book-of-the-week/2021-09-01.json'),
 WindowsPath('data/book-of-the-week/2021-09-02.json'),
 WindowsPath('data/book-of-the-week/2021-09-03.json'),
 WindowsPath('data/book-of-the-week/2021-09-04.json'),
 WindowsPath('data/book-of-the-week/2021-09-05.json')]

In [357]:
question_messages_docs = load_docs(question_files)

top_messages = [d for d in question_messages_docs if 'parent_user_id' not in d]
thread_replies = [d for d in all_messages_docs if 'parent_user_id' in d]
replies_idx = {(d['user'], d['ts']): d for d in all_messages_docs}

top_messages = [d for d in top_messages if d.get('subtype') not in ('thread_broadcast', 'channel_join')]


threads = []

for top_message in top_messages:
    user_id = top_message['user']
    if user_id == 'USLACKBOT':
        continue
    
    top_name = users[user_id]['name']
    
    if top_name == 'Francis Terence Amit':
        continue

    top_text = prepare_text(top_message['text']).strip()
    
    if 'Hello, everyone!' in top_text and 'The book of this week is' in top_text:
        continue
    
    if 'The lucky winners' in top_text:
        continue
    
    if 'Please send me your emails in DM' in top_text:
        continue
    
    replies = []

    for p in top_message.get('replies', []):
        reply_id = (p['user'], p['ts'])
        reply = replies_idx[reply_id]
        name = users[p['user']]['name']
        text = prepare_text(reply['text']).strip()

        replies.append({'name': name, 'text': text})

    thread = {
        'name': top_name,
        'text': top_text,
        'replies': replies
    }
    
    threads.append(thread)

In [358]:
yaml_snippet = yaml.dump({'archive': threads}, sort_keys=False)

In [359]:
print(yaml_snippet[:300])

archive:
- name: Alex S
  text: I wasn't sure how it's possible to read this book as it isn't published until
    October this year. Could you let us know, Alexey Grigorev?
  replies:
  - name: Alexey Grigorev
    text: "Probably you should ask Noah Gift about it \U0001F603 But you can read\
      \


In [360]:
clipboard.copy(yaml_snippet)

In [361]:
!code {book_file}