In [1]:
from jupytercloud.library import get_secret
# Load the secret bundle by its id. Its 'jupytercloud_app_oauth' entry is read
# further down to build the Authorization header for the wiki API.
jupyter_cloud_secrets = get_secret("sec-01d83hjj2yehgykzn85n2h5pa5")

In [2]:
import io
import requests
from importlib.metadata import FastPath, Prepared, PathDistribution

# dist-packages directories scanned for each interpreter.
# Order is important, low priority goes first: get_packages() lets a directory
# listed later overwrite an entry with the same package name from an earlier one.
syspath2 = ['/usr/lib/python2.7/dist-packages', '/usr/local/lib/python2.7/dist-packages']
syspath3 = ['/usr/lib/python3/dist-packages/', '/usr/local/lib/python3.7/dist-packages/']

In [3]:
# I don't understand why this yields fewer packages than the implementation below
# def get_packages(paths):
#     packages = {}

#     context = DistributionFinder.Context(path=paths)
#     for package in MetadataPathFinder.find_distributions(context=context):
#         name = package.metadata['Name']
        
#         print(name)
        
#         packages[name] = package
        
#     return packages

def get_packages(paths):
    """Index the distributions found under the given directories by name.

    Each directory is searched with ``FastPath.search`` using a ``Prepared``
    query built from ``None``; every hit is wrapped in a ``PathDistribution``.

    Args:
        paths: Iterable of directory paths, lowest priority first.

    Returns:
        dict mapping the ``Name`` metadata field to its ``PathDistribution``.
        If a name is found more than once, the distribution found last is kept.
    """
    query = Prepared(None)

    distributions = (
        PathDistribution(location)
        for directory in paths
        for location in FastPath(directory).search(query)
    )

    # Later duplicates overwrite earlier ones, exactly like repeated assignment.
    return {dist.metadata['Name']: dist for dist in distributions}

In [4]:
# Index the installed distributions of each interpreter by package name.
py2_distributions = get_packages(syspath2)
py3_distributions = get_packages(syspath3)

In [5]:
from collections import Counter

def get_fields_count(packages):
    """Count how many distributions declare each metadata field.

    Args:
        packages: Mapping whose values expose ``metadata.keys()``.

    Returns:
        Plain dict ``{field name: number of distributions that have it}``.
    """
    # A field can be repeated inside one distribution (keys are not unique),
    # so each distribution's keys are collapsed to a set before counting.
    return dict(Counter(
        field
        for distribution in packages.values()
        for field in set(distribution.metadata.keys())
    ))

In [6]:
# Display how many Python 2 distributions declare each metadata field.
get_fields_count(py2_distributions)

{'Author-email': 241,
 'Version': 263,
 'Author': 246,
 'Name': 263,
 'Platform': 254,
 'Description': 29,
 'Home-page': 263,
 'Download-URL': 29,
 'Classifier': 214,
 'License': 262,
 'Metadata-Version': 263,
 'Summary': 263,
 'Keywords': 84,
 'Maintainer-email': 51,
 'Project-URL': 34,
 'Provides-Extra': 70,
 'Description-Content-Type': 45,
 'Requires-Python': 87,
 'Requires-Dist': 152,
 'Maintainer': 54,
 'Requires': 5,
 'Download-url': 1,
 'Provides': 4,
 'blas_opt_info': 1,
 'blas_mkl_info': 1}

In [7]:
# Display how many Python 3 distributions declare each metadata field.
get_fields_count(py3_distributions)

{'Author-email': 229,
 'Version': 269,
 'Author': 237,
 'Name': 269,
 'Platform': 259,
 'Description': 22,
 'Home-page': 269,
 'Classifier': 238,
 'License': 268,
 'Keywords': 116,
 'Metadata-Version': 269,
 'Summary': 269,
 'Provides-Extra': 96,
 'Project-URL': 48,
 'Maintainer': 70,
 'Maintainer-email': 65,
 'Requires-Python': 135,
 'Requires-Dist': 167,
 'Download-URL': 41,
 'Description-Content-Type': 69,
 'Requires': 3,
 'Download-url': 1,
 'Provides': 2,
 'blas_opt_info': 1,
 'blas_mkl_info': 1}

In [8]:
import re

# Pattern for an http(s) URL embedded in a metadata value. Package.parse_url
# searches a value with it and treats the text before the match as a label.
URL_RE = re.compile(r'https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

In [9]:
import re
import warnings
from dataclasses import dataclass
from typing import List, Tuple, Optional

@dataclass
class Package:
    """Documentation-oriented view of one installed distribution.

    Instances are normally built with :meth:`from_distribution`, which reads the
    fields from the distribution's metadata and its ``top_level.txt``.
    List-valued fields are ``None`` (never an empty list) when nothing usable
    was found.
    """

    name: str
    version: str
    distribution: PathDistribution
    top_levels: Optional[List[str]] = None
    summary: Optional[str] = None
    description: Optional[str] = None
    keywords: Optional[List[str]] = None
    # (heading, url) pairs
    urls: Optional[List[Tuple[str, str]]] = None
    emails: Optional[List[str]] = None

    @classmethod
    def parse_keywords(cls, metadata):
        """Split the ``Keywords`` field into a list of non-empty keywords.

        The field is split on commas when it contains one, otherwise on single
        spaces. Fragments that are empty after stripping (doubled or trailing
        separators) are dropped.

        Returns:
            List of keywords, or ``None`` when the field is absent or empty.
        """
        raw_keywords = cls.get_metadata_field(metadata, 'Keywords')
        if not raw_keywords:
            return None

        separator = ',' if ',' in raw_keywords else ' '
        stripped = (fragment.strip() for fragment in raw_keywords.split(separator))

        return [keyword for keyword in stripped if keyword] or None

    @staticmethod
    def get_top_levels(distribution):
        """Read the importable top-level names from ``top_level.txt``.

        Blank lines are ignored and names of 32 characters or more are skipped.
        If more than 10 names remain, the list is discarded altogether.

        Returns:
            List of names, or ``None`` when the file is missing or empty, nothing
            is left after filtering, or too many names remain.
        """
        top_level = distribution.read_text('top_level.txt')
        if not top_level:
            return None

        stripped = (line.strip() for line in top_level.split('\n'))
        top_levels = [name for name in stripped if name and len(name) < 32]

        if len(top_levels) > 10:
            return None

        return top_levels or None

    @staticmethod
    def parse_url(text):
        """Split a metadata value into the label preceding a URL and the URL.

        Returns:
            ``(label, url)`` where ``label`` is the text before the first
            ``URL_RE`` match with surrounding spaces and commas removed (it may
            be empty). ``(None, None)`` after a warning when no URL is found.
        """
        match = URL_RE.search(text)
        if not match:
            warnings.warn(f'bad url match {text}')
            return None, None

        pre_text = text[:match.start()]
        url = match.group(0)

        return pre_text.strip(' ,'), url

    @classmethod
    def parse_urls(cls, metadata):
        """Collect ``(heading, url)`` pairs from the URL-like metadata fields.

        A field qualifies when its name is ``Home-page`` or contains ``url``
        (case-insensitively) without containing ``download``. Blank values and
        the ``UNKNOWN`` placeholder are skipped.

        Returns:
            List of pairs, or ``None`` when no field qualifies.
        """
        result = []

        for key, value in metadata.items():
            lowered = key.lower()
            is_url_field = (
                ('url' in lowered and 'download' not in lowered)
                or key == 'Home-page'
            )
            if not is_url_field:
                continue

            value = value.strip()
            if not value or value == 'UNKNOWN':
                continue

            heading, url = cls.parse_url(value)

            # Fall back to the field name as heading and to the raw value as
            # URL when the value carries no label or no recognizable URL.
            result.append((heading or key.replace('-', ' '), url or value))

        return result or None

    @staticmethod
    def parse_emails(metadata):
        """Collect addresses from e-mail fields that mention ``yandex-team``.

        A field is used when its name contains ``email`` (case-insensitively)
        and its value contains ``yandex-team``; the value is then split on
        commas and spaces. Empty fragments are dropped and duplicates removed
        while keeping first-seen order, so the result is stable between runs.

        Returns:
            List of unique fragments, or ``None`` when there are none.
        """
        fragments = []

        for key, value in metadata.items():
            if 'email' in key.lower() and 'yandex-team' in value:
                fragments.extend(part.strip() for part in re.split(r'[, ]', value))

        # dict.fromkeys de-duplicates without losing the original order.
        unique = list(dict.fromkeys(part for part in fragments if part))

        return unique or None

    @staticmethod
    def get_metadata_field(metadata, field):
        """Return a stripped metadata value, or ``None`` when it is unusable.

        A missing field, an empty value and the ``UNKNOWN`` placeholder all
        yield ``None``.
        """
        value = metadata.get(field)
        if value:
            value = value.strip()
            if value == 'UNKNOWN':
                value = None

        return value or None

    @classmethod
    def from_distribution(cls, distribution):
        """Build a :class:`Package` from a distribution object.

        Args:
            distribution: Object exposing ``metadata`` and ``read_text()``, such
                as ``importlib.metadata.PathDistribution``.
        """
        metadata = distribution.metadata

        return cls(
            name=cls.get_metadata_field(metadata, 'Name'),
            version=cls.get_metadata_field(metadata, 'Version'),
            distribution=distribution,
            summary=cls.get_metadata_field(metadata, 'Summary'),
            description=cls.get_metadata_field(metadata, 'Description'),
            keywords=cls.parse_keywords(metadata),
            top_levels=cls.get_top_levels(distribution),
            urls=cls.parse_urls(metadata),
            emails=cls.parse_emails(metadata)
        )

In [10]:
# Convert every indexed distribution into a Package record, per interpreter.
py2_packages = [Package.from_distribution(distribution) for distribution in py2_distributions.values()]
py3_packages = [Package.from_distribution(distribution) for distribution in py3_distributions.values()]

In [11]:
class BaseFormatter:
    """Accumulates wiki text in an in-memory buffer.

    ``write*`` methods append to ``self.text``; ``code``, ``link`` and ``ulist``
    only build and return markup strings. Subclasses implement :meth:`format`.
    """

    def __init__(self):
        self.text = io.StringIO()

    def write(self, text, newline=True):
        """Append ``text`` to the buffer, followed by a newline unless disabled."""
        print(text, file=self.text, end='\n' if newline else '')

    def write_disclaimer(self):
        """Append the auto-generation notice, a timestamp and contact links."""
        # Imported here so the class does not depend on a module-level
        # ``import datetime`` that lives in a later notebook cell.
        import datetime

        self.write_line("Эта документация генерируется автоматически и ее не имеет смысла править.")
        self.write_line(f"Дата генерации: {datetime.datetime.now()}")
        self.write_line('Ссылки для связи:')
        self.write_line(self.ulist([
            self.link('jupyter.yandex-team.ru', 'https://jupyter.yandex-team.ru'),
            self.link('Telegram чат', 'https://nda.ya.ru/3UYxBK'),
            self.link('Slack канал', 'https://yndx-all.slack.com/archives/C01EQFCNPSA'),
            self.link('Startrek очередь', 'https://startrek.yandex-team.ru/JUPYTER')
        ]))

    def write_new_line(self):
        """Append an empty line."""
        self.write('')

    def write_line(self, text):
        """Append ``text`` followed by an empty line."""
        self.write(text)
        self.write_new_line()

    def write_heading(self, text, level):
        """Append a heading: ``level`` '=' characters, then ``text``, then an empty line."""
        self.write('=' * level + text)
        self.write_new_line()

    def code(self, text):
        """Return ``text`` wrapped in backticks."""
        return f'`{text}`'

    def link(self, text, url):
        """Return a ``[text](url)`` link."""
        return f'[{text}]({url})'

    def ulist(self, values):
        """Return the values as a bullet list, one ``* item`` per line."""
        return '\n'.join(f'* {v}' for v in values)

    def format(self):
        """Render the document and return it as a string.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # raising it would fail with TypeError instead.
        raise NotImplementedError

In [12]:
import itertools

class PackageFormatter(BaseFormatter):
    """Renders one package as a wiki section.

    :meth:`format` emits a level-2 heading, a one-row properties table and the
    summary. The other ``write_*`` methods render the same properties as
    separate labelled sections and are kept as an alternative layout.
    """

    def __init__(self, package):
        super().__init__()
        self.package = package

    def write_section(self, name, text=None):
        """Write a bold ``**name**:`` label, optionally followed by inline text."""
        suffix = f' {text}' if text else ''

        self.write(f'**{name}**:{suffix}')
        self.write_new_line()

    def write_version(self):
        """Write the version as inline code."""
        self.write_section('Версия', self.code(self.package.version))

    def write_keywords(self):
        """Write the keywords as a comma-separated line, if there are any."""
        if not self.package.keywords:
            return

        self.write_section('Ключевые слова', ', '.join(self.package.keywords))

    def write_top_levels(self):
        """Write the top-level names, unless they add nothing to the package name."""
        top_levels = self.get_top_levels()
        if not top_levels:
            return

        self.write_section('Пакеты', ', '.join(top_levels))

    def write_urls(self):
        """Write the package links as a bullet list, if there are any."""
        if not self.package.urls:
            return

        self.write_section('Ссылки')

        for url_name, url in self.package.urls:
            link = self.link(url_name, url)
            self.write(f'* {link}')

        self.write_new_line()

    def write_emails(self):
        """Write the contact addresses: inline for one, a bullet list for several."""
        # Empty entries are skipped, the same way write_table() skips them.
        emails = [
            self.link(email, f'mailto:{email}')
            for email in self.package.emails or ()
            if email
        ]
        if not emails:
            return

        if len(emails) == 1:
            self.write_section('Почта', emails[0])
        else:
            self.write_section('Почта')
            self.write(self.ulist(emails))
            # Close the list with an empty line, as write_urls() does.
            self.write_new_line()

    def write_summary(self):
        """Write the summary under its own label, if there is one."""
        if not self.package.summary:
            return

        self.write_section('Краткое описание')
        self.write(self.package.summary)
        self.write_new_line()

    def write_description(self):
        """Write the long description under its own label, if there is one."""
        if not self.package.description:
            return

        self.write_section('Описание')
        self.write(self.package.description)
        self.write_new_line()

    def get_top_levels(self):
        """Return the top-level names as inline code.

        Returns an empty list when there are no names or when the only name
        equals the package name.
        """
        top_levels = self.package.top_levels

        if not top_levels:
            return []
        if len(top_levels) == 1 and top_levels[0] == self.package.name:
            return []

        return [self.code(k) for k in top_levels]

    def write_table(self):
        """Write a table with one column per non-empty property.

        The version column is always present; keywords, top-level names, links
        and e-mails each get a column only when they have content. Multiple
        values of one property are joined with spaces into a single cell.
        """
        version = [self.code(self.package.version)]
        keywords = [self.code(k) for k in self.package.keywords or ()]
        top_levels = self.get_top_levels()
        urls = [self.link(url_name, url) for url_name, url in self.package.urls or ()]
        emails = [
            self.link(email, f'mailto:{email}')
            for email in self.package.emails or ()
            if email
        ]

        def join(values):
            # One cell holding all values, or None so the column is dropped.
            return [' '.join(values)] if values else None

        columns = [
            ('Версия', version),
            ('Ключевые слова', join(keywords)),
            ('Пакеты', join(top_levels)),
            ('Ссылки', join(urls)),
            ('Почта', join(emails)),
        ]

        columns = [(k, v) for k, v in columns if v]

        self.write('| {} |'.format(
            ' | '.join(k for k, v in columns)
        ))
        self.write('| {} |'.format(
            ' | '.join('---' for _ in columns)
        ))

        for line in itertools.zip_longest(
            *(v for k, v in columns),
            fillvalue=''
        ):
            self.write('| {} |'.format(
                ' | '.join(line)
            ))

    def format(self):
        """Render the package section and return the accumulated text."""
        self.write_heading(self.package.name, 2)
        self.write_table()
        self.write_summary()

        return self.text.getvalue()

In [13]:
import datetime

class AllPackagesFormatter(BaseFormatter):
    """Renders the full package list: a disclaimer, then one section per package."""

    def __init__(self, packages):
        super().__init__()
        self.packages = packages

    def format(self):
        """Return the whole document, packages ordered by case-insensitive name."""
        self.write_disclaimer()

        ordered = list(self.packages)
        ordered.sort(key=lambda entry: entry.name.lower())

        for entry in ordered:
            self.write(PackageFormatter(entry).format())
            self.write_new_line()

        return self.text.getvalue()

In [14]:
import json
import requests

# `base` is the wiki API endpoint that documents are posted to; `link_base` is
# only used to print a link to the uploaded page.
base = 'https://wiki-api.yandex-team.ru/_api/frontend'
link_base = 'https://wiki.yandex-team.ru'
# Headers sent with every upload; the OAuth token comes from the secret bundle
# loaded in the first cell.
headers = {
    'Authorization': 'OAuth ' + jupyter_cloud_secrets['jupytercloud_app_oauth'],
    'Content-Type': 'application/json'
}

def upload_doc(path, title, body, timeout=60):
    """Post a document to the wiki API and print a link to it.

    Args:
        path: Page path; appended to ``base`` for the request and to
            ``link_base`` for the printed link.
        title: Page title.
        body: Page text.
        timeout: Seconds to wait for the server, passed to ``requests.post``.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    data = {
        'title': title,
        'body': body,
    }
    response = requests.post(
        base + path,
        headers=headers,
        json=data,
        timeout=timeout,
    )

    response.raise_for_status()

    print('Doc uploaded:', link_base + path)

In [15]:
# Render the full Python 2 package list and upload it. The page path is reused
# by the keyword index below to link back to individual packages.
py2_full_doc = AllPackagesFormatter(py2_packages).format()
py2_full_doc_path = '/jupyter/kernels/py2/full'
upload_doc(py2_full_doc_path, 'Полный список библиотек в ядре Python 2', py2_full_doc)

Doc uploaded: https://wiki.yandex-team.ru/jupyter/kernels/py2/full


In [16]:
# Render the full Python 3 package list and upload it. The page path is reused
# by the keyword index below to link back to individual packages.
py3_full_doc = AllPackagesFormatter(py3_packages).format()
py3_full_doc_path = '/jupyter/kernels/py3/full'
upload_doc(py3_full_doc_path, 'Полный список библиотек в ядре Python 3', py3_full_doc)

Doc uploaded: https://wiki.yandex-team.ru/jupyter/kernels/py3/full


In [17]:
import pymorphy2

# Shared analyzer; KeywordSubjectIndexFormatter uses it to make the word
# 'вхождение' agree with a number.
morph = pymorphy2.MorphAnalyzer()

class KeywordSubjectIndexFormatter(BaseFormatter):
    """Renders a keyword index that links into the full package list page.

    The document has a list of all keywords with their entry counts, followed
    by one section per keyword listing the packages that declare it.
    """

    def __init__(self, packages, full_doc_path):
        super().__init__()
        self.keyword_index = {}
        self.full_doc_path = full_doc_path

        # First parse of the noun; inflected later to agree with each count.
        self.vhojdenia = morph.parse('вхождение')[0]

        # keyword -> packages declaring it, in the order they were given
        for package in packages:
            for keyword in package.keywords or []:
                self.keyword_index.setdefault(keyword, []).append(package)

    def entries_number_text(self, keyword):
        """Return the entry count for ``keyword`` followed by the agreeing noun form."""
        count = len(self.keyword_index[keyword])
        noun = self.vhojdenia.make_agree_with_number(count).word
        return f'{count} {noun}'

    def format(self):
        """Return the whole index, keywords ordered case-insensitively."""
        self.write_disclaimer()
        self.write_heading('Ключевые слова', 2)

        by_keyword = sorted(
            self.keyword_index.items(),
            key=lambda item: item[0].lower(),
        )

        # Keyword list: each entry links to an anchor made of the keyword
        # with its spaces removed.
        overview = []
        for keyword, _packages in by_keyword:
            anchor = '#' + keyword.replace(" ", "")
            counter = f' ({self.entries_number_text(keyword)})'
            overview.append(self.link(keyword, anchor) + counter)
        self.write_line(self.ulist(overview))

        self.write_heading('Указатель', 2)
        for keyword, packages in by_keyword:
            self.write_heading(keyword, 3)
            package_links = [
                self.link(entry.name, f'{self.full_doc_path}#{entry.name}')
                for entry in packages
            ]
            self.write_line(self.ulist(package_links))

        self.write_disclaimer()
        return self.text.getvalue()

In [18]:
# Render the Python 2 keyword index (linking into the full list page) and upload it.
py2_subject_index = KeywordSubjectIndexFormatter(py2_packages, py2_full_doc_path).format()
py2_keyword_index_path = '/jupyter/kernels/py2/keyword_index'
upload_doc(py2_keyword_index_path, 'Предметный указатель по ключевым словам для ядра Python 2', py2_subject_index)

Doc uploaded: https://wiki.yandex-team.ru/jupyter/kernels/py2/keyword_index


In [19]:
# Render the Python 3 keyword index (linking into the full list page) and upload it.
py3_subject_index = KeywordSubjectIndexFormatter(py3_packages, py3_full_doc_path).format()
py3_keyword_index_path = '/jupyter/kernels/py3/keyword_index'
upload_doc(py3_keyword_index_path, 'Предметный указатель по ключевым словам для ядра Python 3', py3_subject_index)

Doc uploaded: https://wiki.yandex-team.ru/jupyter/kernels/py3/keyword_index
