<a href="https://colab.research.google.com/github/CT-Cultures/Content/blob/master/RegOverview_Issue.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount drive and install requirements

In [None]:
# Mount Drive
import os
import sys
import gc

from google.colab import drive
drive.mount('/content/drive')

# Install requirements

In [None]:
# Install requirements in this cell,
# then restart runtime after initial installation

# fetch Content from github and install requirements
path_Content = "/content/drive/MyDrive/Github/Content" ###
if not os.path.exists(path_Content):
  !git clone https://github.com/CT-Cultures/Content.git {path_Content}
os.chdir(path_Content)
!pip install -r requirements.txt

path_Article = "/content/drive/Mydrive/Github/Article" ###
if not os.path.exists(path_Article):
  !git clone https://github.com/CT-Cultures/Article.git {path_Article}

!pip install tensorflow_text
!pip install --upgrade tensorflow_hub

In [None]:
# Install Chromedriver
# The three shell commands refresh the apt index, install the
# chromium-chromedriver package and copy the driver binary into /usr/bin.
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# NOTE(review): sys.path is Python's module search path; putting the path of
# the chromedriver executable on it presumably has no effect on Selenium
# (the binary was already copied to /usr/bin above) - confirm and drop.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [None]:
# Check Environment
import pandas as pd
import transformers
import ipywidgets as widgets
import nltk
# Report the versions of the packages the notebook depends on, next to the
# minimum version each one is expected to have.
print('pandas version: {}, (>= 1.3.2)'.format(pd.__version__)) # pd has to >= 1.3.2, restart runtime
print('transformers version: {}'.format(transformers.__version__))
#print('nltk version: {}, (>=3.3)'.format(nltk.__version__))
# Label fixed: the package is called "ipywidgets" (was printed as "ipywigets").
print('ipywidgets version: {}, (>=7.7.0)'.format(widgets.__version__))


!which python
!python --version
!nvidia-smi

# Import libraries and set path

In [None]:
# Set path
# Every location used by the notebook lives under one Drive folder, so the
# root is written once and all other paths are derived from it. The resulting
# values are identical to the previously hard-coded literals.
path_github = '/content/drive/MyDrive/Github'

# Article repository: working directory, CJK font file and image folder.
path_Article = path_wd = path_github + '/Article'
path_font = path_Article + '/fonts/STHUPO.TTF'
path_img = path_Article + '/img'

# Content repository and its per-source tool folders.
path_Content = path_github + '/Content'
path_ChinaFilm = path_Content + '/tools/sources/ChinaFilm'
path_NRTA = path_Content + '/tools/sources/NRTA'
path_ZGDYPW = path_Content + '/tools/sources/ZGDYPW'

os.chdir(path_wd)

In [None]:
#Load Libraries Global
import os
import datetime as dt
import re
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup

from selenium import webdriver

import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.font_manager as fm

from ipywidgets import interact, interactive, fixed, interact_manual
import textwrap

path_fonts = path_Article + '/fonts'
fontprop = fm.FontProperties(fname=path_fonts, size= 15)

font_dirs = [path_fonts, ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)
font_list = fm.createFontList(font_files)
for font in font_files:
  fm.fontManager.addfont(font)

plt.rcParams['figure.figsize'] = [15, 9]
mp.rcParams['font.family'] = ['Microsoft YaHei']

%matplotlib inline
print(mp.get_cachedir())

In [None]:
# Load Libraries Local
#%load_ext autoreload
#%reload_ext autoreload 2

# The project modules are imported by changing into the folder that contains
# them first, so the order of the chdir / import pairs below matters.
os.chdir(path_ChinaFilm)
from Record_Registration import Registration
# Chrome flags for running the browser without a display inside the
# Colab container.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize Registration Class
# NOTE(review): passing the driver name positionally depends on the installed
# Selenium version accepting it - confirm against the pinned requirements.
driver = webdriver.Chrome('chromedriver',options=chrome_options)
dy_reg = Registration(driver)

# import utils for making plots and wordclouds
os.chdir(path_wd)
%reload_ext autoreload
from generate import utils

# import tools for prediction
# `%autoreload 2` re-imports edited modules before each cell execution.
os.chdir(path_Content)
%reload_ext autoreload
%autoreload 2
import predict

# [Import Datasets]

### Update records

In [None]:
# Update ChinaFilm records to reflect latest changes
os.chdir(path_ChinaFilm)

# bypass update in colab, because the connection
# from google to ChinaFilm's server is too slow
#!python update_release.py
#!python update_registration.py

### Import Records

In [None]:
# Locations of the two ChinaFilm record files.
fp_registrations = path_ChinaFilm + '/records/contents_of_registrations.json'
fp_releases = path_ChinaFilm + '/records/contents_of_releases.json'

# Registrations: the file is read with the 'split' JSON orientation, then
# passed through the Registration helper to obtain the cleaned-up frame `df`.
df_ChinaFilm_reg = pd.read_json(fp_registrations, orient='split')
df = dy_reg.Refined_Records(df_ChinaFilm_reg)

# Releases: read with pandas' default JSON orientation.
df_ChinaFilm_release = pd.read_json(fp_releases)

In [None]:
# Load current issue (most recent one)
# The newest batch is the first row after sorting by publication date and
# batch name in descending order.
curr_issue_name = df.sort_values(['公示日期', '公示批次名称'], ascending=False)['公示批次名称'].iloc[0]
# `.copy()` gives df_curr its own data: later cells add and overwrite columns
# on it, which on a bare slice of `df` triggers SettingWithCopyWarning and may
# not write through reliably.
df_curr =  df.loc[df.公示批次名称 == curr_issue_name].copy()
# '公示批次起始' holds a (year, month, part-of-month) sequence of strings;
# the issue name is "<year>年<month>月<part>".
batch_start = df_curr['公示批次起始'].iloc[0]
issue_name = batch_start[0] + '年' + batch_start[1] + '月' + batch_start[2]
print(issue_name)

In [None]:
df_curr.columns

# [Process Datasets]

## Predict and extract features

For reference, the full signature of `pandas.DataFrame.to_excel`, which the next cell uses to export the current issue:

`DataFrame.to_excel(excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep='inf', verbose=True, freeze_panes=None, storage_options=None)`

In [None]:
# Columns of the current issue that go into the downloadable spreadsheet.
cols_downloadable = ['备案立项号', '片名', '备案单位', '编剧','备案地', '梗概', '电影类别']
# Output file inside the Article working directory, named after the issue.
fp_issue_download = path_wd + '/records/chinafilm_reg_{}.xls'.format(issue_name)
# NOTE(review): newer pandas releases no longer accept the `encoding` argument
# of to_excel and no longer write the legacy .xls format - confirm against
# the pandas version pinned for this notebook before upgrading.
df_curr[cols_downloadable].to_excel(
    fp_issue_download, sheet_name='备案信息', encoding='utf-8' )

# Offer the exported file as a browser download (Colab helper).
from google.colab import files
files.download(fp_issue_download)

In [None]:
# The prediction tools are run from inside the Content directory.
os.chdir(path_Content)

# Batch predictors: each one receives the full list of synopses and its
# result is stored in the named column (alternative title, genre, period).
for target_col, predictor in (
    ('预测片名', predict.predict_title),
    ('类型', predict.predict_genre),
    ('年代', predict.predict_time),
):
  df_curr[target_col] = predictor(df_curr['梗概'].tolist())

# Per-synopsis extractors, applied row by row:
# keywords (top 10) and the main characters.
keywords = df_curr['梗概'].apply(predict.extract_keywords, topK=10)
df_curr['kw'] = keywords.copy()
characters = df_curr['梗概'].apply(predict.identify_characters)
df_curr['主要角色'] = characters.copy()

In [None]:
# Sub-genre facets and, for each facet, the ordered list of labels.
# The position of a label inside its list is the class id stored in the
# matching `<facet>_id` column, so the order of every list must not change.
subcats = ['环境', '氛围', '人物', '事件', '情感']

dsubcats2id = {}
dsubcats = {
    # setting / environment
    '环境': [
        '都市', '乡村', '宫廷', '内宅', '职场',
        '朝堂', '旅途', '神话', '仙侠', '太空',
        '江湖', '架空', '荒野', '校园', '未来',
        '古城', '岛屿', '老宅', '乱世', '市井',
        '异次元',
    ],
    # mood / atmosphere
    '氛围': [
        '悬疑', '惊悚', '恐怖', '励志', '喜剧',
        '合家欢', '温馨', '甜蜜', '悲伤', '暴力',
        '血腥', '情欲', '动作', '科幻', '奇幻',
        '苦难', '危难', '压抑', '武侠', '惊险',
        '困惑', '神秘', '夺宝', '思考',
    ],
    # protagonist
    '人物': [
        '青少', '儿童', '青年', '中年', '老年',
        '群像', '动物', '外星人', '家人', '机器人',
        '军人', '警察', '女性', '男性', '运动员',
        '教师', '植物',
    ],
    # core event
    '事件': [
        '革命', '战争', '谍战', '枪战', '涉案',
        '反腐', '宅斗', '宫斗', '奋斗', '创业',
        '商战', '竞技', '历险', '抗病', '建设',
        '扶贫', '行政', '司法', '伦理', '寻亲',
        '复仇', '救灾', '除暴', '成长', '生活',
        '营救', '趣事', '解惑', '救赎', '文艺',
        '追爱', '打拐', '守护', '行骗', '支教',
        '探索', '教授',
    ],
    # emotion
    '情感': [
        '爱情', '亲情', '友情', '忧伤', '仇恨',
        '惊恐', '家国', '愉悦', '愤怒', '委屈',
    ],
}

In [None]:
# Predict with tf model
# load as keras model
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_hub as hub

cols_subgenre = ['环境', '氛围', '事件', '人物', '情感']

# Every sub-genre classifier receives the same text ("<title>。<synopsis>"),
# so the input is built once instead of once per model.
subgenre_inputs = df_curr.apply(lambda x: x['片名'] + '。' + x['梗概'], axis=1)

# One saved Keras model per facet, stored under the Content repository.
path_subgenre_models = path_Content + '/tools/models_wip/model_predict_subgenre'

for col in cols_subgenre:
  fp_model = '{}/{}'.format(path_subgenre_models, col)
  # hub.KerasLayer has to be supplied so the saved model can be deserialized.
  model = tf.keras.models.load_model(
    fp_model,
    custom_objects={'KerasLayer': hub.KerasLayer})
  res = model.predict(subgenre_inputs)
  # The highest-scoring class index per row is the predicted id; the label is
  # the entry at that position in the facet's label list.
  pred_ids = [int(class_id) for class_id in np.argmax(res, axis=1)]
  df_curr['{}_id'.format(col)] = pred_ids
  df_curr[col] = [dsubcats[col][class_id] for class_id in pred_ids]

In [None]:
df_curr.columns

## Interactively adjust features

In [None]:
df_curr.columns

In [None]:
cols_subgenre = ['环境', '氛围', '事件', '人物', '情感']

# Re-derive every `<facet>_id` column from the facet's label column: the id is
# the position of the label in dsubcats[facet]. A label that is not in the
# list raises ValueError, which points at a typo in a hand-edited label.
# (The previous version had an unbalanced parenthesis, read `.value` from a
# plain string, and ran a first loop whose result was thrown away.)
for col in cols_subgenre:
  df_curr['{}_id'.format(col)] = df_curr[col].apply(dsubcats[col].index)

In [None]:
dsubcats['情感']
from IPython.core.display import display_html
from IPython.display import clear_output

In [None]:
# initialize interactive genre update
i = 145 #90
subcats = ['环境', '氛围', '事件', '人物', '情感']

In [None]:
# Step to the next record and print its title and the synopsis wrapped at
# 50 characters. Re-running the cell moves on by one record each time.
i += 1
dtmp = {}
print('{}.《{}》'.format(i, df_curr.loc[i, '片名']))
print(textwrap.fill(df_curr.loc[i, '梗概'], 50))

# One row of toggle buttons per facet, pre-selected from the stored id.
for facet in ('环境', '氛围', '事件', '人物', '情感'):
  dtmp[facet] = widgets.ToggleButtons(
      options=dsubcats[facet],
      index=int(df_curr.loc[i, facet + '_id']),
      description=facet)
  display(dtmp[facet])

In [None]:
# Write the selections made in the toggle buttons back to record `i`:
# the chosen label and its position in the facet's label list.
for facet in ('环境', '氛围', '事件', '人物', '情感'):
  chosen = dtmp[facet].value
  df_curr.loc[i, facet] = chosen
  df_curr.loc[i, facet + '_id'] = dsubcats[facet].index(chosen)

In [None]:
# adjust genre
df_curr['类型_ext'] = df_curr['类型']

In [None]:
genre_unique = ['都市', '革命', '军旅', '好人好事', '爱情', '家庭', '惊悚', '剧情', '枪战', '农村', '传记',
       '传奇', '宫廷', '武打', '涉案', '神话', '科幻', '青少', '其它', '悬疑', '竞技', '职业',
       '商战', '战争', '体育', '谍战', '奇幻', '冒险', '未匹配', '历险', '喜剧', '公路', '创业',
       '动物', '企业', '农业', '健康', '安防', '法律']

genre_to_add = ['奋斗','疾病', '伦理']


In [None]:
i = 0
batch = 10
df_curr['类型_ext'][i:i+batch].to_numpy()

In [None]:
i+= batch
print('{} - {}'.format(i, i+batch))
df_curr['类型_ext'][i:i+batch].to_numpy() # copy out put to next cell to modify genre

In [None]:
# Overwrite the adjusted genre for rows i .. i+batch-1 (positional).
# A single .iloc call on the frame replaces the chained
# `df_curr['类型_ext'].iloc[...] = ...`, which assigns into an intermediate
# Series and is not guaranteed to reach df_curr.
df_curr.iloc[i:i+batch, df_curr.columns.get_loc('类型_ext')] = \
['农村', '剧情', '爱情', '农村', '奇幻', '爱情', '青少', '奇幻', '剧情', '奇幻']

In [None]:
# run cell, click make  interactive tables at the lower left to edit
df_curr[['片名', '类型', '类型_ext', '年代', '梗概']]

## Save / Load Reg_Issue

In [None]:
# Save records for current release issue
#df_curr.to_pickle(path_wd + '/records/df_reg_{}.pkl'.format(issue_name))

In [None]:
df_curr = pd.read_pickle(path_wd + '/records/df_reg_{}.pkl'.format(issue_name))

In [None]:
df_curr

# [Generate Content]

## 【T1】 Title

In [None]:
# Title
# '公示批次起始' of the first row is a (year, month, part-of-month) sequence.
# The part-of-month is left out when the batch covers the whole month.
batch_start = df_curr.iloc[0]['公示批次起始']
title_parts = [
    '{year}年'.format(year=batch_start[0]),
    '{month}月'.format(month=batch_start[1]),
]
if batch_start[2] != '整月':
  title_parts.append('{duration}'.format(duration=batch_start[2]))
title_parts.append('电影备案公示划重点')
T1 = ''.join(title_parts)
print(T1)

## 【P1】 oldest/newest registration

In [None]:
# Obtain Variables
# pd.to_datetime replaces the unit-less astype('datetime64') cast, which newer
# pandas versions reject, and the plain column assignment (instead of
# `.loc[:, col] = ...`) makes sure the column really becomes datetime-typed.
df_curr['公示日期'] = pd.to_datetime(df_curr['公示日期'])
pub_date = df_curr.iloc[0]['公示日期']
pub_year = pub_date.year
pub_month = pub_date.month
pub_day = pub_date.day
df_curr['备案申请年份'] = df_curr['备案申请年份'].astype('int')
# Oldest registration first: ordered by application year, then by the
# running number within the year; the index is reset to 0..n-1.
df_curr_sorted = df_curr.sort_values(
    ['备案申请年份', '备案立项年度顺序号'], ascending=True
).reset_index(drop=True)

# Write Content
batch_start = df_curr.iloc[0]['公示批次起始']   # (year, month, part-of-month)
oldest_reg = df_curr_sorted.iloc[0]
newest_reg = df_curr_sorted.iloc[-1]

P1 = ''
P1 += '{year}年{month}月{day}日，'.format(year=pub_year, month=pub_month, day=pub_day)
P1 += '{month}月{part_of_month}的备案公示新鲜出炉，'.format(
    month=batch_start[1], part_of_month=batch_start[2])
P1 += '共计影片{}部！'.format(df_curr.shape[0])
P1 += '这一批次中，最遥远的项目是《{}》，'.format(oldest_reg['片名'])
P1 += '备案号为{}，'.format(oldest_reg['备案立项号'])
P1 += '最近期的项目是《{}》，'.format(newest_reg['片名'])
P1 += '备案号为{}。'.format(newest_reg['备案立项号'])
print(P1)

## 【P2】type overview

In [None]:
fp_plot_type, df_by_type = utils.plot_type(df_curr, issue_name, return_df=True)

In [None]:
# Sentence listing the number of registrations per film category:
# "A x部、B y部以及C z部，共计 n部。"
type_counts = df_by_type.reset_index()
type_items = [
    '{type}{n}部'.format(type=type_row['电影类别'], n=type_row['数量'])
    for _, type_row in type_counts.iterrows()
]

P2 = '按备案类别划分，本次完成备案'
if len(type_items) > 1:
  # "以及" only makes sense in front of the last of several items.
  P2 += '、'.join(type_items[:-1]) + '以及' + type_items[-1] + '，'
elif type_items:
  P2 += type_items[0] + '，'

# Sum the count column explicitly: calling .sum() on the whole object yields
# a Series (and prints its repr) whenever df_by_type is a DataFrame.
P2 += '共计{}部。'.format(type_counts['数量'].sum())
print(P2)

## 【P3】genre overview

In [None]:
fp_plot_genre, df_by_genre = utils.plot_genre(
    df=df_curr,
    name='ALL_{}'.format(issue_name),
    col='类型_ext',
    xticklabels=[0, '', 5, '', 10, '', 15, '', 20],
    stacked=True, 
    return_df=True)

In [None]:
# Genre labels of the '故事影片' column at positions -2, -3 and -4 (in that
# order), i.e. the three rows just before the last one.
# NOTE(review): the last row is skipped on purpose by this slice - presumably
# it is a catch-all or total row; confirm against utils.plot_genre.
top_genres = df_by_genre['故事影片'][-4:-1][::-1].index
P3 = '故事片中，备案占前三的类型依次为{}。'.format('、'.join(top_genres))

print(P3)

In [None]:
# NOTE(review): `dplots` and `subcat` are only created by the next cell, so
# this lookup raises NameError on a fresh kernel (Restart & Run All). It only
# works after the following cell has been run once - consider moving it below.
dplots['plot'][subcat]

In [None]:
# Per-facet plot results, keyed first by kind ('plot' / 'df') and then by
# facet name. Both values come straight from utils.plot_genre.
dplots = {'plot': {}, 'df': {}}

for subcat in subcats:
  plot_ref, plot_df = utils.plot_genre(
      df=df_curr,
      name='{}_{}'.format(subcat, issue_name),
      col=subcat,
      xticklabels=[0, '', 5, '', 10, '', 15, '', 20],
      stacked=True,
      return_df=True)
  dplots['plot'][subcat] = plot_ref
  dplots['df'][subcat] = plot_df

In [None]:
# Reader-facing name of each sub-genre facet. A facet without an entry falls
# back to its own name instead of silently reusing the previous facet's label.
subcat_labels = {
    '环境': '社会环境',
    '氛围': '叙事氛围',
    '事件': '核心事件',
    '人物': '主人公身份',
    '情感': '情感走向',
}

P3 = '故事片中，排序前三的'
for subcat in subcats:
  # Labels at positions -2, -3, -4 of the '故事影片' column (last row skipped).
  top_labels = dplots['df'][subcat]['故事影片'][-4:-1][::-1].index
  P3 += '{}为{}，'.format(subcat_labels.get(subcat, subcat), '、'.join(top_labels))

# Close the sentence with a full stop instead of the dangling comma left by
# the last clause.
P3 = P3.rstrip('，') + '。'

print(P3)

## 【P4】time period overview

In [None]:
fp_plot_time, df_by_time = utils.plot_time(df_curr, issue_name, y_offset=8, return_df=True)

In [None]:
# Number of titles per time period, largest group first.
df_time_sorted = (
    df_curr.groupby('年代')['片名']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
df_time_sorted.columns = ['年代', '数量']

# NOTE(review): the sentence always names "当代" while the count is that of
# whichever period is largest - confirm they are the same for this issue.
P4 = '按年代划分，当代题材占主力位置，' + '共计{}部。'.format(df_time_sorted['数量'][0])
print(P4)

## 【P5】public company affiliation

In [None]:
ls_co = df['备案单位'].unique()

In [None]:
# Projects in this batch that are linked to listed film/TV companies.
info_public_film_co = pd.read_csv('/content/drive/MyDrive/Github/Article/reference/info_public_film_co.csv', index_col=0, encoding='utf-8-sig')
# SECURITY NOTE(review): eval() executes whatever text is stored in the
# '公司简称' column. That is only acceptable while the CSV is a trusted file
# from this repository; consider ast.literal_eval if that ever changes.
# Each cell evaluates to a list of short names; .sum() concatenates the lists.
ls = info_public_film_co['公司简称'].apply(eval).sum()
# Regex alternation of all short names plus a few hand-added companies.
pat_public = '|'.join(ls)
pat_public += '|阿里|腾讯|爱奇艺|英皇|寰亚|银都|美亚|大盛|儒意|灿星|横店|中企广视'
# na=False: rows without a registering company count as "no match" instead of
# producing NaN in the boolean mask. .copy(): a later cell adds a column to
# df_focus, which must not be a view of df_curr.
df_focus = df_curr.loc[df_curr['备案单位'].str.contains(pat_public, na=False), :].copy()

In [None]:
### to be moved to tools in Content Repo
def get_company_other_regs(row,
                          df_ChinaFilm_reg: pd.DataFrame, 
                          days:int=180):
  """
  List the other titles registered by the company (or companies) of `row`.

  args:
    row: a registration record providing '备案单位' (company names joined
      by '、') and '片名'; its '公示日期' anchors the time window when present.
    df_ChinaFilm_reg: registration records with the columns '备案单位',
      '片名' and '公示日期'.
    days: size of the look-back window in days. Only registrations published
      later than (reference date - days) are kept; 0 or less disables the
      window. The reference date is row['公示日期'], or the newest date among
      the company's registrations when the row has no such field.
  return:
    List[str]: matching titles, without the title of `row` itself.
  """
  import re  # local import keeps the helper movable to the tools module

  companies = [name for name in row['备案单位'].split('、') if name]
  if not companies:
    return []

  # Company names are matched literally: escape them so characters such as
  # parentheses are not interpreted as regex syntax.
  pat_companies = '|'.join(re.escape(name) for name in companies)
  df_co_other_reg = df_ChinaFilm_reg[
    df_ChinaFilm_reg['备案单位'].str.contains(pat_companies, na=False)]

  if days > 0:
    pub_dates = pd.to_datetime(df_co_other_reg['公示日期'])
    ref_date = row['公示日期'] if '公示日期' in row else pub_dates.max()
    # Timedelta needs the unit spelled out: a bare integer is nanoseconds.
    # The cutoff is one date, not a per-row value derived from the very
    # column it is compared with (which made the old filter a no-op).
    cut_off_date = pd.to_datetime(ref_date) - pd.Timedelta(days=days)
    df_co_other_reg = df_co_other_reg[pub_dates > cut_off_date]

  return [title for title in df_co_other_reg['片名'].to_list()
          if title != row['片名']]

### to be moved to tools in Content Repo
def get_company_other_releases(company:str, 
                               curr_work:str, 
                               df_ChinaFilm_release: pd.DataFrame, 
                               days:int=180,
                               ref_date=None):
  """
  List the other recent releases whose first producer is one of `company`.

  args:
    company: company names joined by '、'; each name is compared for exact
      equality with the '第一出品单位' column.
    curr_work: title to leave out of the result.
    df_ChinaFilm_release: release records with the columns '第一出品单位',
      '片名' and '公示日期'.
    days: size of the look-back window in days; 0 or less disables it.
    ref_date: date the window is anchored on. Defaults to the newest
      '公示日期' in df_ChinaFilm_release.
      NOTE(review): confirm this anchor matches the editorial meaning of
      "recent"; pass ref_date explicitly otherwise.
  return:
    List[str]: titles published later than (ref_date - days), without
    `curr_work`.
  """
  df_co_other_releases = df_ChinaFilm_release[
    df_ChinaFilm_release['第一出品单位'].isin(company.split('、'))]

  if days > 0:
    if ref_date is None:
      ref_date = pd.to_datetime(df_ChinaFilm_release['公示日期']).max()
    # Timedelta needs the unit spelled out: a bare integer is nanoseconds,
    # and the old cutoff was computed per row from the compared column
    # itself, so nothing was ever filtered out.
    cut_off_date = pd.to_datetime(ref_date) - pd.Timedelta(days=days)
    release_dates = pd.to_datetime(df_co_other_releases['公示日期'])
    df_co_other_releases = df_co_other_releases[release_dates > cut_off_date]

  return [release for release in df_co_other_releases['片名'].to_list()
          if release != curr_work]

In [None]:
# Attach, per focus project, the other titles registered by the same company.
# `.assign` builds a new frame, so nothing is written into a frame that may
# still be a view of df_curr (avoids SettingWithCopyWarning).
df_focus = df_focus.assign(
    other_reg=df_focus.apply(
        get_company_other_regs, df_ChinaFilm_reg=df_ChinaFilm_reg, axis=1))
cols = ['片名', '备案单位', '电影类别', '类型_ext','other_reg', '梗概']
df_focus[cols]

In [None]:
# Hand-picked index labels of the projects to highlight in this issue.
ids = [137, 158, 179]
df_curr['focus'] = False
df_curr.loc[ids, 'focus'] = True
df_focus_narrowed = df_curr.loc[ids, :].copy()

# Estimated number of projects linked to listed companies: a weighted mix of
# the pattern matches (df_focus) and the hand-picked selection.
n_public_estimated = int(0.2*df_focus.shape[0] + 0.8*df_focus_narrowed.shape[0])

# "《title》、《title》…" for the highlighted projects, and their distinct
# core-event labels.
focus_titles = '、'.join(
    '《{}》'.format(title) for title in df_focus_narrowed['片名'])
focus_events = '、'.join(df_focus_narrowed['事件'].unique())

P5 = ''.join([
    '本批次中，大数据分析识别出{}部上市影视公司关联项目，'.format(n_public_estimated ),
    '占比{}%。'.format(round((n_public_estimated/df_curr.shape[0]*100),2)),
    '结合题材与出品方实力，ContentAI聚焦关注',
    focus_titles,
    '，故事题材覆盖{}'.format(focus_events),
    '。',
])

print(P5)

## 【P5wc】 preview with wordcloud

In [None]:
from IPython.display import Image as Img
from PIL import Image as pil

path_posters = '/content/drive/MyDrive/Github/Article/img/posters'
path_icon = '/content/drive/MyDrive/Github/Article/img/genre_icon'

#df_label2image.to_json(path_wd + '/records/df_label2image.json')
df_label2image = pd.read_json(path_wd + '/records/df_label2image.json')

In [None]:
df_focus_narrowed['src_img'] = None
df_focus_narrowed['tgt_img'] = None

In [None]:
df_curr.columns

In [None]:
from PIL import Image as pil
from PIL import ImageDraw, ImageFont
import cv2
  
# read the images
def plot_itext(v):
  """
  Render the period and sub-genre labels of one record as a strip of cells.

  Each cell is 60x50 pixels on a white background and shows the facet name
  (bold font) with the record's value for that facet on the line below.

  args:
    v: mapping (e.g. a DataFrame row) with a value for '年代' and for every
      name in the notebook-level `subcats` list.
  return:
    PIL image, one cell per facet, laid out left to right.
  """
  cell_w, cell_h = 60, 50
  facets = ['年代'] + subcats

  # The fonts are the same for every cell, so they are loaded once.
  fp_font1 = '/content/drive/MyDrive/Github/Article/fonts/MSYHBD.TTC'
  fp_font2 = '/content/drive/MyDrive/Github/Article/fonts/simhei.ttf'
  fnt_name = ImageFont.truetype(fp_font1, 14)
  fnt_value = ImageFont.truetype(fp_font2, 14)

  # The canvas width follows the number of facets (360 px for the usual six).
  img_concat = pil.new('RGB', (cell_w * len(facets), cell_h), color='white')
  for pos, subcat in enumerate(facets):
    cell = pil.new('RGB', (cell_w, cell_h), color = 'white')
    dtext = ImageDraw.Draw(cell)
    dtext.text((10,10), "{}".format(subcat), font=fnt_name, fill=(10,10,10))
    # The leading newline pushes the value onto the second text line.
    dtext.text((10,10), "\n{}".format(v[subcat]), font=fnt_value, fill=(10,10,10))
    img_concat.paste(cell, (pos * cell_w, 0))
  return img_concat


In [None]:
df_focus_narrowed['gtext_img'] =\
  df_focus_narrowed[['年代'] + subcats].apply(plot_itext, axis=1)

In [None]:
from IPython.display import Image as Img
from PIL import Image as pil
path_img = '/content/drive/MyDrive/Github/Article/img'

# For every highlighted project: build a word list, print a short text
# summary, generate a word image from the list, append the label strip
# ('gtext_img') below it, save and display the result, and remember the
# file path in 'tgt_img'.
for i, row in df_focus_narrowed.iterrows():
  # Word list for the image. Each term is repeated a fixed number of times;
  # presumably the repetition count acts as its weight in the generated image.
  ls = [df_focus_narrowed.loc[i, '片名']] * 20
  ls += [df_focus_narrowed.loc[i, '预测片名']] *10
  ls += df_focus_narrowed.loc[i, '主要角色']*6
  #ls += [df_focus_narrowed.loc[i, '类型_ext']] *3
  ls += [df_focus_narrowed.loc[i, '环境']] *3
  ls += [df_focus_narrowed.loc[i, '氛围']] *3
  ls += [df_focus_narrowed.loc[i, '事件']] *3
  ls += [df_focus_narrowed.loc[i, '人物']] *3
  ls += [df_focus_narrowed.loc[i, '情感']] *3
  ls += [df_focus_narrowed.loc[i, '年代']]*3
  ls += df_focus_narrowed.loc[i, 'kw']*2
  ls += [df_focus_narrowed.loc[i, '备案单位']]*3
  # Background image: the record's own 'src_img' if set, otherwise the file
  # registered in df_label2image for the record's adjusted genre.
  # NOTE(review): .iloc[0] raises IndexError when no label matches the genre.
  img_fn = df_focus_narrowed.loc[i, 'src_img']
  if not img_fn:
    img_fn = df_label2image.loc[
      df_label2image['label'] == df_focus_narrowed.loc[i, '类型_ext'],
      'fn'
  ].iloc[0]
  # Text summary: title, writer (plus the writer's other registered titles),
  # company (plus the company's other releases).
  txt = '《{}》'.format(df_focus_narrowed.loc[i, '片名'])
  txt += '\n编剧：{}'.format(df_focus_narrowed.loc[i, '编剧'])
  # Titles in `df` whose writer field is exactly equal to this record's.
  writer_works = df.loc[df['编剧'] == df_focus_narrowed.loc[i, '编剧'], 
                        '片名'].to_list()
  if len(writer_works) > 1:
    # Drop the current title (first occurrence) so only other works remain.
    writer_works.remove(df_focus_narrowed.loc[i, '片名'])
    txt += '，（其它作品：《{}》）'.format('》、《'.join(writer_works))
  
  txt += '\n备案单位：{}'.format(df_focus_narrowed.loc[i, '备案单位'])
  ls_co_other_releases =  get_company_other_releases(row['备案单位'],
                                                      row['片名'],
                                                      df_ChinaFilm_release)
  if len(ls_co_other_releases) > 0:
    txt += '\n出品单位近期推出的其它影片：《{}》'.format(
        '》、《'.join(ls_co_other_releases))


  print(txt)
  # The mask file shares the image's file name with a 'mask_' prefix.
  fp_img = path_img + '/genre_icon/{}'.format(img_fn)
  fp_mask = path_img + '/genre_icon/{}'.format('mask_' + img_fn)
  #fp_img = path_img + '/genre_icon/{}'.format('psychedelic-1084082_960_720.jpg')
  # generate_word_image returns a path that is opened as an image below.
  fp_generated_img = utils.generate_word_image(ls, 
                                               fp_img, 
                                               fp_mask,
                                               fp_prefix='ChinaFilm_Reg_Overview',
                                               fp_suffix=issue_name,
                                               img_width=400,
                                               )
  
  img_wc = pil.open(fp_generated_img)
  # Scale the label strip to the width of the generated image; the height is
  # scaled by the same factor, starting from the strip's 50 px.
  df_focus_narrowed.loc[i, 'gtext_img'] =\
    df_focus_narrowed.loc[i, 'gtext_img'].resize(
      (img_wc.size[0], 
       int(50/df_focus_narrowed.loc[i, 'gtext_img'].size[0]*img_wc.size[0])
      )
    )

  # Canvas tall enough for the generated image with the strip underneath.
  img_concat = pil.new('RGB', 
                    (img_wc.size[0],
                    img_wc.size[1]+df_focus_narrowed.loc[i, 'gtext_img'].size[1]),
                    color='white'
  )

  img_concat.paste(img_wc, (0, 0))
  img_concat.paste(df_focus_narrowed.loc[i, 'gtext_img'],
                   (0, img_wc.size[1]))

  # The combined image overwrites the generated file in place.
  img_concat.save(fp_generated_img)
  display(Img(fp_generated_img, width=400))
  df_focus_narrowed.loc[i, 'tgt_img'] = fp_generated_img

## 【S0】 head block
2021年4月28日，3月的电影备案公示发布，其中最遥远的是影剧备字〔2021〕第1360号的《幕后英雄》，最近期的是影特备字〔2021〕第006号的《熊猫传奇——黑洞之吻》，慷田AI聚焦、重点关注的项目有中国电影的《发明一个夏天》、光线的《计划外的姐弟恋》、《二郎神》、《土行孙之破土重生》、爱奇艺的《日常警事》以及阿里的《无价之宝》。

In [None]:
# Short company name per highlighted project: the first pattern match inside
# the registering-company string.
df_focus_narrowed['单位简称'] = df_focus_narrowed['备案单位'].str.extract(
    '('+ pat_public + ")", expand=False)
# One row per short name with its titles joined as "《a》、《b》".
# NOTE(review): projects whose company matched no pattern have a missing
# short name and are left out by this groupby - confirm that is intended.
df_f = df_focus_narrowed.groupby('单位简称')['片名'].apply(
    lambda x: '、'.join('《' + x + '》')).rename('关注影片').reset_index()
#####
batch_start = df_curr.iloc[0]['公示批次起始']   # (year, month, part-of-month)
oldest_reg = df_curr_sorted.iloc[0]
newest_reg = df_curr_sorted.iloc[-1]

S0 = '{year}年{month}月{day}日，'.format(
    year=pub_year, month=pub_month, day=pub_day)

S0 +=  '{month}月'.format(month=batch_start[1])
if batch_start[2] != '整月':
  S0 += '{duration}'.format(duration=batch_start[2])

S0 += '电影备案共计{}部，其中'.format(df_curr.shape[0])
S0 += '最遥远的是{}的《{}》，'.format(oldest_reg['备案立项号'], oldest_reg['片名'])
S0 += '最近期的是{}的《{}》，'.format(newest_reg['备案立项号'], newest_reg['片名'])

S0 += 'ContentAI聚焦关注的项目有'
focus_items = [
    '{co}的{film}'.format(co=co, film=film)
    for co, film in zip(df_f['单位简称'], df_f['关注影片'])
]
if len(focus_items) > 1:
  # "以及" is only placed in front of the last of several companies.
  S0 += '、'.join(focus_items[:-1]) + '以及' + focus_items[-1]
else:
  S0 += ''.join(focus_items)
S0 += '。'

print(S0)

## 【R1】 point to ChinaFilm website


In [None]:
#####
# Closing reference paragraph: how the overview was produced and where the
# official registration notices are published.
R1 = ''.join([
    '\n\n',
    'ContentAI结合自主调研及多方大数据比对，通过分析、建模，提炼关键信息。',
    '电影立项备案公示信息来自中国国家电影局 China Film Administration, 官方网址',
    ' http://www.chinafilm.gov.cn/chinafilm 。',
])

print(R1)

## 【R2，R3】 original link pointer

In [None]:
R2 = '点击左下角阅读原文查看本期慷田AI影片信息详表。'
R3 = '点击左下角阅读原文查看本期慷田AI影片概览分析。'

## Save or load df_{issue_name}

In [None]:
# Save records for current reg issue
path_records = '/content/drive/MyDrive/Github/Article/records'
df_curr.to_pickle(path_records + '/df_reg_{}.pkl'.format(issue_name))

In [None]:
df_curr = pd.read_pickle(path_wd + '/records/df_reg_{}.pkl'.format(issue_name))

# [Output Article in Word]

## Install and Load Libraries

In [None]:
%%capture
!pip install python-docx
!pip install lxml
from docx import Document
from docx.shared import Inches
from docx.oxml.ns import qn
#from docx.text.parargaph import Paragraph

## Write Overview to docx and download


In [None]:
# Assemble the overview article as a Word document: title, head block,
# the P1-P5 paragraphs with their plots, one entry per highlighted project,
# and the closing reference paragraphs.
path_doc = '/content/drive/MyDrive/Github/Article/docx'

doc = Document()

# Set document Font 
# The East-Asian font has to be set on the style's XML element as well,
# otherwise Chinese text does not pick up the font.
doc.styles['Normal'].font.name = '微软雅黑'
r = doc.styles['Normal']._element
r.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

# Title
doc.core_properties.title = 'China Film Registration Overview {}'.format(
    issue_name)

doc.add_heading(T1, 0)
p = doc.add_paragraph(S0, style='Intense Quote')

doc.add_paragraph(P1)

doc.add_picture(fp_plot_type, width =Inches(4))
doc.add_paragraph(P2)

doc.add_paragraph(P3)
for subcat in subcats:
  doc.add_picture(dplots['plot'][subcat], width =Inches(4))
  doc.add_paragraph('')
#doc.add_picture(fp_plot_genre, width =Inches(4))


doc.add_picture(fp_plot_time, width =Inches(4))
doc.add_paragraph(P4)

doc.add_paragraph(P5)
for _idx, row in df_focus_narrowed.iterrows():
  txt = '\n《{}》'.format(row['片名'])
  txt += '\n编剧：{}'.format(row['编剧'])

  # Other titles whose writer field contains this writer. The name is matched
  # as plain text (regex=False) and missing writer fields never match
  # (na=False); the lookup runs once and the current title is filtered out
  # without list.remove, which raises when the title is absent.
  same_writer = df['编剧'].str.contains(row['编剧'], regex=False, na=False)
  writer_works = [title for title in df.loc[same_writer, '片名'].to_list()
                  if title != row['片名']]
  if writer_works:
    txt += '，（其它作品：《{}》）'.format('》、《'.join(writer_works))
  
  txt += '\n备案单位：{}'.format(row['备案单位'])
  ls_co_other_releases =  get_company_other_releases(
    row['备案单位'], row['片名'],df_ChinaFilm_release)
  if len(ls_co_other_releases) > 0:
    txt += '\n备案单位近期推出的其它影片：《{}》'.format(
        '》、《'.join(ls_co_other_releases))  
    
  doc.add_paragraph(txt)
  # 'tgt_img' is the image path stored by the word-image cell.
  doc.add_picture(row['tgt_img'], width=Inches(4))

doc.add_paragraph(R1)
doc.add_paragraph(R2)

fp_doc = path_doc + '/ChinaFilm_Reg_Overview_' + issue_name + '.docx'
doc.save(fp_doc)

In [None]:
# Download Document
from google.colab import files
files.download(fp_doc)

## Write Table to docx and download

In [None]:
# Assemble the detail table as a Word document: one single-column table in
# which every registration occupies a run of rows (title, writer, company,
# category, sub-genre labels, synopsis, blank separator).
path_doc = '/content/drive/MyDrive/Github/Article/docx'

doc = Document()

# Set document Font 
# The East-Asian font has to be set on the style's XML element as well,
# otherwise Chinese text does not pick up the font.
doc.styles['Normal'].font.name = '微软雅黑'
r = doc.styles['Normal']._element
r.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

# Title
doc.core_properties.title = 'China Film Registration Table {}'.format(
    issue_name)

doc.add_heading('详表：{}'.format(T1), 0)
p = doc.add_paragraph(S0, style='Intense Quote')

table = doc.add_table(rows=1, cols=1, style='Light List Accent 4')

# NOTE(review): this rebinds df_curr_sorted (built by the P1 cell, ordered by
# application year with a reset index) to a differently ordered frame whose
# index is not reset. Re-running the S0 cell afterwards reads different rows.
df_curr_sorted = df_curr.sort_values(
    ['电影类别', '事件'], ascending=[False, True])

i = 1 # use fresh counter, as df indicies may not be ordered
for _, row in df_curr_sorted.iterrows():

  # Movie Title
  table.add_row().cells[0].text = '[{}] 《{}》'.format(i ,row['片名'])

  # Writer
  table.add_row().cells[0].text = '编剧: ' + row['编剧']
  # Other titles whose writer field contains this writer: matched as plain
  # text (regex=False), missing writer fields never match (na=False), looked
  # up once, and the row text is always built from this record's own list
  # (previously `txt` could be left over from an earlier record).
  same_writer = df['编剧'].str.contains(row['编剧'], regex=False, na=False)
  other_works = [title for title in df.loc[same_writer, '片名'].to_list()
                 if title != row['片名']]
  if other_works:
    table.add_row().cells[0].text = \
        '其它作品：' + '《{}》'.format('》、《'.join(other_works))

  # Company
  table.add_row().cells[0].text = '备案单位: ' + row['备案单位']
  # company's other titles 
  ls_co_other_releases =  get_company_other_releases(
    row['备案单位'], row['片名'],df_ChinaFilm_release)
  if len(ls_co_other_releases) > 0:
    table.add_row().cells[0].text = '\n备案单位近期推出的其它影片：《{}》'.format(
        '》、《'.join(ls_co_other_releases))  
  
  # Type
  table.add_row().cells[0].text = '类别：{}，年代：{}'.format(
      row['电影类别'], row['年代'])
  
  table.add_row().cells[0].text = '环境/氛围/人物/事件/情感：{} {} {} {} {}'.format(
      row['环境'], row['氛围'], row['人物'], row['事件'], row['情感'],
  )

  # Synopsis
  table.add_row().cells[0].text = row['梗概']

  # Blank row separating this record from the next one.
  table.add_row().cells[0].text = ''
  i+=1

doc.add_paragraph(R1)
doc.add_paragraph(R3)

fp_doc = path_doc + '/ChinaFilm_Reg_Table_' + issue_name + '.docx'
doc.save(fp_doc)

In [None]:
# Download Document
from google.colab import files
files.download(fp_doc)