<a href="https://colab.research.google.com/github/CT-Cultures/Content/blob/master/RegOverview_Issue.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount drive and install requirements

In [None]:
# Mount Google Drive at /content/drive; the path constants used in the
# following cells are all rooted under this mount point.
import os
import sys
import gc

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install requirements in this cell,
# then restart runtime after initial installation

# fetch Content from github and install requirements
path_Content = "/content/drive/MyDrive/Github/Content" ###
if not os.path.exists(path_Content):
  !git clone https://github.com/CT-Cultures/Content.git {path_Content}
os.chdir(path_Content)
!pip install -r requirements.txt

path_Article = "/content/drive/Mydrive/Github/Article" ###
if not os.path.exists(path_Article):
  !git clone https://github.com/CT-Cultures/Article.git {path_Article}

In [None]:
# Install Chromedriver
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
# Copy the driver binary into /usr/bin (presumably so it is found on the
# default PATH when webdriver.Chrome('chromedriver', ...) is called later).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# NOTE(review): sys.path is the Python module search path; inserting the path
# of an executable here looks like it has no effect on locating the driver.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [None]:
# Check Environment
# Print the versions of the key libraries and show which Python interpreter
# and GPU the runtime provides.
import pandas as pd
import transformers
import nltk  # only referenced by the commented-out version print below
print('pandas version: {}, (>= 1.3.2)'.format(pd.__version__)) # pd has to >= 1.3.2, restart runtime
print('transformers version: {}'.format(transformers.__version__))
#print('nltk version: {}, (>=3.3)'.format(nltk.__version__))

!which python
!python --version
!nvidia-smi

# Import libraries and set path

In [None]:
# Set path
# Article repository: working directory, font file and image output folder.
path_Article = path_wd = '/content/drive/MyDrive/Github/Article'
path_font = '/content/drive/MyDrive/Github/Article/fonts/STHUPO.TTF'
path_img = '/content/drive/MyDrive/Github/Article/img'

# Content repository and the per-source tool folders inside it.
path_Content = '/content/drive/MyDrive/Github/Content'
path_ChinaFilm = path_Content + '/tools/sources/ChinaFilm'
path_NRTA = path_Content + '/tools/sources/NRTA'
path_ZGDYPW = path_Content + '/tools/sources/ZGDYPW'

# Work from the Article repository by default.
os.chdir(path_wd)

In [None]:
#Load Libraries Global
import os
import datetime as dt
import re
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup

from selenium import webdriver

import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.font_manager as fm

path_fonts = path_Article + '/fonts'
fontprop = fm.FontProperties(fname=path_fonts, size= 15)

font_dirs = [path_fonts, ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)
font_list = fm.createFontList(font_files)
for font in font_files:
  fm.fontManager.addfont(font)

plt.rcParams['figure.figsize'] = [15, 9]
mp.rcParams['font.family'] = ['Microsoft YaHei']

%matplotlib inline
print(mp.get_cachedir())

In [None]:
# Load Libraries Local
%load_ext autoreload
#%reload_ext autoreload 2
# NOTE(review): the autoreload extension is loaded, but no `%autoreload` mode
# is set anywhere in this cell.

# Record_Registration is imported right after switching the working directory
# to path_ChinaFilm (presumably the module lives there -- confirm).
os.chdir(path_ChinaFilm)
from Record_Registration import Registration
# Chrome options for running the browser without a display.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize Registration Class
driver = webdriver.Chrome('chromedriver',options=chrome_options)
dy_reg = Registration(driver)

# import utils for making plots and wordclouds
os.chdir(path_wd)
from generate import utils

# import tools for prediction
os.chdir(path_Content)
import predict

# Font registration and plot defaults, repeated from the global-libraries cell
# (path_fonts and fm come from that cell).
font_dirs = [path_fonts, ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)
#font_list = fm.createFontList(font_files)
for font in font_files:
  fm.fontManager.addfont(font)

plt.rcParams['figure.figsize'] = [15, 9]
mp.rcParams['font.family'] = ['Microsoft YaHei']

%matplotlib inline

# [Import Datasets]

### Update records

In [None]:
# Update ChinaFilm records to reflect latest changes
# Only the working directory is changed here; both update scripts below are
# commented out.
os.chdir(path_ChinaFilm)

# bypass update in colab, because the connection
# from google to ChinaFilm's server is too slow
#!python update_release.py
#!python update_registration.py

### Import Records

In [None]:
# Read the ChinaFilm registration records from JSON and hand them straight to
# Registration.Refined_Records for clean-up.
fp_registrations = path_ChinaFilm + '/records/contents_of_registrations.json'
df = dy_reg.Refined_Records(pd.read_json(fp_registrations))

In [None]:
# Show the distinct values of the film-category column ('电影类别').
df['电影类别'].unique()

In [None]:
# Load current issue (most recent one)
issue_name = '202010_202110'
# Keep only documentaries ('纪录片'). .copy() makes df_curr an independent
# frame, so the column assignments in later cells do not operate on a slice
# of df (SettingWithCopyWarning).
df_curr = df[df['电影类别'] == '纪录片'].copy()

# Keep the batches whose name mentions one of these years.
# Fixed: '年|'.join(...) left the last year without its '年' suffix, so that
# alternative matched any occurrence of the bare digits; every year now
# carries the suffix.
years_of_interest = ['2018', '2019', '2020', '2021']
year_of_interest = '|'.join(year + '年' for year in years_of_interest)
df_curr = df_curr[df_curr['公示批次名称'].str.contains(year_of_interest)].copy()
print(issue_name)

# [Process Datasets]

## Predict and extract features

In [None]:
os.chdir(path_Content) # change to the Content directory

# every feature below is derived from the synopsis column ('梗概')
synopses = df_curr['梗概']

# whole-column predictions: alternative title, genre, time period
df_curr['预测片名'] = predict.predict_title(synopses.tolist())
df_curr['类型'] = predict.predict_genre(synopses.tolist())
df_curr['年代'] = predict.predict_time(synopses.tolist())

# per-synopsis extraction: keywords (topK=10) and main characters
df_curr['kw'] = synopses.apply(predict.extract_keywords, topK=10)
df_curr['主要角色'] = synopses.apply(predict.identify_characters)

## Interactively adjust features

In [None]:
# adjust genre
# Start the editable genre column ('类型_ext') as a copy of the predicted genre.
df_curr['类型_ext'] = df_curr['类型'].copy()

In [None]:
# Show the first batch of `batch` genre values; i is the batch start offset.
i = 0
batch = 10
df_curr['类型_ext'][i:i+batch].to_numpy()

In [None]:
# Advance to the next batch and show it. Each run of this cell moves i
# forward by `batch`, so re-running it skips ahead.
i+= batch
print('{} - {}'.format(i, i+batch))
df_curr['类型_ext'][i:i+batch].to_numpy() # copy out put to next cell to modify genre

In [None]:
# Write the hand-corrected genres of the current batch back into df_curr.
# Fixed: df_curr['类型_ext'].iloc[...] = ... is chained assignment and may
# write to a temporary object instead of df_curr; a single .iloc call on the
# frame always updates it in place. The list needs one entry per row of the
# slice.
df_curr.iloc[i:i+batch, df_curr.columns.get_loc('类型_ext')] = \
['人物', '探索', '人物，冒险', '农村', '群体，竞技', '战争', '工业', '人物', '旅途', '历史']

In [None]:
# run cell, click make  interactive tables at the lower left to edit
# Displays title, predicted genre, editable genre and synopsis side by side.
df_curr[['片名', '类型', '类型_ext', '梗概']]

## Save / Load Reg_Issue

In [None]:
# Save records for current release issue
# Pickles df_curr under <path_wd>/records, with issue_name in the file name.
df_curr.to_pickle(path_wd + '/records/df_reg_documentary_{}.pkl'.format(issue_name))

In [None]:
# Load df_curr back from the pickle path used by the save cell above.
df_curr = pd.read_pickle(path_wd + '/records/df_reg_documentary_{}.pkl'.format(issue_name))

# [Generate Content]

## [2.1] 2021年3月至7月纪录片备案回溯

In [None]:
# Title
# NOTE(review): df_past6 is not created by any cell shown above this one --
# confirm where it comes from.
# The covered period starts with the last row and ends with the first row.
period_start = df_past6.tail(1)['公示覆盖期间'].iloc[0][0]
period_end = df_past6.head(1)['公示覆盖期间'].iloc[0][0]

T1 = '{}年'.format(period_start.year)
T1 += '{}月'.format(period_start.month)
T1 += '至'
# the end year is only spelled out when it differs from the start year
if period_start.year != period_end.year:
  T1 += '{}年'.format(period_end.year)
T1 += '{}月'.format(period_end.month)

# the bare date range is reused as the issue name in later file names
issue_name = T1

T1 += '纪录片备案概览'
print(T1)

## [2.2] 本次回溯共包括6期的公示备案，公示日期分别为2021年9月7日、8月6日、7月2日、6月4日、4月28日和4月2日。


In [None]:
# Write Content
# Builds paragraph T2: number of issues, films and producers, the list of
# publication dates, and the first and last registration.
# NOTE(review): ls_issues, df_past6 and df_past6_sorted are not created by any
# cell shown above this one -- confirm where they come from.
T2 = ''
T2 += '本次回溯共包括{}期的公示备案，'.format(len(ls_issues))
T2 += '共计{}部影片，'.format(df_past6.shape[0])
T2 += '来自{}个单位。'.format(len(df_past6['备案单位'].unique()))
T2 += '公示日期分别为'
yr = None
# Fixed: the loop variable used to be called `dt`, which overwrote the
# `datetime as dt` alias imported at the top of the notebook. Each date is
# also parsed once instead of up to five times, and enumerate() replaces the
# manual counter.
for pos, issue_date in enumerate(ls_issues):
  date = pd.to_datetime(issue_date)
  # the year is written only for the first date and whenever it changes
  if date.year != yr:
    yr = date.year
    T2 += '{}年'.format(yr)

  # the second-to-last date is followed by '和' instead of the list separator
  if pos == len(ls_issues)-2:
    T2 += '{}月{}日和'.format(date.month, date.day)
  else:
    T2 += '{}月{}日、'.format(date.month, date.day)

T2 = T2.rstrip('、') + '，'

# NOTE(review): the .loc lookups below use the labels 0 and len-1, which
# assumes df_past6_sorted carries a fresh 0..n-1 index -- confirm.
T2 += '其中，'
T2 += '最遥远的是{}的《{}》，'.format(
    df_past6_sorted.loc[0, '备案立项号'], df_past6_sorted.loc[0, '片名'])
T2 += '最近期的是{}的《{}》。'.format(
    df_past6_sorted.loc[df_past6.shape[0]-1, '备案立项号'], 
    df_past6_sorted.loc[df_past6.shape[0]-1, '片名']
)

print(T2)

## [2.3] 按出品单位性质划分，官方机构出品的预计有37部，民营公司出品的预计有15部，没有识别出上市公司关联项目。

In [None]:
# Not displayed: only the last expression of a cell is shown, and this is
# not the last one.
df_past6['备案单位'].unique()

# Producers treated as official institutions. Later cells join these names
# with '|' and use the result as a str.contains pattern on '备案单位'.
ls_co_gvt = ['中国数字文化集团有限公司',
             '中国农业电影电视中心',
             '中央新闻纪录电影制片厂（集团）',
             '深圳广播电影电视集团',
             '深圳广电影视股份有限公司',
             '山东新农村数字电影院线有限公司',
             '青海广电影视传媒有限公司',
                          
             ]

In [None]:
# Number of films whose producer does NOT match any name in ls_co_gvt.
df_past6[~df_past6['备案单位'].str.contains('|'.join(ls_co_gvt))].shape[0]

In [None]:
# Build a regex of listed-company short names and list the non-official films
# whose producer matches it.
info_public_film_co = pd.read_csv('/content/drive/MyDrive/Github/Content/tools/articles/reference/info_public_film_co.csv', index_col=0, encoding='utf-8-sig')
# NOTE(review): eval() executes whatever the CSV cells contain. If the cells
# are plain list literals, ast.literal_eval would be the safe equivalent.
# .sum() over the evaluated values concatenates them into one list.
ls = info_public_film_co['公司简称'].apply(eval).sum()
pat_public = '|'.join(ls)
# extra names appended by hand
pat_public += '|阿里|腾讯|爱奇艺|英皇|寰亚|银都|美亚|大盛|儒意|灿星|横店'
print(pat_public)

# first drop the official producers, then keep the producers matching the pattern
df_pub = df_past6[~df_past6['备案单位'].str.contains('|'.join(ls_co_gvt))]
df_pub = df_pub[df_pub['备案单位'].str.contains(pat_public)]
df_pub

In [None]:
# Paragraph T3: split of the registrations by producer type (official vs.
# private), listed-company projects, and the most active producer per group.
# NOTE(review): this cell reads df_past6['主题'], which is only assigned in a
# later cell of the notebook -- confirm the intended cell order.

def _most_active_sentence(frame, producer_label):
  """Return a sentence naming the producer with the most films in `frame`.

  Args:
    frame: rows with the columns '备案单位', '片名' and '主题'.
    producer_label: text inserted before '出品方' (e.g. '官方' or '民营').

  Returns:
    The sentence, or '' when `frame` is empty or no producer has more than
    one film.
  """
  counts = frame.groupby('备案单位')['片名'].count().sort_values(ascending=False)
  if counts.empty or counts.iloc[0] <= 1:
    return ''
  top_co = counts.index[0]
  topics = frame.loc[frame['备案单位'] == top_co, '主题'].unique()
  return '最活跃的{}出品方是{}，共计{}部影片，内容题材有{}。'.format(
      producer_label, top_co, counts.iloc[0], '、'.join(topics))


# index labels of the films flagged as listed-company projects (empty here)
ids = []
df_past6['focus'] = False
df_past6.loc[ids, 'focus'] = True
df_focus_narrowed = df_past6.loc[ids].copy()

# Official / private split, computed locally so this cell does not depend on
# df_gvt and df_private, which are only created further down the notebook.
is_gvt = df_past6['备案单位'].str.contains('|'.join(ls_co_gvt))
rows_gvt = df_past6[is_gvt]
rows_private = df_past6[~is_gvt]

T3 = '按出品单位性质划分，'
# Fixed: ngvt used to be computed with the negated mask, i.e. it held the
# number of NON-official films while being printed as the official count.
ngvt = rows_gvt.shape[0]
T3 += '官方机构出品的预计有{}部，'.format(ngvt)
T3 += '民营公司出品的预计有{}部，'.format(df_past6.shape[0]-ngvt)

if df_focus_narrowed.shape[0] == 0:
  T3 += '没有识别出上市公司关联项目。'
else:
  T3 += '与上市影视公司相关联的作品{}部，'.format(df_focus_narrowed.shape[0])

  # Fixed: the share used the undefined name df_focus.
  T3 += '占比{}%。'.format(
      round((df_focus_narrowed.shape[0]/df_past6.shape[0]*100),2))
  T3 += '结合题材与出品方实力，慷田AI聚焦关注'

  # Fixed: the title list used to end with `T3 = T5.rstrip('、')`, which
  # replaced T3 by another paragraph; joining the titles needs no stripping.
  T3 += '、'.join('《{}》'.format(title) for title in df_focus_narrowed['片名'])
  T3 += '，题材覆盖{}'.format('、'.join(df_focus_narrowed['类型'].unique()))
  T3 += '。'

T3 += _most_active_sentence(rows_gvt, '官方')
T3 += _most_active_sentence(rows_private, '民营')

print(T3)

## [2.4] 按影片内容体现的年代划分，当代32部、古代8部、现代6部以及近代6部。

In [None]:
# Films per era ('年代'), largest group first.
df_n_time = (
    df_past6.groupby('年代')['年代']
    .count()
    .rename('数量')
    .reset_index()
    .sort_values('数量', ascending=False)
)

# Bar chart of the counts, saved as a PNG under path_img.
plt.clf()
plt.rcParams.update({'figure.figsize': [8, 4.5], 'axes.facecolor': 'white'})
ax = df_n_time.plot(kind='bar', grid=True, fontsize=22, rot=0, color=['violet'])
ax.set_title("年代",fontsize= 24, pad=20)
for side in ('top', 'bottom', 'left', 'right'):
  ax.spines[side].set_color('black')
ax.grid(color='gray', linestyle='-', linewidth=0.5)
ax.set_xlabel('年代',fontsize= 18)
ax.set_xticklabels(df_n_time['年代'])
ax.set_ylabel("数量",fontsize= 18)
ax.legend(fontsize=22)

fp_plot_time = path_img + '/df_RegDoct_plot_time_{}.png'.format(issue_name)
plt.savefig(fp_plot_time)

plt.show()

## [2.5] 按选题划分，人物聚焦14部、戏曲10部、人文艺术4部、工业建设4部、人文体育3部、人文游记3部、战争与革命3部、乡村2部、人文自然2部、人文风光2部、农业2部、城市变迁1部、演出1部以及考古1部

In [None]:
# Paragraph T4: number of films per era, in the order of df_n_time.
T4 = ''
T4 += '按影片内容体现的年代划分，'
n_rows = df_n_time.shape[0]
# Fixed: iterrows() yields index *labels*. df_n_time is sorted after
# reset_index, so its labels are no longer positions, and the '以及' joiner
# was attached to whichever row happened to carry the label n-2. enumerate()
# supplies the real position.
for pos, (_, row) in enumerate(df_n_time.iterrows()):
  if pos == n_rows-2:
    T4 += '{}{}部以及'.format(row['年代'], row['数量'])
  else:
    T4 += '{}{}部、'.format(row['年代'], row['数量'])

T4 = T4.rstrip('、') + '。'

print(T4)

In [None]:
# Hand-assigned topic for every film, one entry per row of df_past6 in row
# order; the column assignment at the end requires the lengths to match.
ls_topic = ['戏曲',
       '戏曲',
       '戏曲',
       '戏曲',
       '戏曲',
       '戏曲',
       '戏曲',
       '戏曲',
       '戏曲',
       '戏曲',
       '乡村',
       '演出与活动',
       '人物聚焦',
       '人物聚焦',
       '人物聚焦',
       '人物聚焦',
       '人物聚焦',
       '人物聚焦',
       '工业建设',
       '农业',
       '人物聚焦',
       '人文游记',
       '人文体育',
       '乡村',
       '人文体育',
       '战争与革命',
       '工业建设',
       '人物聚焦',
       '人文游记',
       '战争与革命',
       '人文风光',
       '考古',
       '人物聚焦',
       '人文自然',
       '人文自然',
       '人物聚焦',
       '城市变迁',
       '农业',
       '人文体育',
       '人文风光',
       '人物聚焦',
       '人文艺术',
       '人文艺术',
       '工业建设',
       '战争与革命',
       '人文艺术',
       '人物聚焦',
       '人文艺术',
       '人物聚焦',
       '人文游记',
       '人物聚焦',
       '工业建设']
# Store the topics as the '主题' column.
df_past6['主题'] = ls_topic

In [None]:
# Films per topic ('主题'), largest group first.
df_n_topic = df_past6.groupby('主题')['主题'].count().rename('数量'
  ).reset_index().sort_values('数量', ascending=False)

# Bar chart of the counts, saved as a PNG under path_img.
plt.clf()
plt.rcParams['figure.figsize'] = [12, 7]
plt.rcParams['axes.facecolor'] = 'white'
ax = df_n_topic.plot(
    kind = 'bar',
    grid = True,
    fontsize = 22,
    rot = 0,
    color = ['violet'],
)
# Fixed: the title was copied from the era chart ("年代"); this chart shows
# topics, matching its x-axis label.
ax.set_title("主题",fontsize= 24, pad=20)
ax.spines['top'].set_color('black')
ax.spines['bottom'].set_color('black')
ax.spines['left'].set_color('black')
ax.spines['right'].set_color('black')
ax.grid(color='gray', linestyle='-', linewidth=0.5)
ax.set_xlabel('主题',fontsize= 18)
ax.set_xticklabels(df_n_topic['主题'], fontsize= 16, rotation=45)
ax.set_ylabel("数量",fontsize= 18)
ax.legend(fontsize=22)

fp_plot_topic = path_img + '/df_RegDoct_plot_topic_{}.png'.format(issue_name)
plt.savefig(fp_plot_topic)

plt.show()

In [None]:
# Paragraph T5: number of films per topic, in the order of df_n_topic.
T5 = ''
T5 += '按选题划分，'
n_rows = df_n_topic.shape[0]
# Fixed: iterrows() yields index *labels*. df_n_topic is sorted after
# reset_index, so its labels are no longer positions, and the '以及' joiner
# was attached to whichever row happened to carry the label n-2. enumerate()
# supplies the real position.
for pos, (_, row) in enumerate(df_n_topic.iterrows()):
  if pos == n_rows-2:
    T5 += '{}{}部以及'.format(row['主题'], row['数量'])
  else:
    T5 += '{}{}部、'.format(row['主题'], row['数量'])

T5 = T5.rstrip('、') + '。'

print(T5)

In [None]:
# View the films indexed by producer ('备案单位').
dfbyco = df_past6.set_index('备案单位')
dfbyco[['片名', '年代', '类型', '梗概', '预测片名']]

In [None]:
# List the columns currently present in df_past6.
df_past6.columns

In [None]:
# Distinct topics among the films whose producer is not in ls_co_gvt.
df_past6[~df_past6['备案单位'].str.contains('|'.join(ls_co_gvt))]['主题'].unique()

## [2.6] 生成词云图
Create Summary Word Cloud Image

In [None]:
import wordcloud
from PIL import Image
import matplotlib as mp
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# NOTE(review): these two assignments replace the path_font / path_img values
# set in the path cell near the top (Article repository) with locations under
# the Content repository. Cells that run after this one read and write images
# here -- confirm this switch is intended.
path_font = '/content/drive/MyDrive/Github/Content/tools/articles/fonts/STHUPO.TTF'
path_img = '/content/drive/MyDrive/Github/Content/tools/articles/img'

def generate_word_image(ls_words, img_name):
  """Render a word cloud shaped and coloured by an icon and save it as a PNG.

  Reads the module-level names path_img, path_font and issue_name.

  Args:
    ls_words: list of words to draw; they are joined with ',' before being
      passed to the word cloud. The first entry is also used in the output
      file name.
    img_name: file name of the icon under ``path_img + '/genre_icon'``. The
      icon supplies the colours; ``mask_<img_name>`` in the same folder
      supplies the mask.

  Returns:
    Path of the PNG file that was written.

  Raises:
    ValueError: if ``ls_words`` is empty.
  """
  if not ls_words:
    # Fail early with a clear message; ls_words[0] is needed for the file name.
    raise ValueError('ls_words must contain at least one word')

  path_mask_image = path_img + '/genre_icon/mask_{}'.format(img_name)
  path_color_image = path_img + '/genre_icon/{}'.format(img_name)

  # Load both images inside `with` blocks so the file handles are closed
  # (they were previously left open).
  with Image.open(path_mask_image) as img_mask:
    mask = np.array(img_mask)  # mask image for the cloud
  with Image.open(path_color_image) as img_color:
    coloring = np.array(img_color)  # image the word colours are sampled from
  image_colors = wordcloud.ImageColorGenerator(coloring)

  wc = wordcloud.WordCloud(
      font_path=path_font,
      # NOTE(review): per the wordcloud documentation, width and height are
      # ignored when a mask is supplied -- confirm they can be dropped.
      width = 400,
      height = 100,
      scale = 1,
      mask=mask, # set back ground mask image
      max_words=224,
      max_font_size=188,
      min_font_size=2,
      mode="RGB",
      background_color='white',
      repeat=True,
      color_func=image_colors,
  )

  wc.generate(','.join(ls_words))

  # Show the cloud inline without axes.
  plt.imshow(wc)
  plt.axis('off')
  plt.show()

  # Output name: icon base name + first word + issue name.
  fp_generated_img = path_img + '/FilmReg_Documentary_'  \
             + img_name.split('.')[0] + '_' \
             + ls_words[0] + '_' + issue_name + '.png'
  wc.to_file(fp_generated_img)

  return fp_generated_img


In [None]:
# NOTE(review): df_topics2img is first assigned in the NEXT cell, so on a
# fresh kernel (Restart & Run All) this cell raises NameError -- confirm
# whether it should be placed after that cell.
df_topics2img

In [None]:
# Icon file per distinct topic, listed in the order in which the topics first
# appear in df_past6['主题'] (one entry per distinct topic is required).
df_topics_img = [
    'low-poly-2789706_1280.png',
    'sun-1064482_1280.png',
    'rose-31436_1280.png',
    'family-2112266_1280.png',
    'colorful-1312810_1280.png',
    'carrot-33625_1280.png',
    'feet-3350837_1280.png',
    'colorful-1220745_1280.png',
    'muscle-1085672_1280.png',
    'art-2687649_1280.png',
    'graphic-4259507_1280.png',
    'colorful-1220745_1280.png',
    'colorful-1197312_1280.png',
    'low-poly-2789706_1280.png',
]

# Lookup table: one row per distinct topic with its icon in the 'img' column.
df_topics2img = (
    df_past6['主题']
    .drop_duplicates()
    .to_frame()
    .assign(img=df_topics_img)
)

In [None]:
# Films whose producer does not match any name in ls_co_gvt.
# Fixed: .copy() gives df_private its own data, so adding columns here and
# writing tgt_img row by row in the following cell no longer operates on a
# slice of df_past6 (SettingWithCopyWarning). The %%capture magic that
# swallowed this cell's output is dropped for the same reason.
df_private = df_past6[~df_past6['备案单位'].str.contains('|'.join(ls_co_gvt))].copy()

# source icon override (empty by default) and path of the generated image
df_private['src_img'] = None
df_private['tgt_img'] = None

In [None]:
# List the columns currently present in df_private.
df_private.columns

In [None]:
# One word cloud per film in df_private. Words are repeated to weight them:
# title x10, predicted title x6, main characters x3, topic x3, era x2, and
# the keywords once.
for i in df_private.index:
  film = df_private.loc[i]

  ls = [film['片名']] * 10
  ls.extend([film['预测片名']] * 6)
  ls.extend(film['主要角色'] * 3)
  ls.extend([film['主题']] * 3)
  ls.extend([film['年代']] * 2)
  ls.extend(film['kw'])

  # use the film's own source icon when set, otherwise the icon of its topic
  img_fn = film['src_img']
  if not img_fn:
    topic_icons = df_topics2img.loc[df_topics2img['主题'] == film['主题'], 'img']
    img_fn = topic_icons.iloc[0]
    print(film['主题'])

  fp_generated_img = generate_word_image(ls, img_fn)
  df_private.loc[i, 'tgt_img'] = fp_generated_img

In [None]:
# Films whose producer matches one of the names in ls_co_gvt.
df_gvt = df_past6[df_past6['备案单位'].str.contains('|'.join(ls_co_gvt))]

# Attach each producer's film count as the column '数量' and put the most
# productive producers first.
df_gvt_count = df_gvt.groupby('备案单位')['片名'].count().rename(
    '数量').reset_index()
df_gvt = df_gvt.merge(df_gvt_count, on='备案单位', how='left')
# Fixed: ascending was the string 'False', which is not the boolean False
# (a non-empty string is truthy, and newer pandas rejects non-bool values).
df_gvt = df_gvt.sort_values('数量', ascending=False)

df_gvt.columns

In [None]:
# Empty image columns; tgt_img is filled by the word-cloud loop in the next cell.
df_gvt['src_img'] = None
df_gvt['tgt_img'] = None

In [None]:
# Fixed: `random` was never imported anywhere in the notebook, so the
# random.choice call below raised NameError on a fresh kernel.
import random

# One word cloud per official producer, built from all of its films. Words
# are repeated to weight them: titles x10, predicted titles x6, topics x3,
# eras x3, main characters x3, keywords x3.
for _, row in df_gvt[['备案单位', '数量']].drop_duplicates().sort_values(
    '数量', ascending=False).iterrows():
  # rows of this producer, selected once instead of once per column
  is_co = df_gvt['备案单位'] == row['备案单位']
  films = df_gvt.loc[is_co]

  ls = films['片名'].to_list()*10
  ls += films['预测片名'].to_list()*6
  ls += films['主题'].to_list()*3
  ls += films['年代'].to_list()*3
  # '主要角色' and 'kw' hold one list per film; flatten before weighting
  ls += [name for names in films['主要角色'] for name in names]*3
  ls += [kw for kws in films['kw'] for kw in kws]*3

  # The icon is that of a randomly picked film's topic; no seed is set, so
  # the choice differs between runs.
  img_choice = random.choice(films['主题'].to_list())

  img_fn = df_topics2img.loc[
    df_topics2img['主题'] == img_choice,
    'img'
  ].iloc[0]

  fp_generated_img = generate_word_image(ls, img_fn)

  # every film of the producer points at the same generated image
  df_gvt.loc[is_co, 'tgt_img'] = fp_generated_img
  print(ls, img_fn)


In [None]:
# Film count per topic among df_private, most frequent first. Despite the
# `ls_` prefix this is a DataFrame at this point.
ls_topic_sorted = df_private.groupby('主题')['片名'].count().sort_values(
    ascending=False).reset_index()

ls_topic_sorted

## [2.7A] 生成图文内容，独立

In [None]:
from IPython.display import Image as Img

# topics of the independent films, most frequent first
ls_topic_sorted = (
    df_private.groupby('主题')['片名']
    .count()
    .sort_values(ascending=False)
    .reset_index()['主题']
    .tolist()
)

# Paragraph T6, followed by a preview of every film grouped by topic:
# title and writer, producer, generated word cloud.
T6 = ''
T6 += '独立纪录片题材丰富，内容包括了{}'.format('、'.join(ls_topic_sorted))
T6 += '。'
print(T6)
for topic in ls_topic_sorted:
  print(topic)
  dftmp = df_private[df_private['主题'] == topic]
  for _, row in dftmp.iterrows():
    print('《{}》，{}'.format(row['片名'], row['编剧']))
    print(row['备案单位'])
    display(Img(row['tgt_img'], width=300))

print(T6)

## [2.7B] 生成图文内容，官方

In [None]:
# Paragraph T7 and a preview per official producer: its films with writers,
# then the producer's word cloud.
T7 = ''

# producers ordered by number of films, most productive first
ls_co_sorted = df_gvt.groupby('备案单位')['片名'].count().sort_values(
    ascending=False).reset_index()
ls_co_sorted = ls_co_sorted['备案单位'].tolist()

# Fixed: the sentence about official producers used ls_topic_sorted, which
# holds the topics of the independent films (df_private). The topics are now
# taken from df_gvt, most frequent first.
ls_gvt_topic_sorted = df_gvt.groupby('主题')['片名'].count().sort_values(
    ascending=False).index.tolist()

T7 += '官方制作机构分工明确，制作内容各有特色，包括了{}'.format(
    '、'.join(ls_gvt_topic_sorted))
T7 += '。'
print(T7)
for co in ls_co_sorted:
  dftmp = df_gvt[df_gvt['备案单位'] == co]
  print('\n{}，{}部'.format(co, dftmp.shape[0]))
  for j, row in dftmp[['片名', '编剧']].iterrows():
    print('《{}》，{}；'.format(row['片名'], row['编剧']))

  # all films of a producer share one image, so the first row is enough
  display(Img(dftmp['tgt_img'].iloc[0], width=300))


## [2.7] 生成摘要
2021年4月28日，3月的电影备案公示发布，其中最遥远的是影剧备字〔2021〕第1360号的《幕后英雄》，最近期的是影特备字〔2021〕第006号的《熊猫传奇——黑洞之吻》，慷田AI聚焦、重点关注的项目有中国电影的《发明一个夏天》、光线的《计划外的姐弟恋》、《二郎神》、《土行孙之破土重生》、爱奇艺的《日常警事》以及阿里的《无价之宝》。

In [None]:
# Summary S0: issue name, number of films and producers, official vs.
# independent split, and whether listed-company projects were found.
S0 = ''
S0 += issue_name

S0 += '的纪录片电影备案'
S0 += '共计{}部影片，'.format(df_past6.shape[0])
S0 += '来自{}个单位，'.format(len(df_past6['备案单位'].unique()))
# Fixed: ngvt used to be computed with the negated mask, i.e. it held the
# number of NON-official films while being printed as the official count.
ngvt = df_past6[df_past6['备案单位'].str.contains('|'.join(ls_co_gvt))].shape[0]
S0 += '官方出品{}部，'.format(ngvt)
S0 += '独立出品{}部，'.format(df_past6.shape[0]-ngvt)

if df_focus_narrowed.shape[0] == 0:
  S0 += '无上市公司关联项目。'
else:
  S0 += '其中与上市影视公司关联的项目有{}部，'.format(df_focus_narrowed.shape[0])

# The string literal below is disabled code kept for reference; it is never
# executed. NOTE(review): it refers to df_f, which no cell shown in this
# notebook defines.
"""
S0 += '慷田AI聚焦关注的项目有'
for i, row in df_f.iterrows():
  if i == df_f.shape[0]-1:
    S0 = S0.rstrip('、')
    S0 += '以及{co}的{film}'.format(co=row['单位简称'], film=row['关注影片'])
  else:
    S0 += '{co}的{film}、'.format(co=row['单位简称'], film=row['关注影片'])
S0 += '。'
"""
print(S0)

## [2.8] 指向电影剧官网


In [None]:
#####
# Closing paragraph T8: method note and the source of the registration data.
T8 = '\n\n'
T8 += '慷田AI结合自主调研及多方大数据比对，通过分析、建模，提炼关键信息。'
T8 += '电影立项备案公示信息来自中国国家电影局 China Film Administration, 官方网址 '
T8 += ' http://www.chinafilm.gov.cn/chinafilm 。'

# Fixed: this cell builds T8 but printed T6.
print(T8)

## [2.9] Save or load df_{issue_name}

In [None]:
# Show the issue name that the file names below are built from.
issue_name

In [None]:
# Save records for current reg issue
# Pickles the full frame and the private / official subsets under
# path_records, each with issue_name in the file name.
path_records = '/content/drive/MyDrive/Github/Content/tools/articles/records'
df_past6.to_pickle(path_records + '/df_RegDocPast6_{}.pkl'.format(issue_name))
df_private.to_pickle(path_records + '/df_RegDoc_private_{}.pkl'.format(issue_name))
df_gvt.to_pickle(path_records + '/df_RegDoc_gvt_{}.pkl'.format(issue_name))

In [None]:
# Reload df_past6 from the pickle written by the save cell above.
# Fixed: this used to read '/df_DocumentaryReg_{}.pkl', a name the save cell
# never writes (it writes '/df_RegDocPast6_{}.pkl').
df_past6 = pd.read_pickle(path_records + '/df_RegDocPast6_{}.pkl'.format(issue_name))

# [3] Output Word Document

In [None]:
# Number of films per topic among df_private.
df_private.groupby('主题')['片名'].count()

## 3.1 Install and Load Libraries

In [None]:
%%capture
# Install python-docx and lxml, then import the two names used to build the
# Word documents. %%capture (which has to stay on the first line of the cell)
# suppresses the installer output.
!pip install python-docx
!pip install lxml
from docx import Document
from docx.shared import Inches
#from docx.text.parargaph import Paragraph

## 3.2 Output Docx


In [None]:
# Preview the first generated image of dftmp. dftmp is a leftover loop
# variable, so it holds whatever the most recently run loop assigned to it.
display(Img(dftmp['tgt_img'].iloc[0], width=300))

In [None]:
# Show the topic paragraph built earlier.
T5

In [None]:
# Assemble the overview article as a Word document: title, summary, the text
# paragraphs with their charts, then one section per topic (independent
# films) and one per producer (official films), each with word clouds.
path_doc = '/content/drive/MyDrive/Github/Content/tools/articles/docx'

doc = Document()
doc.core_properties.title = 'China Film Documentary Registration Overview ' \
  + issue_name

doc.add_heading(T1, 0)

p = doc.add_paragraph(S0, style='Intense Quote')

doc.add_paragraph(T2)

doc.add_paragraph(T3)

doc.add_paragraph(T4)
doc.add_picture(fp_plot_time, width =Inches(5))

doc.add_paragraph(T5)
doc.add_picture(fp_plot_topic, width =Inches(5))

# Independent films, grouped by topic (most frequent topic first).
doc.add_paragraph(T6)
ls_topic_sorted = df_private.groupby('主题')['片名'].count().sort_values(
    ascending=False).reset_index()
ls_topic_sorted = ls_topic_sorted['主题'].tolist()
for topic in ls_topic_sorted:
  # Fixed: the filter used str.contains(topic), which treats the topic as a
  # regular expression and also matches topics that merely contain it; exact
  # equality matches the grouping used for the notebook preview of T6.
  dftmp = df_private[df_private['主题'] == topic]
  doc.add_heading(topic, 1)
  for i, row in dftmp.iterrows():
    txt = '《{}》，{}\n'.format(row['片名'], row['编剧'])
    txt += '{}\n'.format(row['备案单位'])
    doc.add_paragraph(txt)
    doc.add_picture(row['tgt_img'], width =Inches(4))

# Official films, grouped by producer (most productive first).
doc.add_paragraph(T7)
ls_co_sorted = df_gvt.groupby('备案单位')['片名'].count().sort_values(
    ascending=False).reset_index()
ls_co_sorted = ls_co_sorted['备案单位'].tolist()
for co in ls_co_sorted:
  dftmp = df_gvt[df_gvt['备案单位'] == co]
  doc.add_heading( '{}，{}部'.format(co, dftmp.shape[0]), 1)
  txt = ''
  for j, row in dftmp[['片名', '编剧']].iterrows():
    txt += '《{}》，{}；'.format(row['片名'], row['编剧'])
  txt = txt.rstrip('；')
  doc.add_paragraph(txt)
  # all films of a producer share one image, so the first row is enough
  doc.add_picture(dftmp['tgt_img'].iloc[0], width =Inches(5))


doc.add_paragraph(T8)

doc.save(path_doc + '/ChinaFilm_DocumentaryReg_' + issue_name + '.docx' )


In [None]:
# Write a second Word document: a one-column table with six rows per film
# (numbered title, writer, producer, topic and era, synopsis, blank spacer).
path_doc = '/content/drive/MyDrive/Github/Content/tools/articles/docx'

doc = Document()
doc.core_properties.title = 'China Film Documentary Registration Table' \
  + issue_name

doc.add_heading('{}纪录片备案汇总'.format(issue_name), 0)

p = doc.add_paragraph(S0, style='Intense Quote')

# NOTE(review): rows=1 creates one row that the loop never fills -- confirm
# whether it is meant as a header row.
table = doc.add_table(rows=1, cols=1, style='Light List Accent 5')
# enumerate() replaces the manual counter; the former '《'.format(i) call had
# no placeholder and therefore no effect, so it is dropped.
for n, (_, row) in enumerate(df_past6.iterrows(), start=1):
  cell_texts = [
      '[' + str(n) + ']' + '《' + row['片名'] + '》',
      '编剧：' + row['编剧'],
      '备案单位：' + row['备案单位'],
      '主题：{}  年代：{}'.format(row['主题'], row['年代']),
      row['梗概'],
      '',  # blank spacer row between films
  ]
  for text in cell_texts:
    table.add_row().cells[0].text = text

doc.add_paragraph(T8)

doc.save(path_doc + '/ChinaFilm_DocumentaryReg_Table_' + issue_name + '.docx' )
