<a href="https://colab.research.google.com/github/CT-Cultures/Content/blob/master/RegOverview_Issue.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount drive and install requirements

In [None]:
# Mount Drive
import os
import sys
import gc

from google.colab import drive
# Mount Google Drive at /content/drive; the repository checkouts and records
# used by the rest of the notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# Install requirements in this cell,
# then restart runtime after initial installation

# fetch Content from github and install requirements
path_Content = "/content/drive/MyDrive/Github/Content" ###
if not os.path.exists(path_Content):
  !git clone https://github.com/CT-Cultures/Content.git {path_Content}
os.chdir(path_Content)
!pip install -r requirements.txt

path_Article = "/content/drive/Mydrive/Github/Article" ###
if not os.path.exists(path_Article):
  !git clone https://github.com/CT-Cultures/Article.git {path_Article}

In [None]:
# Install Chromedriver
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
# Copy the driver binary into /usr/bin so it can be started by name.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
# NOTE(review): sys.path only affects Python module lookup, so this entry
# probably has no effect on locating the driver binary - confirm before relying on it.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [None]:
# Check Environment
# Print library versions and basic runtime information (interpreter, GPU) so
# a stale environment is noticed before the analysis starts.
import pandas as pd
import transformers
import nltk
print('pandas version: {}, (>= 1.3.2)'.format(pd.__version__)) # pandas has to be >= 1.3.2; restart the runtime if it is older
print('transformers version: {}'.format(transformers.__version__))
#print('nltk version: {}, (>=3.3)'.format(nltk.__version__))

# Interpreter location and version, then GPU status
!which python
!python --version
!nvidia-smi

# Import libraries and set path

In [None]:
#Load Libraries Global
import os
import datetime as dt
import re
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup

from selenium import webdriver

import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.font_manager as fm

path_fonts = path_Article + '/fonts'
fontprop = fm.FontProperties(fname=path_fonts, size= 15)

font_dirs = [path_fonts, ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)
font_list = fm.createFontList(font_files)
for font in font_files:
  fm.fontManager.addfont(font)

plt.rcParams['figure.figsize'] = [15, 9]
mp.rcParams['font.family'] = ['Microsoft YaHei']

%matplotlib inline
print(mp.get_cachedir())

In [None]:
# Set path
# Article repository: working directory, title font file and image folder
path_Article = '/content/drive/MyDrive/Github/Article'
path_wd = path_Article
path_font = path_Article + '/fonts/STHUPO.TTF'
path_img = path_Article + '/img'

# Content repository and the per-source tool folders inside it
path_Content = '/content/drive/MyDrive/Github/Content'
path_ChinaFilm, path_NRTA, path_ZGDYPW = (
    path_Content + '/tools/sources/' + source_name
    for source_name in ('ChinaFilm', 'NRTA', 'ZGDYPW')
)

# Work from the Article repository by default
os.chdir(path_wd)

In [None]:
# Load Libraries Local
# Each local module is imported from inside its own repository folder, so the
# working directory is switched before every import below.
%load_ext autoreload
# NOTE(review): the extension is loaded but never switched on; the line below is
# commented out and "%autoreload 2" is the usual way to enable it - confirm.
#%reload_ext autoreload 2

os.chdir(path_ChinaFilm)
from Record_Registration import Registration
# Headless Chrome options for running Selenium without a display
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize Registration Class
driver = webdriver.Chrome('chromedriver',options=chrome_options)
dy_reg = Registration(driver)

# import utils for making plots and wordclouds
os.chdir(path_wd)
from generate import utils

# import tools for prediction
os.chdir(path_Content)
import predict

# [Import Datasets]

### Update records

In [None]:
# Update ChinaFilm records to reflect latest changes
# Only the working directory is changed here; both update scripts are disabled.
os.chdir(path_ChinaFilm)

# bypass update in colab, because the connection
# from google to ChinaFilm's server is too slow
#!python update_release.py
#!python update_registration.py

### Import Records

In [None]:
# Import ChinaFilm registration
# Read the scraped registration records from JSON, then pass them through the
# Registration helper's Refined_Records step.
df = pd.read_json(path_ChinaFilm + '/records/contents_of_registrations.json')
df = dy_reg.Refined_Records(df) #clean up records

In [None]:
# Load current issue (most recent one)
# The newest batch is the first row after sorting by publication date and then
# batch name, both descending.
curr_issue_name = df.sort_values(['公示日期', '公示批次名称'], ascending=False)['公示批次名称'].iloc[0]
# Take an explicit copy: later cells add and overwrite columns on df_curr, and
# writing to a bare slice of df raises SettingWithCopyWarning and is not
# guaranteed to stick.
df_curr = df.loc[df.公示批次名称 == curr_issue_name].copy()
# '公示批次起始' of the first record is read positionally:
# [0] year, [1] month, [2] part of the month.
issue_start = df_curr['公示批次起始'].iloc[0]
issue_name = issue_start[0] + '年' + issue_start[1] + '月' + issue_start[2]
print(issue_name)

# [Process Datasets]

## Predict and extract features

In [None]:
os.chdir(path_Content) # change to the Content directory

# Batch predictors: each takes the list of synopses and fills one column.
# Order: alternative title, genre, time period.
for target_column, batch_predictor in (
    ('预测片名', predict.predict_title),
    ('类型', predict.predict_genre),
    ('年代', predict.predict_time),
):
  df_curr[target_column] = batch_predictor(df_curr['梗概'].tolist())

# Per-synopsis extractors, applied row by row.
# Keywords: top 10 per synopsis.
df_curr['kw'] = df_curr['梗概'].apply(predict.extract_keywords, topK=10)

# Main characters named in the synopsis.
df_curr['主要角色'] = df_curr['梗概'].apply(predict.identify_characters)

## Interactively adjust features

In [None]:
# adjust genre
# '类型_ext' starts as a copy of the predicted genre and is then corrected by
# hand in the cells below; '类型' keeps the raw prediction.
df_curr['类型_ext'] = df_curr['类型'].copy()

In [None]:
# Start the manual review at the first row; `batch` rows are shown at a time.
i = 0
batch = 10
df_curr['类型_ext'][i:i+batch].to_numpy()

In [None]:
# Advance to the next batch. Re-running this cell moves the window forward
# again, so it is not idempotent.
i+= batch
print('{} - {}'.format(i, i+batch))
df_curr['类型_ext'][i:i+batch].to_numpy() # copy the output into the next cell to modify the genres

In [None]:
# Overwrite the genres of the batch currently under review with the manually
# corrected labels (the edited output of the previous cell). The list must
# contain exactly as many labels as there are rows in i:i+batch.
# One positional .iloc call on the frame is used instead of chained indexing
# (df[col].iloc[...] = ...), which may assign to a temporary object and never
# writes through when pandas Copy-on-Write is enabled.
df_curr.iloc[i:i+batch, df_curr.columns.get_loc('类型_ext')] = \
['动物', '科幻', '科幻', '青少', '科幻']

In [None]:
# Run this cell, then click "make interactive table" at the lower left of the
# output to edit; shows the predicted and the adjusted genre next to the synopsis.
df_curr[['片名', '类型', '类型_ext', '梗概']]

## Save / Load Reg_Issue

In [None]:
# Save records for current release issue
#df_curr.to_pickle(path_wd + '/records/df_reg_{}.pkl'.format(issue_name))

In [None]:
# Reload the saved records for this issue, replacing the df_curr built above.
# NOTE(review): the matching save in the previous cell is commented out, so the
# pickle must already exist; only unpickle files you created yourself.
df_curr = pd.read_pickle(path_wd + '/records/df_reg_{}.pkl'.format(issue_name))

# [Generate Content]

## 【T1】 Title

In [None]:
# Title
# '公示批次起始' of the first record, read positionally:
# [0] year, [1] month, [2] part of the month.
batch_start = df_curr.iloc[0]['公示批次起始']
title_parts = [
    '{year}年'.format(year=batch_start[0]),
    '{month}月'.format(month=batch_start[1]),
]
# The part-of-month label is left out when the batch covers the whole month.
if batch_start[2] != '整月':
  title_parts.append('{duration}'.format(duration=batch_start[2]))
title_parts.append('电影备案公示划重点')
T1 = ''.join(title_parts)
print(T1)

## 【P1】	oldest/newest reg

In [None]:
# Obtain Variables
# pd.to_datetime replaces .astype('datetime64'): the unit-less dtype string is
# rejected by pandas 2.x, and assigning through .loc[:, col] can keep the
# column's previous dtype instead of converting it.
df_curr['公示日期'] = pd.to_datetime(df_curr['公示日期'])
pub_date = df_curr.iloc[0]['公示日期']
pub_year = pub_date.year
pub_month = pub_date.month
pub_day = pub_date.day
df_curr['备案申请年份'] = df_curr['备案申请年份'].astype('int')
# Oldest filing first: ordered by application year, then by the yearly sequence number.
df_curr_sorted = df_curr.sort_values(
    ['备案申请年份', '备案立项年度顺序号'], ascending=True
).reset_index(drop=True)
oldest_reg = df_curr_sorted.iloc[0]
newest_reg = df_curr_sorted.iloc[-1]
batch_start = df_curr.iloc[0]['公示批次起始']

# Write Content
P1 = ''
P1 += '{year}年{month}月{day}日，'.format(year=pub_year, month=pub_month, day=pub_day)
P1 += '{month}月{part_of_month}的备案公示新鲜出炉，'.format(
    month=batch_start[1], part_of_month=batch_start[2])
P1 += '共计影片{}部！'.format(df_curr.shape[0])
P1 += '这一批次中，最遥远的项目是《{}》，'.format(oldest_reg['片名'])
P1 += '备案号为{}，'.format(oldest_reg['备案立项号'])
P1 += '最近期的项目是《{}》，'.format(newest_reg['片名'])
P1 += '备案号为{}。'.format(newest_reg['备案立项号'])
print(P1)

## 【P2】type overview

In [None]:
# Plot the breakdown by registration category; the returned figure path is
# embedded in the Word document later and the counts feed paragraph P2.
fp_plot_type, df_by_type = utils.plot_type(df_curr, issue_name, return_df=True)

In [None]:
# Sentence listing how many titles were filed in each registration category.
# NOTE(review): df_by_type comes from utils.plot_type; after reset_index() it is
# read as having '电影类别' and '数量' columns - confirm against utils.
type_counts = df_by_type.reset_index()
type_phrases = [
    '{type}{n}部'.format(type=row['电影类别'], n=row['数量'])
    for _, row in type_counts.iterrows()
]

P2 = '按备案类别划分，本次完成备案'
if len(type_phrases) > 1:
  # "A、B以及C，": the last category is joined with "以及"
  P2 += '、'.join(type_phrases[:-1]) + '以及' + type_phrases[-1] + '，'
elif type_phrases:
  # a single category needs no "以及" connector
  P2 += type_phrases[0] + '，'

# Sum the count column explicitly so the total is a scalar whether df_by_type
# is a Series or a DataFrame (DataFrame.sum() would format as a whole Series).
P2 += '共计{}部。'.format(type_counts['数量'].sum())
print(P2)

## 【P3】genre overview

In [None]:
# Plot the stacked genre breakdown for the whole issue; the returned figure
# path goes into the Word document and the counts feed paragraph P3.
fp_plot_genre, df_by_genre = utils.plot_genre(df_curr, 
                                             'ALL_{}'.format(issue_name), 
                                             stacked=True, 
                                             return_df=True)

In [None]:
# Sentence naming three genres of feature films ('故事影片').
# NOTE(review): the slice [-4:-1][::-1] takes the 2nd- to 4th-from-last entries
# in reverse and skips the last one; whether that is really the "top three"
# depends on how utils.plot_genre orders df_by_genre - confirm.
P3 = ''
P3 = '故事片中，备案占前三的类型依次为{}。'.format(
    '、'.join(df_by_genre['故事影片'][-4:-1][::-1].index))

print(P3)

## 【P4】time period overview

In [None]:
# Plot the breakdown by time period; the returned figure path is embedded in
# the Word document later.
fp_plot_time, df_by_time = utils.plot_time(df_curr, issue_name, return_df=True)

In [None]:
# Titles per time period, most frequent period first.
df_time_sorted = (
    df_curr.groupby('年代')['片名']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
df_time_sorted.columns = ['年代', '数量']

# The wording names the contemporary period as the leader; the number reported
# is the count of whichever period ranks first.
P4 = '按年代划分，当代题材占主力位置，' + '共计{}部。'.format(df_time_sorted['数量'][0])
print(P4)

## 【P5】public company affiliation

In [None]:
# Distinct filing companies across all records (not only the current issue).
ls_co = df['备案单位'].unique()

In [None]:
# 本批次中与上市影视公司关联的项目有
info_public_film_co = pd.read_csv('/content/drive/MyDrive/Github/Article/reference/info_public_film_co.csv', index_col=0, encoding='utf-8-sig')
ls = info_public_film_co['公司简称'].apply(eval).sum()
pat_public = '|'.join(ls)
pat_public += '|阿里|腾讯|爱奇艺|英皇|寰亚|银都|美亚|大盛|儒意|灿星|横店'
df_focus = df_curr.loc[df_curr['备案单位'].str.contains(pat_public), :]

In [None]:
# Review the matched projects; the index labels shown here are what the next
# cell's `ids` list refers to.
cols = ['片名', '备案单位', '电影类别', '类型_ext', '梗概']
df_focus[cols]

In [None]:
# Hand-picked index labels (in df_curr) of the projects to spotlight in this issue
ids = [40,28, 194, 144, 113, 58, 38, 156]
df_curr['focus'] = False
df_curr.loc[ids, 'focus'] = True
df_focus_narrowed = df_curr.loc[ids, :].copy()

# Figures and lists quoted in the paragraph
n_affiliated = df_focus.shape[0]
affiliated_pct = round((n_affiliated/df_curr.shape[0]*100),2)
spotlight_titles = '、'.join(
    '《{}》'.format(title) for title in df_focus_narrowed['片名'])
spotlight_genres = '、'.join(df_focus_narrowed['类型'].unique())

# Paragraph: number and share of affiliated projects, the spotlighted titles,
# and the genres they cover.
P5 = ''.join([
    '本批次中，大数据分析识别出{}部上市影视公司关联项目，'.format(n_affiliated),
    '占比{}%。'.format(affiliated_pct),
    '结合题材与出品方实力，慷田AI聚焦关注',
    spotlight_titles,
    '，题材覆盖{}'.format(spotlight_genres),
    '。',
])

print(P5)

## 【P5wc】 generate wordcloud image
Create summary word cloud images

In [None]:
from IPython.display import Image as Img
from PIL import Image as pil

# Image folders in the Article repository
path_posters = '/content/drive/MyDrive/Github/Article/img/posters'
path_icon = '/content/drive/MyDrive/Github/Article/img/genre_icon'

# Lookup table read from JSON; a later cell uses its 'label' and 'fn' columns
# to pick an icon file for a genre label.
#df_label2image.to_json(path_records + '/df_label2image.json')
df_label2image = pd.read_json(path_wd + '/records/df_label2image.json')

In [None]:
# Display the label-to-image lookup table for inspection
df_label2image

In [None]:
# 'src_img': optional per-project source image (None falls back to the genre
# icon in the next cell); 'tgt_img': path of the generated image, filled in there.
df_focus_narrowed['src_img'] = None
df_focus_narrowed['tgt_img'] = None

In [None]:
from IPython.display import Image as Img
from PIL import Image as pil
path_img = '/content/drive/MyDrive/Github/Article/img'

# For every spotlighted project: build a word list, pick a source image, print
# a short credit text, and generate and display the word-cloud image.
for i in df_focus_narrowed.index:
  # Each term is repeated a fixed number of times; presumably the repetition
  # count acts as the term's weight inside utils.generate_word_image - confirm.
  ls = [df_focus_narrowed.loc[i, '片名']] * 20
  ls += [df_focus_narrowed.loc[i, '预测片名']] *10
  # '主要角色' and 'kw' are multiplied directly, so they are expected to be lists
  ls += df_focus_narrowed.loc[i, '主要角色']*6
  ls += [df_focus_narrowed.loc[i, '类型_ext']] *3
  ls += [df_focus_narrowed.loc[i, '年代']]*3
  ls += df_focus_narrowed.loc[i, 'kw']*2
  ls += [df_focus_narrowed.loc[i, '备案单位']]*3
  img_fn = df_focus_narrowed.loc[i, 'src_img']
  # No project-specific image: fall back to the icon registered for the genre.
  # .iloc[0] raises IndexError when the genre has no entry in df_label2image.
  if not img_fn:
    img_fn = df_label2image.loc[
      df_label2image['label'] == df_focus_narrowed.loc[i, '类型_ext'],
      'fn'
  ].iloc[0]
  txt = '《{}》'.format(df_focus_narrowed.loc[i, '片名'])
  txt += '\n编剧：{}'.format(df_focus_narrowed.loc[i, '编剧'])
  # Other titles in the full record set whose writer field is exactly the same
  writer_works = df.loc[df['编剧'] == df_focus_narrowed.loc[i, '编剧'], 
                        '片名'].to_list()
  if len(writer_works) > 1:
    # drop the current title so only the writer's other works are listed
    writer_works.remove(df_focus_narrowed.loc[i, '片名'])
    txt += '，（其它作品：《{}》）'.format('》、《'.join(writer_works))
  
  txt += '\n备案单位：{}'.format(df_focus_narrowed.loc[i, '备案单位'])
  print(txt)
  # Source image and its mask ('mask_' + file name) both live in img/genre_icon
  fp_img = path_img + '/genre_icon/{}'.format(img_fn)
  fp_mask = path_img + '/genre_icon/{}'.format('mask_' + img_fn)
  #fp_img = path_img + '/genre_icon/{}'.format('psychedelic-1084082_960_720.jpg')
  fp_generated_img = utils.generate_word_image(ls, 
                                               fp_img, 
                                               fp_mask,
                                               fp_prefix='ChinaFilm_Reg_Overview',
                                               fp_suffix=issue_name,
                                               img_width=400,
                                               )
  display(Img(fp_generated_img, width=400))
  # Remember the generated file so the Word export can embed it
  df_focus_narrowed.loc[i, 'tgt_img'] = fp_generated_img

## 【P6】Writer's other works (WIP)

In [None]:
# Work in progress: writers of the current issue who have more than one feature film on record.
# NOTE(review): the bare expression below is not the cell's last statement, so it displays nothing.
df_curr['编剧']

# All records (any issue) whose writer field also occurs in the current issue,
# restricted to feature films ('故事影片')
df_writers = df.loc[df['编剧'].isin(df_curr['编剧'].unique()),:]
df_writers = df_writers[df_writers['电影类别'] == '故事影片']
# Number of titles per writer, attached to every row as '作品数量'
df_writer_nfilms = df_writers.groupby('编剧')['片名'].count().rename('作品数量').reset_index()
df_writers = df_writers.merge(df_writer_nfilms, on='编剧', how='left')

df_writers[df_writers['作品数量'] >1]

## 【S0】 head block
2021年4月28日，3月的电影备案公示发布，其中最遥远的是影剧备字〔2021〕第1360号的《幕后英雄》，最近期的是影特备字〔2021〕第006号的《熊猫传奇——黑洞之吻》，慷田AI聚焦、重点关注的项目有中国电影的《发明一个夏天》、光线的《计划外的姐弟恋》、《二郎神》、《土行孙之破土重生》、爱奇艺的《日常警事》以及阿里的《无价之宝》。

In [None]:
# Head block S0: one-sentence summary of the issue (publication date, batch,
# total count, oldest and newest filing, spotlighted projects per company).

# Short company name = first match of the public-company pattern in the filing company
df_focus_narrowed['单位简称'] = df_focus_narrowed['备案单位'].str.extract('('+ pat_public + ")")
# One row per company with its spotlighted titles joined as 《A》、《B》
df_f = df_focus_narrowed.groupby('单位简称')['片名'].apply(
    lambda x: '、'.join('《' + x + '》')).rename('关注影片').reset_index()
#####
S0 = '{year}年{month}月{day}日，'.format(
    year=pub_year, month=pub_month, day=pub_day)

S0 +=  '{month}月'.format(month=df_curr.iloc[0]['公示批次起始'][1])
# The part-of-month label is left out when the batch covers the whole month
if df_curr.iloc[0]['公示批次起始'][2] != '整月':
  S0 += '{duration}'.format(duration=df_curr.iloc[0]['公示批次起始'][2])

S0 += '电影备案共计{}部，其中'.format(df_curr.shape[0])
# df_curr_sorted (built in the P1 cell) is ordered oldest filing first
S0 += '最遥远的是{}的《{}》，'.format(
    df_curr_sorted.loc[0, '备案立项号'], df_curr_sorted.loc[0, '片名'])
S0 += '最近期的是{}的《{}》，'.format(
    df_curr_sorted.loc[df_curr.shape[0]-1, '备案立项号'], 
    df_curr_sorted.loc[df_curr.shape[0]-1, '片名']
)

# NOTE(review): the brand is written 'ContentAI' here but '慷田AI' in P5 and
# R1-R3 - confirm which is intended.
S0 += 'ContentAI聚焦关注的项目有'
focus_phrases = [
    '{co}的{film}'.format(co=row['单位简称'], film=row['关注影片'])
    for _, row in df_f.iterrows()
]
if len(focus_phrases) > 1:
  # "A、B以及C": the last company is joined with "以及"
  S0 += '、'.join(focus_phrases[:-1]) + '以及' + focus_phrases[-1]
else:
  # zero or one company: no "以及" connector (previously "项目有以及…")
  S0 += ''.join(focus_phrases)
S0 += '。'

print(S0)

## 【R1】 point to ChinaFilm website


In [None]:
#####
R1 = '\n\n'
R1 += '慷田AI结合自主调研及多方大数据比对，通过分析、建模，提炼关键信息。'
R1 += '电影立项备案公示信息来自中国国家电影局 China Film Administration, 官方网址 '
R1 += ' http://www.chinafilm.gov.cn/chinafilm 。'

print(R1)

## 【R2，R3】 original link pointer

In [None]:
# Closing pointers to the original article: R2 ends the overview document
# (points to the detailed table), R3 ends the table document (points to the overview).
R2 = '点击左下角阅读原文查看本期慷田AI影片信息详表。'
R3 = '点击左下角阅读原文查看本期慷田AI影片概览分析。'

## Save or load df_{issue_name}

In [None]:
# Save records for current reg issue
# Writes df_curr, including the columns added in the cells above, to a pickle named after the issue.
path_records = '/content/drive/MyDrive/Github/Article/records'
df_curr.to_pickle(path_records + '/df_reg_{}.pkl'.format(issue_name))

In [None]:
# Reload the records saved for this issue (same file as written by the save cell).
# Only unpickle files you created yourself: loading a pickle can execute code.
df_curr = pd.read_pickle(path_wd + '/records/df_reg_{}.pkl'.format(issue_name))

# [Output Article in Word]

## Install and Load Libraries

In [None]:
%%capture
# The cell magic above must stay on the first line; it hides the installer output.
# Install python-docx and lxml, then import the pieces used to build the Word documents.
!pip install python-docx
!pip install lxml
from docx import Document
from docx.shared import Inches
from docx.oxml.ns import qn
#from docx.text.paragraph import Paragraph

## Write Overview to docx and download


In [None]:
# Assemble the overview article as a Word document: title, head block, the
# analysis paragraphs with their figures, one entry per spotlighted project,
# and the closing notes. The file is saved under path_doc.
path_doc = '/content/drive/MyDrive/Github/Article/docx'

doc = Document()

# Set document Font
# The East Asian font has to be set on the style's XML element as well,
# otherwise Chinese text is not rendered in the chosen font.
doc.styles['Normal'].font.name = '微软雅黑'
r = doc.styles['Normal']._element
r.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

# Title
doc.core_properties.title = 'China Film Registration Overview {}'.format(
    issue_name)

doc.add_heading(T1, 0)
p = doc.add_paragraph(S0, style='Intense Quote')

doc.add_paragraph(P1)

doc.add_picture(fp_plot_type, width =Inches(4))
doc.add_paragraph(P2)

doc.add_picture(fp_plot_genre, width =Inches(4))
doc.add_paragraph(P3)

doc.add_picture(fp_plot_time, width =Inches(4))
doc.add_paragraph(P4)

doc.add_paragraph(P5)
for i in df_focus_narrowed.index:
  title = df_focus_narrowed.loc[i, '片名']
  writer = df_focus_narrowed.loc[i, '编剧']
  txt = '\n《{}》'.format(title)
  txt += '\n编剧：{}'.format(writer)

  # All titles whose writer field contains this writer string. The match is
  # computed once, as a literal substring (regex=False) so characters such as
  # '(' or '.' in a name are not interpreted as a pattern, and records without
  # a writer are skipped (na=False).
  writer_works = df.loc[
      df['编剧'].str.contains(writer, regex=False, na=False),
      '片名'
      ].to_list()
  if len(writer_works) > 1:
    # list only the writer's other works
    if title in writer_works:
      writer_works.remove(title)
    txt += '，（其它作品：《{}》）'.format('》、《'.join(writer_works))

  txt += '\n备案单位：{}'.format(df_focus_narrowed.loc[i, '备案单位'])
  doc.add_paragraph(txt)
  # word-cloud image generated for this project in the 【P5wc】 section
  doc.add_picture(df_focus_narrowed.loc[i, 'tgt_img'], width=Inches(4))

doc.add_paragraph(R1)
doc.add_paragraph(R2)

fp_doc = path_doc + '/ChinaFilm_Reg_Overview_' + issue_name + '.docx'
doc.save(fp_doc)

In [None]:
# Download Document
from google.colab import files
files.download(fp_doc)

## Write Table to docx and download

In [None]:
# Assemble the detailed table as a Word document: title, head block, then a
# one-column table with a group of rows per registered film, and the closing
# notes. The file is saved under path_doc.
path_doc = '/content/drive/MyDrive/Github/Article/docx'

doc = Document()

# Set document Font
# The East Asian font has to be set on the style's XML element as well,
# otherwise Chinese text is not rendered in the chosen font.
doc.styles['Normal'].font.name = '微软雅黑'
r = doc.styles['Normal']._element
r.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

# Title
doc.core_properties.title = 'China Film Registration Table {}'.format(
    issue_name)

doc.add_heading('详表：{}'.format(T1), 0)
p = doc.add_paragraph(S0, style='Intense Quote')

table = doc.add_table(rows=1, cols=1, style='Light List Accent 4')

# Table order: category descending, then genre ascending. A separate name is
# used so the oldest-first df_curr_sorted that the P1/S0 cells read is not
# overwritten (re-running those cells after this one would otherwise report
# the wrong oldest/newest filing).
df_table_sorted = df_curr.sort_values(
    ['电影类别', '类型'], ascending=[False, True])
for i, (_, row) in enumerate(df_table_sorted.iterrows(), start=1):
  # Numbered title row, e.g. "[1]《...》"
  row_cells = table.add_row().cells
  row_cells[0].text = '[' + str(i) + ']' + '《' + row['片名'] + '》'

  row_cells = table.add_row().cells
  row_cells[0].text = '编剧: ' + row['编剧']
  # All titles whose writer field contains this writer string, matched once as
  # a literal substring (regex=False) and skipping records without a writer
  # (na=False).
  writer_works = df.loc[
      df['编剧'].str.contains(row['编剧'], regex=False, na=False),
      '片名'].to_list()
  if len(writer_works) > 1:
    # list only the writer's other works; the text is built inside the same
    # branch that uses it, so a value from a previous film can never leak in
    if row['片名'] in writer_works:
      writer_works.remove(row['片名'])
    row_cells = table.add_row().cells
    row_cells[0].text = '其它作品：' + '《{}》'.format('》、《'.join(writer_works))

  row_cells = table.add_row().cells
  row_cells[0].text = '备案单位: ' + row['备案单位']

  row_cells = table.add_row().cells
  row_cells[0].text = '类别：{}，类型:{} ，年代:{}'.format(
      row['电影类别'], row['类型'], row['年代'])

  row_cells = table.add_row().cells
  row_cells[0].text = row['梗概']

  # empty spacer row between films
  row_cells = table.add_row().cells
  row_cells[0].text = ''

doc.add_paragraph(R1)
doc.add_paragraph(R3)

fp_doc = path_doc + '/ChinaFilm_Reg_Table_' + issue_name + '.docx'
doc.save(fp_doc)

In [None]:
# Download Document
from google.colab import files
files.download(fp_doc)