<a href="https://colab.research.google.com/github/CT-Cultures/Content/blob/master/RegOverview_Issue.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount drive and install requirements

In [None]:
# Mount Drive
import os
import sys
import gc

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install requirements in this cell,
# then restart runtime after initial installation

# fetch Content from github and install requirements
path_Content = "/content/drive/MyDrive/Github/Content" ###
if not os.path.exists(path_Content):
  !git clone https://github.com/CT-Cultures/Content.git {path_Content}
os.chdir(path_Content)
!pip install -r requirements.txt

path_Article = "/content/drive/Mydrive/Github/Article" ###
if not os.path.exists(path_Article):
  !git clone https://github.com/CT-Cultures/Article.git {path_Article}

In [None]:
# Check Environment
import pandas as pd
import transformers
import nltk
print('pandas version: {}, (>= 1.3.2)'.format(pd.__version__)) # pd has to >= 1.3.2, restart runtime
print('transformers version: {}'.format(transformers.__version__))
#print('nltk version: {}, (>=3.3)'.format(nltk.__version__))

!which python
!python --version
!nvidia-smi

In [None]:
%%capture
# Install the scraping dependencies; %%capture hides the installer output.
# The requirements path below is relative, so this cell relies on the working
# directory still being path_Content (set by os.chdir in the install cell),
# assuming the cells are run in order.
!pip install -r sources/ChinaFilm/requirements.txt
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
# Copy the driver binary into /usr/bin (presumably so selenium can find it on
# PATH -- confirm).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
# NOTE(review): this puts the chromedriver *file* path on Python's module
# search path; verify whether it is actually needed.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

# NOTE(review): transformers is already imported by the environment-check cell
# above, so this install looks redundant -- confirm before removing.
!pip install transformers

# Import libraries and set path

In [None]:
#Load Libraries Global
import os
import datetime as dt
import re
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup

from selenium import webdriver

import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.font_manager as fm

path_fonts = '/content/drive/MyDrive/Github/Article/fonts'
fontprop = fm.FontProperties(fname=path_fonts, size= 15)

font_dirs = [path_fonts, ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)
font_list = fm.createFontList(font_files)
for font in font_files:
  fm.fontManager.addfont(font)

plt.rcParams['figure.figsize'] = [15, 9]
mp.rcParams['font.family'] = ['Microsoft YaHei']

%matplotlib inline
print(mp.get_cachedir())

In [None]:
# Path configuration. Everything under the Article checkout is derived from
# path_Article so the root only has to be edited in one place.
path_Article = '/content/drive/MyDrive/Github/Article'
path_wd = path_Article  # working directory used by the article cells
path_Article_records = path_Article + '/records'

path_font = path_Article + '/fonts/STHUPO.TTF'
path_img = path_Article + '/img'

# Content checkout and the NRTA scraper folder inside it.
path_Content = '/content/drive/MyDrive/Github/Content'
path_NRTA = path_Content + '/tools/sources/NRTA'

os.chdir(path_wd)

In [None]:
# Load local libraries.
# Each import below is preceded by a chdir into the folder that contains the
# module (presumably because the modules are resolved from the current working
# directory rather than installed as packages -- confirm). The statement order
# therefore matters.

# Import utils for making plots and word clouds (from the Article checkout).
os.chdir(path_wd)
%reload_ext autoreload
%autoreload 2

from generate import utils

# Import the prediction tools (from the Content checkout).
os.chdir(path_Content)
import predict

# Instantiate tv_reg. The working directory is left at path_NRTA after this
# cell; the following `!ls` / `!python Update_Registration.py` cells run there.
os.chdir(path_NRTA)
from Record_Registration import Registration # imported while cwd is path_NRTA
tv_reg = Registration()

## 1.3 Load Latest Registration Publication

In [None]:
!ls

In [None]:
!python Update_Registration.py

In [None]:
fp = path_NRTA + '/records/contents_of_registrations.json'
df = pd.read_json(fp, orient='split')

In [None]:
curr_issue_dt = df['公示年月'].iloc[0]
issue_name = str(curr_issue_dt)
issue_name

In [None]:
# Clean df: remove leading/trailing whitespace (newlines, tabs, spaces) from
# the licence-number and remarks columns.
# str.lstrip/rstrip take a *set of characters*, not a regular expression, so
# arguments such as '\n+' or '\w+' do not mean "one or more newlines / word
# characters". The vectorised .str.strip() removes all surrounding whitespace
# in a single pass, is safe to re-run, and leaves missing values untouched.
for col in ('许可证号', '备注'):
  df[col] = df[col].str.strip()

In [None]:
df_curr = df[df['公示年月'] == curr_issue_dt].copy()
print(df_curr.columns)

In [None]:
# The '题材' field is split by position: the first two characters are stored
# as the time period, the remainder as the genre.
topic = df_curr['题材']

# genre = everything after the first two characters
df_curr['类型'] = topic.map(lambda text: text[2:])

# time period = the first two characters
df_curr['年代'] = topic.map(lambda text: text[:2])

# [2] Load Model for Predictions


## Predict

In [None]:
df_curr.columns

In [None]:
os.chdir(path_Content) # the prediction tools are run from the Content directory

# All three predictions are derived from the synopsis column.
synopses = df_curr['内容提要']

# alternative title, predicted for the whole batch in one call
df_curr['预测剧名'] = predict.predict_title(synopses.tolist())

# keywords per synopsis, topK=10
df_curr['kw'] = synopses.apply(predict.extract_keywords, topK=10)

# main characters per synopsis
df_curr['主要角色'] = synopses.apply(predict.identify_characters)

In [None]:
import datetime

df_curr['制作周期_月'] = df_curr['制作周期'].apply(lambda x: int(x.rstrip('个月')))

def months_to_principal_photography(x: str, today: "datetime.date | None" = None) -> int:
  """Estimate how many 30-day months remain until shooting starts.

  Args:
    x: Planned shooting date as a 'YYYY.MM' string (one cell of the
      '拍摄日期' column). Only the first two dot-separated parts are read.
    today: Reference date to measure from. Defaults to the current local
      date; a datetime is reduced to its date. Passing it explicitly makes
      the result reproducible.

  Returns:
    Whole number of 30-day periods between `today` and the 15th of the
    given month, floor-divided, so any date already in the past yields a
    negative value (one day ago -> -1).

  Raises:
    IndexError: if `x` contains no '.' separator.
    ValueError: if the year or month part is not an integer or not a valid
      calendar month.
  """
  parts = x.split('.')
  year, month = int(parts[0]), int(parts[1])
  if today is None:
    today = datetime.datetime.now().date()
  elif isinstance(today, datetime.datetime):
    # date - datetime is not supported, so normalise to a plain date first
    today = today.date()
  # Only year and month are given, so the middle of the month stands in for
  # the unknown day.
  p_date = datetime.date(year, month, 15)
  return (p_date - today).days // 30

df_curr['距离开机'] = df_curr['拍摄日期'].apply(months_to_principal_photography)
#df_curr['预估拍摄速度'] = df_curr['制作周期_月'] / df_curr['集数']*30

## adjust genre (interactive)

In [None]:
# Extended genre vocabulary: the union of several thematic groups, with each
# label kept once. The order of the resulting list is not significant.
genre_v2 = list(
    # general film/TV genres
    {'喜剧', '爱情', '动作', '犯罪', '科幻', '奇幻', '冒险', '灾难', '恐怖',
     '惊悚', '剧情', '战争', '歌舞', '悬疑', '动画', '同性'}
    # TV-drama specific labels
    | {'商战', '青少', '家庭', '宅斗', '体育', '谍战', '涉案', '军旅', '职业',
       '传奇', '宫斗'}
    # fantasy
    | {'神话', '仙侠'}
    # historical
    | {'历史', '宫廷'}
    # revolutionary
    | {'革命'}
    # setting
    | {'都市', '农村', '校园'}
    # catch-all
    | {'其它'}
)

In [None]:
df_curr['类型_ext'] = df_curr['类型'].copy()

In [None]:
i = 0
batch = 10
df_curr['类型_ext'][i:i+batch].to_numpy()

In [None]:
i+= batch
print('{} - {}'.format(i, i+batch))
df_curr['类型_ext'][i:i+batch].to_numpy() # copy out put to next cell to modify genre

In [None]:
# Write the manually corrected genres back for the batch shown above.
# The assignment goes through one positional .iloc on the frame itself:
# chained indexing (df[col].iloc[...] = ...) may write to a temporary copy
# and leave df_curr unchanged. The list must have one entry per row in the
# batch.
df_curr.iloc[i:i+batch, df_curr.columns.get_loc('类型_ext')] = \
['剧情', '谍战', '商战', '奋斗', '家庭', '涉案', '家庭', '军旅', '奋斗', '奋斗']

In [None]:
# run cell, click make  interactive tables at the lower left to edit
df_curr[['剧名', '题材', '类型', '类型_ext', '年代', '内容提要']]

In [None]:
df_curr[['剧名', '类型_ext', '类型', '内容提要']]

In [None]:
# Save records for current release issue
df_curr.to_pickle(path_wd + '/records/df_registration_nrta_{}.pkl'.format(issue_name))

# [2] Generating Article Contents

## [2.1] 2021年X月电视剧备案划重点

In [None]:
# Title
T1 = '{}电视剧备案划重点'.format(df_curr['公示年月'].iloc[0])

print(T1)

In [None]:
dfview = df_curr[[
                   '剧名', '集数', '距离开机', '类型_ext', 
                   '年代', '主要角色', '报备机构', '内容提要',
                   '预测剧名', 'kw']].sort_values('距离开机')
dfview.info()

## 按类型划分

In [None]:
# sanity check
print(df_curr['类型_ext'].to_numpy())
print(df_curr['类型_ext'].unique())

In [None]:
fp_plot_genre, df_by_genre = utils.plot_genre(
    df_curr, 
    '{}电视剧备案类型分布'.format(issue_name), 
    stacked=False,
    #xticklabels=[0, "", 1,"", 2, "", 3],
    return_df=True)

## 按年代划分

In [None]:
fp_plot_time, df_by_time = utils.plot_time(
    df_curr, 
    issue_name, y_offset=1, return_df=True)

## [2.2] 本期通过备案的电视剧共计39部,估计已开机的1部,一个月内将开机的20部,距离开机一个月以上的18部。其中，20集(含）以下的电视剧有4部,20到40集的有35部,超过40集的有0部。


In [None]:
dfview.columns
dfview['集数'] = dfview['集数'].astype('int')

In [None]:
# Write Content
# Summary paragraph: total number of titles, then a breakdown by estimated
# months until shooting starts ('距离开机') and by episode count ('集数').
T2 = '\n'
T2 += '本期通过备案的电视剧共计{}部，'.format(dfview.shape[0])
T2 += '估计已开机的{}部，'.format(dfview[dfview['距离开机'] < 0].shape[0])
T2 += '一个月内将开机的{}部，'.format(
    dfview[(dfview['距离开机'] >= 0) & (dfview['距离开机'] < 2)].shape[0])

# ">= 2" makes this bucket start exactly where the previous one ends, so a
# title whose value is exactly 2 is counted and the three buckets add up to
# the total.
T2 += '距离开机一个月以上的{}部。'.format(
    dfview[(dfview['距离开机'] >= 2)].shape[0])

T2 += '其中，20集(含）以下的电视剧有{}部，'.format(dfview[dfview['集数'] <= 20].shape[0])
T2 += '20到40集的有{}部，'.format(
    dfview[(dfview['集数'] > 20) & (dfview['集数'] <= 40)].shape[0])
T2 += '超过40集的有{}部。'.format(dfview[dfview['集数'] > 40].shape[0])

print(T2)

## [2.3] 本批次中，大数据分析识别出XX部上市影视公司关联项目，占比XX%。慷田AI聚焦关注的有

In [None]:
HTML(df_curr[['剧名', '题材', '报备机构', '内容提要']].to_html())

In [None]:
df_curr['报备机构'].unique()

In [None]:
# Projects in this batch that are linked to listed film/TV companies.
import ast

info_public_film_co = pd.read_csv('/content/drive/MyDrive/Github/Article/reference/info_public_film_co.csv', index_col=0, encoding='utf-8-sig')
# Each '公司简称' cell is parsed into a list of short names and the lists are
# concatenated by .sum(). ast.literal_eval only accepts Python literals, so a
# malformed or hostile CSV cell cannot execute code the way eval() would.
ls = info_public_film_co['公司简称'].apply(ast.literal_eval).sum()
# The names are literal text, so escape them before building the alternation;
# otherwise characters such as '(' or '.' would be read as regex syntax.
pat_public = '|'.join(re.escape(name) for name in ls)
# Hand-maintained additions (platforms, studios, state-backed producers).
pat_public += '|阿里|腾讯|爱奇艺|英皇|寰亚|银都|美亚|大盛|儒意|灿星|横店|华策|电视剧制作中心'
pat_public += '|得闲|芒果|新丽|欢乐|尚世|华策|稻草熊|东阳欢娱|耀客|湖南快乐阳光|山东影视制作|当代时光'
pat_public += '|唐德|欢瑞|优酷|嘉行|东阳欢愉|稻草熊|天马映像'
# na=False: rows with a missing producer count as "no match" instead of
# putting NaN into the boolean mask.
df_focus = dfview.loc[dfview['报备机构'].str.contains(pat_public, na=False), :]
df_focus[['剧名','报备机构', '集数','内容提要', '类型_ext']]

In [None]:
df_focus[['剧名','报备机构', '集数','内容提要', '类型_ext']].index

In [None]:
# Index labels of the titles to highlight. These are picked by hand for each
# issue (presumably from the index listed by the previous cell -- update them
# when the data changes); .loc raises KeyError for a label that is not in
# df_focus.
ids = [0, 1, 9, 15, 19]
# .copy() gives an independent frame, so the columns added to it in the
# following cells are not written to a slice of df_focus.
df_focus_narrowed = df_focus.loc[ids].copy()

In [None]:
df_focus_narrowed['单位简称'] = df_focus_narrowed['报备机构'].str.extract('('+ pat_public + ")")

In [None]:
df_focus_narrowed.info()

In [None]:
########
# Paragraph on projects linked to listed / state-backed companies: the count
# and share of matches, then the hand-picked focus titles and their genres.
T3 = '\n'
T3 += '本批次中，ContentAI识别出{}部上市影视公司及国资参投影视公司关联项目，'.format(df_focus.shape[0])
T3 += '占比{}%。'.format(round((df_focus.shape[0]/df_curr.shape[0]*100),2))
T3 += '结合题材与出品方实力，ContentAI聚焦关注的有'

# One "<company>的《<title>》" fragment per focus title. They are joined with
# '、', with '和' before the last one. Building the list first keeps this
# working for zero or one title as well, where looking up the second-to-last
# index label would fail.
focus_items = ['{}的《{}》'.format(row['单位简称'], row['剧名'])
               for _, row in df_focus_narrowed.iterrows()]
if len(focus_items) > 1:
  T3 += '、'.join(focus_items[:-1]) + '和' + focus_items[-1]
else:
  T3 += ''.join(focus_items)
T3 += '，'

T3 += '题材类型包括了{}。\n'.format(
    '、'.join(df_focus_narrowed['类型_ext'].unique())
)
print(T3)

## [2.4] 生成词云图

In [None]:
df_focus_narrowed

In [None]:
os.chdir(path_wd)
#%load_ext autoreload
%reload_ext autoreload
from generate import utils

from IPython.display import Image as Img
from PIL import Image as pil
path_img = '/content/drive/MyDrive/Github/Article/img'

In [None]:
from IPython.display import Image as Img
from PIL import Image as pil

path_posters = '/content/drive/MyDrive/Github/Article/img/posters'
path_icon = '/content/drive/MyDrive/Github/Article/img/genre_icon'

#df_label2image.to_json(path_records + '/df_label2image.json')
df_label2image = pd.read_json(path_wd + '/records/df_label2img.json')

In [None]:
#df_label2image.loc[27,:] = ['商战', 'swan-46510_1280.png']
#df_label2image.loc[28,:] = ['战争','explosion.png']
#df_label2image.loc[29,:] = ['体育', 'goalkeeper-294327_1280.png']

# Genre labels to set in the label -> icon table, keyed by the row label they
# are written to. Each value is the full row for that label.
extra_label_rows = {
  30: ['谍战', 'butterfly-47967_1280.png'],
  31: ['校园', 'hibiscus-304330_1280.png'],
  32: ['创业', 'family-2112266_1280.png'],
  33: ['奋斗', 'family-2112266_1280.png'],
}
for row_label, row_values in extra_label_rows.items():
  df_label2image.loc[row_label, :] = row_values


In [None]:
df_label2image

In [None]:
df_focus_narrowed['src_img'] = None
df_focus_narrowed['tgt_img'] = None

In [None]:
# Build one word-cloud image per focus title and record its file path in the
# 'tgt_img' column.
for idx in df_focus_narrowed.index:
  genre = df_focus_narrowed.loc[idx, '类型_ext']

  # Word list for the cloud; repeating an entry gives it more weight.
  words = []
  words.extend([df_focus_narrowed.loc[idx, '剧名']] * 10)
  words.extend([df_focus_narrowed.loc[idx, '预测剧名']] * 6)
  words.extend(df_focus_narrowed.loc[idx, '主要角色'] * 3)
  words.extend([genre] * 3)
  words.extend([df_focus_narrowed.loc[idx, '年代']] * 3)
  words.extend(df_focus_narrowed.loc[idx, 'kw'])

  # Use the per-title source image when one is set; otherwise fall back to
  # the icon registered for the title's genre.
  icon_name = df_focus_narrowed.loc[idx, 'src_img']
  if not icon_name:
    genre_matches = df_label2image['label'] == genre
    icon_name = df_label2image.loc[genre_matches, 'fn'].iloc[0]

  # The mask file sits next to the icon and carries a 'mask_' prefix.
  icon_path = path_img + '/genre_icon/{}'.format(icon_name)
  mask_path = path_img + '/genre_icon/{}'.format('mask_' + icon_name)

  generated_path = utils.generate_word_image(
      words,
      icon_path,
      mask_path,
      fp_prefix='NRTA_TVReg_Overview',
      fp_suffix=issue_name,
      img_width=400,
  )
  df_focus_narrowed.loc[idx, 'tgt_img'] = generated_path

## [2.5] 生成摘要

In [None]:
# Abstract line: issue name, then the hand-picked focus titles and genres.
S0 = issue_name
S0 += '电视剧备案慷田AI聚焦关注的有'

# One "<company>的《<title>》" fragment per focus title. They are joined with
# '、', with '和' before the last one. Building the list first keeps this
# working for zero or one title as well, where looking up the second-to-last
# index label would fail.
focus_items = ['{}的《{}》'.format(row['单位简称'], row['剧名'])
               for _, row in df_focus_narrowed.iterrows()]
if len(focus_items) > 1:
  S0 += '、'.join(focus_items[:-1]) + '和' + focus_items[-1]
else:
  S0 += ''.join(focus_items)
S0 += '，'

S0 += '题材类型包括了{}。\n'.format(
    '、'.join(df_focus_narrowed['类型_ext'].unique())
)
print(S0)

## [2.6] 指向国家广电局官网


In [None]:
#####
# Closing attribution paragraph: describes the method and credits the data
# source with its official English name and website.
R1 = '\n'
R1 += 'ContentAI结合自主调研及多方大数据比对，通过分析、建模，提炼关键信息。'
R1 += '电视剧备案公示信息来自国家广播电视总局 National Radio and Television Administration, 官方网址 '
R1 += ' http://www.nrta.gov.cn/ 。'

print(R1)

In [None]:
R2 = '点击左下角阅读原文查看本期ContentAI电视剧信息详表。'
R3 = '点击左下角阅读原文查看本期ContentAI电视剧概览分析。'

In [None]:
issue_name

## [2.7] Save df of this issue pickle

In [None]:
df_curr.to_pickle(path_Article_records+ '/df_tvreg_{}.pkl'.format(issue_name))
df_focus_narrowed.to_pickle(path_Article_records + '/df_tvreg_focus_{}.pkl'.format(issue_name))

In [None]:
df_curr = pd.read_pickle(path_Article_records + '/df_tvreg_{}.pkl'.format(issue_name))
df_focus_narrowed = pd.read_pickle(path_Article_records + '/df_tvreg_focus_{}.pkl'.format(issue_name))

# [3] Output Word Document

## 3.1 Install and Load Libraries

In [None]:
%%capture
!pip install python-docx
!pip install lxml
from docx import Document
from docx.shared import Inches
#from docx.text.parargaph import Paragraph

## 3.2 Output Docx


In [None]:
from docx import Document
from docx.shared import Inches
from docx.oxml.ns import qn

### Write Overview docx

In [None]:
path_doc = '/content/drive/MyDrive/Github/Article/docx'
path_img = '/content/drive/MyDrive/Github/Article/img'

# Make sure the output folder exists; doc.save() does not create it.
os.makedirs(path_doc, exist_ok=True)

doc = Document()

# Set the document font. The East Asian font has to be set on the style's
# XML element as well, in addition to font.name.
doc.styles['Normal'].font.name = '微软雅黑'
r = doc.styles['Normal']._element
r.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

# Set the document title (file metadata)
doc.core_properties.title = 'China TV Registration Overview ' + issue_name

# Headline, abstract, summary paragraph, the two charts, then the focus
# paragraph.
doc.add_heading(T1, 0)

p = doc.add_paragraph(S0, style='Intense Quote')

doc.add_paragraph(T2)

doc.add_picture(fp_plot_genre, width=Inches(6))
doc.add_picture(fp_plot_time, width=Inches(6))
doc.add_paragraph(T3)

# One section per focus title: status line, producer, main characters, the
# generated word-cloud image, and the synopsis.
# NOTE(review): "already shooting" is `<= 0` here, while the T2 summary cell
# counts `< 0` -- confirm which threshold is intended.
for i, row in df_focus_narrowed.iterrows():
  if  row['距离开机'] <= 0: pstatus = '估摸着已开机'
  elif  0 < row['距离开机'] <= 1: pstatus = '预计一个月内开机'
  else: pstatus = '预计距离开机1个月以上' 
  txt = '\n《{}》，{}集，{}。'.format(row['剧名'], row['集数'], pstatus)
  txt += '\n报备机构：{}'.format(row['报备机构'])
  txt += '\n主要角色：{}'.format('、'.join(row['主要角色']))
  doc.add_paragraph(txt)
  doc.add_picture(row['tgt_img'], width =Inches(4))
  doc.add_paragraph(row['内容提要'])

doc.add_paragraph(R1)
doc.add_paragraph(R2)

fp_doc = path_doc + '/TVregHighlight_' + issue_name + '.docx'
doc.save(fp_doc)


In [None]:
# Download Document
from google.colab import files
files.download(fp_doc)

### write Table docx

In [None]:
dfview.columns

In [None]:
doc = Document()

# Document font: set the name on the Normal style, then the East Asian font
# on the style's XML element.
normal_style = doc.styles['Normal']
normal_style.font.name = '微软雅黑'
normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

# Document title (file metadata)
doc.core_properties.title = 'China TV Registration Table ' + issue_name

doc.add_heading('详表：{}'.format(T1), 0)

doc.add_paragraph(S0, style='Intense Quote')

# Single-column table; every title takes seven consecutive rows, the last of
# which is an empty spacer.
table = doc.add_table(rows=1, cols=1, style='Light List Accent 3')
for number, (_, row) in enumerate(dfview.iterrows(), start=1):
  months_left = row['距离开机']
  if months_left <= 0:
    pstatus = '估摸着已开机'
  elif 0 < months_left <= 1:
    pstatus = '预计一个月内开机'
  else:
    pstatus = '预计{}个月后开机'.format(months_left)

  # Disabled extras kept for reference:
  #   title row:  if row['匹配片名'] != '': text += ' (原备案名《{}》)'.format(row['匹配片名'])
  #   status row: if row['是否修改'] == '是': text += ', 修改后通过备案'
  cell_texts = [
    '[{}] '.format(number) + '《{}》，'.format(row['剧名']) + '{}集'.format(row['集数']),
    row['年代'] + row['类型_ext'],
    pstatus,
    '报备机构: {}'.format(row['报备机构']),
    '主要角色: {}'.format('、'.join(row['主要角色'])),
    row['内容提要'],
    '',
  ]
  for cell_text in cell_texts:
    table.add_row().cells[0].text = cell_text

doc.add_paragraph(R1)
doc.add_paragraph(R3)

fp_doc = path_doc + '/TVregTable_' + issue_name + '.docx'
doc.save(fp_doc)


In [None]:
# Download Document
from google.colab import files
files.download(fp_doc)

In [None]:
# Export the per-title table as a spreadsheet and trigger a browser download.
cols_for_download = ['剧名', '集数', '报备机构', '内容提要', '类型_ext', '年代', '主要角色', 'kw']
dfdl = dfview[cols_for_download].copy()
dfdl.columns = ['剧名', '集数', '报备机构', '内容提要', '类型', '年代', '主要角色', '关键词']

# Spreadsheet cells must hold scalars, so list-valued cells are flattened into
# '、'-separated text (the same separator the docx output uses for characters).
def _to_cell_text(value):
  """Return list/tuple values joined with '、'; pass anything else through."""
  if isinstance(value, (list, tuple)):
    return '、'.join(map(str, value))
  return value

for col in ('主要角色', '关键词'):
  dfdl[col] = dfdl[col].map(_to_cell_text)

# Write .xlsx without the `encoding` argument: current pandas releases no
# longer accept that argument and no longer ship a writer for legacy .xls.
fp_dl = path_doc + '/TVreg_' + issue_name + '.xlsx'
dfdl.to_excel(fp_dl)
files.download(fp_dl)