In [None]:
# Running this cell prompts for an authorization code.
# Open the printed link, sign in to your Google account,
# then copy the authorization code and paste it in.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Switch the working directory to this chapter's folder on the mounted Drive
# so that the relative 'data/...' paths used in later cells resolve.
project = 'sample_data'
chapter = 8
os.chdir(f'/content/drive/MyDrive/{project}/chapter-{chapter}/')

# chapter 8 特殊なデータ加工・可視化10ノック


## ノック91: 大容量CSVデータを扱ってみよう

In [None]:
import pandas as pd
# Load the whole CSV into memory in a single call and display it.
df = pd.read_csv('data/person_count_out_0001_2021011509.csv')
df

In [None]:
# Stream the CSV in 512-row chunks and print the shape of each chunk.
# The loop variable is named `chunk` so that iterating does not overwrite
# the notebook-level `df` with the last partial chunk.
for chunk in pd.read_csv('data/person_count_out_0001_2021011509.csv', chunksize=512):
  print(chunk.shape)

In [None]:
# Process the CSV 64 rows at a time and write the result chunk by chunk.
# The first chunk truncates the output file and writes the header; later
# chunks append without a header. This makes the cell idempotent: with
# mode='a' for every chunk, re-running the cell appended a second copy of
# all rows (without a header) to the existing file.
out_path = 'data/processed_big_data.csv'
for i, chunk in enumerate(pd.read_csv('data/person_count_out_0001_2021011509.csv', chunksize=64)):
  # Column name (including its spelling) is kept as-is so the output schema is unchanged.
  chunk['processd_per_chunk'] = True
  is_first = i == 0
  chunk.to_csv(out_path, mode='w' if is_first else 'a', index=False, header=is_first)

In [None]:
# Read back the file written chunk by chunk and display it.
df = pd.read_csv('data/processed_big_data.csv')
df

## ノック92: JSON形式のファイルを扱ってみよう

In [None]:
# Load the column-oriented JSON sample into a DataFrame (no `orient` argument is passed).
pd.read_json('data/column_oriented.json')

In [None]:
# Print the raw contents of the column-oriented JSON file.
!cat data/column_oriented.json

In [None]:
# Print the raw contents of the index-oriented JSON file.
!cat data/index_oriented.json

In [None]:
# Read the index-oriented file without specifying `orient`,
# for comparison with the explicit orient='index' call in the next cell.
pd.read_json('data/index_oriented.json')

In [None]:
# Read the same file again, this time passing orient='index' explicitly.
pd.read_json('data/index_oriented.json', orient='index')

In [None]:
# Print the raw contents of the table-oriented JSON file.
!cat data/table_oriented.json

In [None]:
# Read the table-oriented file without specifying `orient`,
# for comparison with the explicit orient='table' call in the next cell.
pd.read_json('data/table_oriented.json')

In [None]:
# Read the same file again, this time passing orient='table' explicitly.
pd.read_json('data/table_oriented.json', orient='table')

## ノック93: Webからデータを取得してみよう

In [None]:
import requests
# Query the World Time API endpoint for the Asia/Tokyo timezone.
# The timeout keeps the cell from hanging indefinitely when the service is
# unreachable, and raise_for_status() reports HTTP errors here instead of
# letting them surface as a JSON-decoding failure in a later cell.
response = requests.get('https://worldtimeapi.org/api/timezone/Asia/Tokyo', timeout=10)
response.raise_for_status()
response.content

In [None]:
# Decode the response body as JSON and display the resulting object.
result = response.json()
result

In [None]:
# Wrap the decoded JSON in a pandas Series for display.
pd.Series(result)

In [None]:
import json

# Serialize the decoded API result and store it as data/response.json.
serialized = json.dumps(result)
with open('data/response.json', mode='w') as json_file:
  json_file.write(serialized)

In [None]:
import time

# Call the endpoint four times, one second apart, appending each JSON
# payload as one line of data/responses.txt.
for _ in range(4):
  # Bound the wait so an unreachable service cannot hang the cell.
  response = requests.get('https://worldtimeapi.org/api/timezone/Asia/Tokyo', timeout=10)
  # Fail loudly on HTTP errors rather than recording an error payload.
  response.raise_for_status()
  res = response.json()
  with open('data/responses.txt', mode='a') as f:
    f.write(f'{json.dumps(res)}\n')
  time.sleep(1)

In [None]:
# Print the accumulated responses, one JSON document per line.
!cat data/responses.txt

## ノック94: configファイルを扱ってみよう

In [None]:
# Print the raw contents of the YAML config file.
!cat config.yml

In [None]:
import yaml
# Parse config.yml with the safe loader and display the parsed result.
with open('config.yml') as yaml_file:
  config = yaml.safe_load(yaml_file)
config

In [None]:
# Print the raw contents of the TOML config file.
!cat config.toml

In [None]:
import toml
# Parse config.toml from an open file handle and display the parsed result.
with open('config.toml') as toml_file:
  config = toml.load(toml_file)
config

## ノック95 : 動画ファイルを音声ファイルへ変換してみよう

In [None]:
from moviepy.editor import VideoFileClip

# Write the audio track of the sample video to an MP3 file.
video_clip = VideoFileClip('data/sample_video.mp4')
try:
  video_clip.audio.write_audiofile('data/audio_by_py.mp3')
finally:
  # Release the resources held by the clip even if the export fails;
  # the original cell never closed it.
  video_clip.close()

In [None]:
# Same conversion via the ffmpeg CLI: -y overwrites existing output,
# -hide_banner / -loglevel error reduce console output.
!ffmpeg -i data/sample_video.mp4 -y -hide_banner -loglevel error data/audio_by_ffmpeg.mp3

In [None]:
# List the MP3 files present under data/.
!ls data/*.mp3

## ノック96 : 動画ファイルを画像ファイルへ分割してみよう

In [None]:
import cv2
from tqdm import trange
import os

# Split the sample video into one PNG file per frame.
cap = cv2.VideoCapture('data/sample_video.mp4')
if not cap.isOpened():
  # Without this check an unreadable video reports zero frames and the cell
  # silently produces nothing.
  raise IOError('could not open data/sample_video.mp4')
img_dir = 'data/images_by_py/'
os.makedirs(img_dir, exist_ok=True)
n = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

try:
  for i in trange(n):
    success, img = cap.read()
    if not success:
      # Skip frames that could not be read (same as the original behaviour).
      continue
    # os.path.join avoids the doubled '/' that f'{img_dir}/...' produced.
    cv2.imwrite(os.path.join(img_dir, f'{i:04}.png'), img)
finally:
  # Always release the capture handle, even if writing a frame fails.
  cap.release()

In [None]:
ls data/images_by_py

In [None]:
!mkdir data/images_by_ffmpeg
!ffmpeg -i data/sample_video.mp4 -y -hide_banner -loglevel error data/images_by_ffmpeg/%04d.png

In [None]:
# List the frames written by ffmpeg.
!ls data/images_by_ffmpeg/

## ノック97 : PowerPointやWordファイルを読み込んでみよう

In [None]:
!pip install python-pptx
!pip install python-docx

In [None]:
import pptx
# Open the sample presentation and show how many slides it contains.
pptx_data = pptx.Presentation('data/サンプル_PowerPoint.pptx')
len(pptx_data.slides)

In [None]:
# Take the first slide and count the shapes placed on it.
sld_0 = pptx_data.slides[0]
shp_sld_0 = sld_0.shapes
len(shp_sld_0)

In [None]:
# Show the first shape's text and whether it reports a text frame.
# NOTE(review): `.text` is read before `has_text_frame` is checked —
# presumably shape 0 is a text shape in the sample deck; confirm.
print(shp_sld_0[0].text)
print(shp_sld_0[0].has_text_frame)

In [None]:
# Reload the presentation and gather the text of every shape that has a
# text frame, walking the slides in order.
pptx_data = pptx.Presentation('data/サンプル_PowerPoint.pptx')
texts = [
    shp.text
    for sld in pptx_data.slides
    for shp in sld.shapes
    if shp.has_text_frame
]
print(texts)

In [None]:
import docx
# Open the sample Word document and show how many paragraphs it contains.
docx_data = docx.Document('data/サンプル_Word.docx')
len(docx_data.paragraphs)

In [None]:
# Display the text of the first paragraph.
docx_data.paragraphs[0].text

In [None]:
# Gather the text of every paragraph in document order.
texts = [para.text for para in docx_data.paragraphs]
print(texts)

## ノック98 : PDFデータを読み込んでみよう

In [None]:
!pip install pdfminer.six

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

In [None]:
# Extract the text of every page of the sample PDF into a text file.
txt_file = 'data/サンプル_PDF.txt'

rscmgr = PDFResourceManager()
laprms = LAParams()

# Context managers guarantee both files are closed even if a page fails to
# parse; the original left them open on any exception.
with open('data/サンプル_PDF.pdf', 'rb') as pdf_data, open(txt_file, mode='w') as out_data:
    device = TextConverter(rscmgr, out_data, laparams=laprms)
    try:
        itprtr = PDFPageInterpreter(rscmgr, device)
        for page in PDFPage.get_pages(pdf_data):
            itprtr.process_page(page)
    finally:
        # Close the converter while its output stream is still open
        # (the original closed the output file first).
        device.close()

In [None]:
# Read the extracted text file back in full and print it.
with open('data/サンプル_PDF.txt') as txt_fp:
  content = txt_fp.read()
print(content)

## ノック99 : インタラクティブなグラフを作成してみよう

In [None]:
import pandas as pd
# Reload the original CSV for plotting and preview the first rows.
df = pd.read_csv('data/person_count_out_0001_2021011509.csv')
df.head()

In [None]:
import plotly.express as px
# Draw a Plotly line chart of the in1 column against receive_time.
fig = px.line(x=df['receive_time'], y=df['in1'])
fig.show()

In [None]:
# Reshape in1/out1 from wide to long format, keeping receive_time as the
# identifier; the variable/value column names are the Japanese labels used
# by the plot that follows.
wide_cols = df[['receive_time', 'in1', 'out1']]
df_v = wide_cols.melt(id_vars=['receive_time'], var_name="変数名", value_name="値")
df_v.head()

In [None]:
# Plot the long-format frame with one line per value of the variable-name column.
fig = px.line(df_v, x='receive_time', y='値', color='変数名')
fig.show()

## ノック100: 3次元グラフを作成してみよう

In [None]:
import seaborn as sns
# Load seaborn's 'iris' dataset and preview the first rows.
df_iris = sns.load_dataset('iris')
df_iris.head()

In [None]:
# 2-D scatter of sepal length vs. sepal width, colored by species.
fig = px.scatter(df_iris, x='sepal_length', y='sepal_width', color='species')
fig.show()

In [None]:
# 3-D scatter adding petal width on the z axis, colored by species.
fig = px.scatter_3d(df_iris, x='sepal_length', y='sepal_width', z='petal_width',color='species')
fig.show()