In [None]:
# 下記セルを実行すると、authorization codeの入力を求められます。
# 出力されたリンク先をクリックし、Googleアカウントにログインし、
# authorization codeをコピーし、貼り付けをおこなってください。
!pip install pandas==1.5.3
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Work from this chapter's folder on the mounted Drive so that the relative
# 'data/...' paths used in the following cells resolve.
project = 'sample_data'
chapter = 1
chapter_dir = f'/content/drive/MyDrive/{project}/chapter-{chapter}/'
os.chdir(chapter_dir)

# １章 システムデータの加工・可視化を行う２０本ノック

## ノック１：法人情報データを読み込んでみよう

In [None]:
import pandas as pd

# Deliberate first attempt without an encoding: the following cell reads the same
# file as Shift-JIS, so pandas' default decoding is expected to fail here.
# The error is caught and printed so that "Restart & Run All" does not stop at this cell.
try:
  data = pd.read_csv('data/22_shizuoka_all_20210331.csv')
except UnicodeDecodeError as err:
  print(f'{type(err).__name__}: {err}')

In [None]:
# Read the corporate-information CSV again, this time decoding it as Shift-JIS.
data = pd.read_csv('data/22_shizuoka_all_20210331.csv', encoding='shift-jis')

In [None]:
data.head()

In [None]:
len(data)

In [None]:
# header=None: treat the first line of the file as data instead of column names;
# the columns are labelled 0, 1, 2, ... until real names are assigned.
data = pd.read_csv('data/22_shizuoka_all_20210331.csv', encoding='shift-jis', header=None)

In [None]:
data.head()

In [None]:
len(data)

## ノック２：読み込んだデータを確認しよう

In [None]:
data.columns

In [None]:
len(data.columns)

In [None]:
data

In [None]:
data.dtypes

In [None]:
# dtype=object: read every column as object so that pandas does not infer numeric dtypes.
data = pd.read_csv('data/22_shizuoka_all_20210331.csv', encoding='shift-jis', header=None, dtype=object)

In [None]:
data.head()

In [None]:
data.dtypes

## ノック３：ヘッダ用のテキストファイルを読み込もう

In [None]:
# First attempt: read the column-name master with read_csv's default (comma) separator
# and inspect how it was parsed.
mst = pd.read_csv('data/mst_column_name.txt', encoding='shift-jis')
mst.head()

In [None]:
# Read the column-name master again, this time splitting fields on tabs.
mst = pd.read_csv('data/mst_column_name.txt', encoding='shift-jis', sep='\t')
mst.head()

In [None]:
len(mst)

In [None]:
len(mst) == len(data.columns)

## ノック４：ヘッダ行を追加しよう

In [None]:
# English column names from the master, as an array in master order.
columns = mst['column_name_en'].to_numpy()

In [None]:
columns

In [None]:
# Replace the positional column labels (the file was read with header=None)
# with the English names taken from the master.
data.columns = columns
data.head()

## ノック５：統計量や欠損値を確認しよう

In [None]:
data.describe()

In [None]:
data.isna()

In [None]:
data.isna().sum()

## ノック６：データの追加（繰り返し・読込・抽出・結合）

In [None]:
import os
os.listdir('data')

In [None]:
from glob import glob
# Collect the paths of all diff CSV files under data/.
# glob does not guarantee any particular order; the list is sorted before use in a later cell.
diff_files = glob('data/diff*.csv')
diff_files

In [None]:
# Put the diff files in file-name order and inspect the first one.
diff_files.sort()
first_diff_file = diff_files[0]
diff = pd.read_csv(first_diff_file, header=None, dtype=object, encoding='shift-jis')
print(diff.shape[0])
diff.head(3)

In [None]:
# Name the columns, then keep only the rows whose prefecture is Shizuoka.
diff.columns = columns
is_shizuoka = diff['prefectureName'] == '静岡県'
diff = diff[is_shizuoka]
print(diff.shape[0])
diff.head(3)

In [None]:
# Trial run of adding the diff rows, done on a separate name so `data` stays untouched.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the replacement
# and, like append, returns a new frame instead of modifying its inputs.
data_test = data                            # same object as data (an alias, not a copy)
print(len(data_test))                       # number of existing rows
print(len(data_test) == len(data))          # sanity check: row counts match
print(len(diff))                            # number of diff rows
data_test = pd.concat([data_test, diff])    # new frame: existing rows followed by the diff rows
print(len(data_test))                       # number of rows after adding the diff
data_test.tail(3)                           # last 3 rows after adding the diff

In [None]:
# Read every diff file, keep only the Shizuoka rows and add them below the existing data.
# The filtered frames are collected first and concatenated once: DataFrame.append is
# deprecated (removed in pandas 2.0), and appending inside the loop would copy the
# whole of `data` on every iteration.
diff_frames = []
for f in diff_files:
  diff = pd.read_csv(f, encoding='shift-jis', header=None, dtype=object)
  diff.columns = columns
  diff = diff.loc[diff['prefectureName'] == '静岡県']
  diff_frames.append(diff)
# Same row order as appending one file after another: data first, then each diff in turn.
data = pd.concat([data, *diff_frames])
data

In [None]:
data.describe()

In [None]:
print(data[data["corporateNumber"].duplicated()])

In [None]:
# Keep one row per corporateNumber. keep='last' retains the last occurrence, i.e. the row
# that was added latest (the diff rows were placed after the original rows).
data.drop_duplicates(subset='corporateNumber', keep='last', inplace=True)

In [None]:
data.describe()

In [None]:
data.isna().sum()

## ノック７：マスタを読み込んで項目を横に繋げよう

In [None]:
os.listdir('data')

In [None]:
mst_process_kbn = pd.read_csv('data/mst_process_kbn.csv', dtype=object)
mst_process_kbn

In [None]:
# Left join on 'process': every row of data is kept and the master's columns are added.
data = pd.merge(data, mst_process_kbn, how='left', on='process')

In [None]:
print(len(data.columns))
data.head(3)

In [None]:
mst_correct_kbn = pd.read_csv('data/mst_correct_kbn.csv', encoding='shift-jis', dtype=object)
mst_correct_kbn

In [None]:
# Left join on 'correct', then check the new column count and the first rows.
data = pd.merge(data, mst_correct_kbn, how='left', on='correct')
print(data.shape[1])
data.head(3)

In [None]:
mst_corp_kind = pd.read_csv('data/mst_corp_kind.csv', dtype=object)
mst_corp_kind

In [None]:
# Left join on 'kind', then check the new column count and the first rows.
data = pd.merge(data, mst_corp_kind, how='left', on='kind')
print(data.shape[1])
data.head(3)

In [None]:
mst_close_cause = pd.read_csv('data/mst_closeCause.csv', dtype=object)
mst_close_cause

In [None]:
# Left join on 'closeCause', then check the new column count and the first rows.
data = pd.merge(data, mst_close_cause, how='left', on='closeCause')
print(data.shape[1])
data.head(3)

In [None]:
mst_latest = pd.read_csv('data/mst_latest.csv', dtype=object)
mst_latest

In [None]:
# Left join on 'latest', then check the new column count and the first rows.
data = pd.merge(data, mst_latest, how='left', on='latest')
print(data.shape[1])
data.head(3)

In [None]:
mst_hihyoji = pd.read_csv('data/mst_hihyoji.csv', dtype=object)
mst_hihyoji

In [None]:
# Left join on 'hihyoji', then check the new column count and the first rows.
data = pd.merge(data, mst_hihyoji, how='left', on='hihyoji')
print(data.shape[1])
data.head(3)

## ノック８：テキストの連結や分割をしよう

In [None]:
data[['prefectureName', 'cityName', 'streetNumber']].isna().sum()

In [None]:
# Concatenate prefecture, city and street number into one address string.
# A missing value in any of the three parts makes the whole address missing.
prefecture_and_city = data['prefectureName'] + data['cityName']
data['address'] = prefecture_and_city + data['streetNumber']
print(data.shape[1])
data.head(3)

In [None]:
data.loc[data['streetNumber'].isna()].head(3)

In [None]:
# Rows without a street number got a missing address from the concatenation;
# fall back to prefecture + city for them.
# The assignment goes through one data.loc[rows, column] call. The chained form
# data['address'].loc[rows] = ... writes to an intermediate Series, triggers
# SettingWithCopyWarning, and does not update `data` when copy-on-write is enabled.
no_street_number = data['streetNumber'].isna()
data.loc[no_street_number, 'address'] = data['prefectureName'] + data['cityName']

In [None]:
print(data['address'].isna().sum())
data.loc[data['streetNumber'].isna()].head(3)

In [None]:
data.head(3)

In [None]:
# First three characters of the postal code.
data['postCode_head'] = data['postCode'].str.slice(stop=3)
print(data.shape[1])
data.head(3)

In [None]:
# Last four characters of the postal code.
data['postCode_tail'] = data['postCode'].str.slice(start=-4)
print(data.shape[1])
data.head(3)

## ノック９：日付を加工しよう

In [None]:
# The date columns were read with dtype=object and are still strings here, so this
# subtraction is expected to fail; the following cells convert them with pd.to_datetime.
# The error is caught and printed so that "Restart & Run All" does not stop at this cell.
try:
  display(data['closeDate'] - data['assignmentDate'])
except TypeError as err:
  print(f'{type(err).__name__}: {err}')

In [None]:
tmp = pd.to_datetime(data['closeDate'])
tmp.dtypes

In [None]:
# Convert the four date columns from strings to datetime64.
dt_columns = ['updateDate', 'changeDate', 'closeDate', 'assignmentDate']
data[dt_columns] = data[dt_columns].apply(pd.to_datetime)

In [None]:
data.dtypes

In [None]:
# Time between assignment and closure; missing where either date is missing.
data['corporate_life'] = data['closeDate'].sub(data['assignmentDate'])
print(data.shape[1])
data.head(3)

In [None]:
tmp = data.loc[data['closeDate'].notna()]
print(len(tmp))
tmp.head(3)

In [None]:
len(data.loc[data['closeCause'].notna()]) == len(data.loc[data['closeDate'].notna()])

In [None]:
data['update_YM'] = data['updateDate'].dt.to_period('M')
print(len(data.columns))
data.head()

In [None]:
# For each date column '<prefix>Date', add a monthly period column '<prefix>_YM'.
dt_prefixes = ['assignment', 'change', 'update', 'close']
for prefix in dt_prefixes:
  source_col, target_col = f'{prefix}Date', f'{prefix}_YM'
  data[target_col] = data[source_col].dt.to_period('M')

In [None]:
print(len(data.columns))
data.head(3)

In [None]:
data.dtypes

## ノック１０：年度を設定しよう

In [None]:
# Calendar year and month of the update date.
update_dt = data['updateDate'].dt
data['update_year'] = update_dt.year
data['update_month'] = update_dt.month
# The fiscal year starts in April: it equals the calendar year,
# except for January-March, which belong to the previous fiscal year.
data['update_fiscal_year'] = data['update_year'].copy()
before_april = data['update_month'] < 4
data.loc[before_april, 'update_fiscal_year'] -= 1

In [None]:
print(len(data.columns))
data.head(3)

In [None]:
# For each calendar month 1-12, show one sample row to check the fiscal-year assignment.
for month in range(1, 13):
  month_rows = data.loc[data['update_month'] == month, ['update_YM', 'update_fiscal_year']]
  display(month_rows[:1])

## ノック１１：加工したデータをファイルに出力しよう

In [None]:
output_dir = 'data/output'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Write the processed data as CSV, without the index column.
output_file = 'processed_shizuoka.csv'
csv_path = os.path.join(output_dir, output_file)
data.to_csv(csv_path, index=False)

In [None]:
# Same file name with an .xlsx extension; write the data as an Excel workbook without the index.
output_file = output_file.replace('.csv', '.xlsx')
xlsx_path = os.path.join(output_dir, output_file)
data.to_excel(xlsx_path, index=False)

## ノック１２：不要な項目の削除と並べ替えをしよう

In [None]:
print(len(data.columns))
print(data.columns)
data.head(3)

In [None]:
# Keep only the columns needed from here on, in this display order.
keep_columns = [
  'cityName',
  'corporateNumber',
  'name',
  'corp_kind_name',
  'process',
  'process_kbn_name',
  'assignmentDate',
  'updateDate',
  'update_fiscal_year',
  'update_YM',
]
data = data[keep_columns]

In [None]:
print(len(data.columns))
print(data.columns)
data.head(3)

In [None]:
# Remove the 'process' code column.
data = data.drop('process', axis=1)
print(data.columns)
data.head(3)

## ノック１３：まとまった単位で集計しよう


In [None]:
tmp = data.groupby('corp_kind_name').size()
tmp

In [None]:
tmp.sort_values(inplace=True, ascending=False)
tmp

In [None]:
tmp = data.groupby('update_fiscal_year').size()
tmp

In [None]:
tmp = data.groupby(['update_fiscal_year', 'corp_kind_name']).size()
tmp

In [None]:
# Row counts per corporation kind (rows) and update fiscal year (columns).
pt_data = data.pivot_table(index='corp_kind_name', columns='update_fiscal_year', aggfunc='size')
pt_data

## ノック１４：市区町村別の法人数を可視化しよう

In [None]:
%%bash
pip install -q japanize-matplotlib

In [None]:
tmp = data.groupby('cityName').size()
tmp.head()

In [None]:
import matplotlib.pyplot as plt
# Imported only for its side effect; the name itself is not used.
# NOTE(review): presumably this sets up a font that can render the Japanese labels — confirm.
import japanize_matplotlib

# Bar chart of the number of rows per city: category labels on x, counts on y.
x = tmp.index
y = tmp.values
plt.bar(x, y)

In [None]:
plt.figure(figsize=(20, 10))
plt.bar(x, y)

## ノック１５：グラフの縦横と表示順を変えてみよう

In [None]:
tmp.sort_values(inplace=True, ascending=True)
tmp

In [None]:
# Horizontal bars on a tall figure, one bar per city.
x, y = tmp.index, tmp.values
plt.figure(figsize=(10, 15))
plt.barh(x, y)

## ノック１６：グラフのタイトルとラベルを設定しよう

In [None]:
# Sort by count, largest first, and plot only the ten largest cities.
tmp.sort_values(ascending=False, inplace=True)
top10 = tmp[:10]
x, y = top10.index, top10.values
plt.figure(figsize=(20, 10))
plt.bar(x, y)


In [None]:
# Same chart with a title and axis labels.
plt.figure(figsize=(20, 10))
plt.bar(x, y)
plt.title('市区町村別の法人数', fontsize=20)
plt.xlabel('市区町村名', fontsize=15)
# Use the same font size as the x label so both axis labels match.
plt.ylabel('法人数', fontsize=15)


## ノック１７：グラフの見た目をもっと変えてみよう

In [None]:
# Top-10 cities in light gray, with the city named in the title highlighted in blue.
tmp.sort_values(inplace=True, ascending=False)
tmp = tmp[:10]
x = tmp.index
y = tmp.values
fig, ax = plt.subplots(figsize=(20, 10))
bar_list = ax.bar(x, y, color='lightgray')
# Find the bar by city name instead of a fixed position, so the right bar is coloured
# even if the ranking changes; if the city is not in the top 10, nothing is highlighted.
highlight_city = '富士市'
if highlight_city in tmp.index:
  bar_list[tmp.index.get_loc(highlight_city)].set_color('blue')
ax.set_title('自治体別法人数における富士市の位置づけ', fontsize=20);
ax.set_ylabel('法人数', fontsize=15)
# Annotation placed in data coordinates (x = bar position, y = count).
ax.text(7.5, 9000, '上位10の自治体を抜粋して表示', fontsize=15)

## ノック１８：９０日以内に新規登録された法人数を可視化してみよう

In [None]:
base_time = pd.Timestamp.now(tz='Asia/Tokyo')
base_time

In [None]:
print(len(data))
data.head()

In [None]:
# Attach the Asia/Tokyo time zone so the column can be compared with the tz-aware base_time.
# tz_localize raises TypeError on a column that is already tz-aware, so localize only
# while the column is still tz-naive; this keeps the cell safe to re-run.
if data['assignmentDate'].dt.tz is None:
  data['assignmentDate'] = data['assignmentDate'].dt.tz_localize('Asia/Tokyo')
data.head()

In [None]:
# Rows whose process is '新規' and whose assignment date is at most 90 days before base_time.
delta = pd.Timedelta(90, 'days')
is_new = data['process_kbn_name'] == '新規'
is_recent = (base_time - data['assignmentDate']) <= delta
tmp = data.loc[is_new & is_recent]
print(len(tmp))
tmp.head()

In [None]:
# Count the selected rows per city and plot the ten largest.
tmp = tmp.groupby('cityName').size().sort_values(ascending=False).head(10)
x, y = tmp.index, tmp.values
plt.figure(figsize=(20, 10))
plt.bar(x, y)

In [None]:
# Repeat the selection with a fixed base date instead of the current time.
base_time = pd.Timestamp('2020-04-16', tz='Asia/Tokyo')
is_new = data['process_kbn_name'] == '新規'
is_recent = (base_time - data['assignmentDate']) <= delta
tmp = data.loc[is_new & is_recent]
print(len(tmp))
tmp.head()

In [None]:
# Count the selected rows per city and plot the ten largest.
tmp = tmp.groupby(by='cityName').size().sort_values(ascending=False).head(10)
x, y = tmp.index, tmp.values
plt.figure(figsize=(20, 10))
plt.bar(x, y)

## ノック１９：年度別の推移を可視化しよう

In [None]:
# Drop rows without a city name, then keep the rows whose city name ends with '区'.
has_city = data.dropna(subset=['cityName'])
is_ward = has_city['cityName'].str.match('^.*区$')
tmp = has_city.loc[is_ward]
print(len(tmp))
tmp.head()

In [None]:
# Keep update fiscal years from 2016 (inclusive) up to, but not including, 2021.
fiscal_year = tmp['update_fiscal_year']
tmp = tmp.loc[fiscal_year.ge(2016) & fiscal_year.lt(2021)]
print(len(tmp))
tmp.head()

In [None]:
# Row counts per (city, fiscal year), flattened to a frame with a 'count' column.
tmp = tmp.groupby(['cityName', 'update_fiscal_year']).size().reset_index(name='count')
print(len(tmp))
tmp.head(6)

In [None]:
import seaborn as sns
from matplotlib.ticker import MaxNLocator

# One line per city: count against update fiscal year.
plt.figure(figsize=(20, 10))
# Restrict the x ticks to whole numbers so the years are not drawn as fractions.
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
img = sns.lineplot(data=tmp, x='update_fiscal_year', y='count', hue='cityName')

## ノック２０：データとグラフを出力しよう

In [None]:
# Write `data` as CSV into the output directory, without the index column.
data_file = 'knock20_graphdata.csv'
data_path = os.path.join(output_dir, data_file)
data.to_csv(data_path, index=False)

In [None]:
# Save the figure that holds the line plot as a PNG in the output directory.
graph_file = 'knock20_graph.png'
graph_path = os.path.join(output_dir, graph_file)
fig = img.get_figure()
fig.savefig(graph_path)