# projection_graphsテーブル統計データ

projection_graphsテーブル(structures間の投射関係を表す重み付き有向グラフ)の統計データです。

※調整中（データ量が多すぎるため表示できない）

In [None]:
# Install the Python libraries this notebook needs.
# Lines starting with "!" are IPython shell escapes, so this cell only runs inside a notebook.
!python --version
!pip install python-dotenv

!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install plotly
!pip install -U kaleido
!pip install -U nbformat
!pip install scikit-learn
!pip install sqlalchemy
!pip install "dask[complete]"


## 環境変数
supabase接続用URL,APIキーと、openai api接続用のAPIキーを設定します。
自身のopenaiアカウントからapi keyを取得してください。

https://platform.openai.com/account/api-keys

supabaseの情報は管理者にお尋ねください。

下記の例では、.envファイルに変数を書き込み、Jupyter Notebookで読み込む仕様で実装しています。

※.envファイルの作成が困難、.envファイルから値を読み込めない場合、
　os.getenv("◯◯")部分に変数値を直接書き込んでいただいても動作自体には問題ありません。

In [2]:
# 環境変数
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sqlalchemy import create_engine
from sqlalchemy import text
import urllib.parse

import dask.dataframe as dd
from dask.distributed import Client

# Load variables from a local .env file into the process environment.
load_dotenv()

# Supabase (PostgreSQL) connection settings, read from the environment.
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_PASS")

# Fail fast with a readable message: an unset DB_PASS would otherwise surface
# as an opaque TypeError inside quote_plus(), and any other unset value would
# be formatted into the URL as the literal text "None".
_missing_settings = [
    name
    for name, value in (
        ("DB_HOST", db_host),
        ("DB_PORT", db_port),
        ("DB_NAME", db_name),
        ("DB_USER", db_user),
        ("DB_PASS", db_pass),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings: " + ", ".join(_missing_settings)
        + ". Define them in the .env file or in the environment."
    )

# Connect to the database
connection_config = {
    'user': db_user,
    # Percent-encode the password so characters such as "@" or "/" do not
    # break the connection URL.
    'password': urllib.parse.quote_plus(db_pass),
    'host': db_host,
    'port': db_port,
    'database': db_name
}
engine = create_engine('postgresql://{user}:{password}@{host}:{port}/{database}'.format(**connection_config))
sql = 'SELECT * FROM projection_graphs ORDER BY "injected-structure-id" ASC;'

# Read the whole projection_graphs table into a pandas DataFrame.
with engine.begin() as conn:
    query = text(sql)
    df_original = pd.read_sql_query(query, conn)

print('環境変数読み込み完了')

環境変数読み込み完了


## 処理関数

- ヒストグラム
- 円グラフ

In [4]:

%matplotlib inline

def piechart(label:str, df, title:str):
    """Show a pie chart of how often each distinct value of ``df[label]`` occurs.

    Args:
        label: Name of the column whose values are counted.
        df: Table-like object supporting ``df[label].value_counts()`` and ``len(df)``.
        title: Chart title; the row count is appended as ``(n=<rows>)``.
    """
    slice_sizes = df[label].value_counts()
    pie_options = {
        'labels': slice_sizes.index,
        'autopct': '%1.1f%%',
        'startangle': 90,
    }
    plt.pie(slice_sizes, **pie_options)
    plt.title(f'{title}(n={len(df)})')
    # Equal axis scaling keeps the pie circular.
    plt.axis('equal')
    plt.show()


def histgram(label:str, df, xlabel:str, ylabel:str, title:str, x_max, bins=None):
    """Plot a histogram of ``df[label]`` annotated with mean, median and std dev.

    Args:
        label: Name of the column to plot.
        df: Dask or pandas DataFrame containing ``label``.
        xlabel: Text for the x axis.
        ylabel: Text for the y axis.
        title: Plot title.
        x_max: Upper x-axis limit; ``None`` uses the column maximum.
        bins: Number of histogram bins. ``None`` (default) uses one bin per
            distinct value, which can be very slow to draw for continuous
            data; pass a modest integer in that case.

    Raises:
        ValueError: If the column has no non-missing values.
    """
    column = df[label]
    if hasattr(column, 'compute'):
        # Dask collection: materialise it once on a temporary local cluster.
        # The context manager shuts the cluster down again, so repeated calls
        # do not accumulate worker processes.
        with Client(n_workers=4, threads_per_worker=1):
            column = column.compute()

    # Work on a plain float array so every statistic below is an eager NumPy
    # scalar; missing values become NaN and are dropped.
    values = np.asarray(column, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        raise ValueError(f'column {label!r} contains no values to plot')

    min_value, max_value = values.min(), values.max()
    if bins is None:
        bins = np.unique(values).size

    # Bin the data with NumPy, then draw the pre-binned counts.
    counts, bin_edges = np.histogram(values, bins=bins, range=(min_value, max_value))
    plt.hist(bin_edges[:-1], bin_edges, weights=counts)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)

    mean = np.mean(values)
    median = np.median(values)
    std_dev = np.std(values)

    # Height of the tallest bar, used to offset the text labels from the top.
    y_max = counts.max()

    # Add statistics to the plot
    plt.axvline(mean, color='red', linestyle='dashed', linewidth=2)
    plt.text(mean+1, (plt.ylim()[1]-y_max/4), f'Mean: {mean}', color='red')

    plt.axvline(median, color='blue', linestyle='solid', linewidth=2)
    plt.text(median+1, (plt.ylim()[1]-y_max/3), f'Median: {median}', color='blue')

    # NOTE(review): the std dev is in x-axis units but is drawn as a horizontal
    # line on the count axis; kept as in the original design - confirm intent.
    plt.axhline(std_dev, color='green', linestyle='dotted', linewidth=2)
    plt.text(plt.xlim()[0], std_dev, f'Std Dev: {std_dev:.2f}', color='green')

    # Show the plot
    if x_max is None:
        x_max = max_value
    plt.xlim(min_value, x_max)

    plt.show()



# 処理結果

In [None]:
# Wrap the DataFrame loaded from the database in a Dask DataFrame
# (chunksize=1000 requests roughly 1000 rows per partition).
df = dd.from_pandas(df_original, chunksize=1000)

# One histogram per projection metric; the column name doubles as the x-axis
# label and the title, the y-axis label is left empty.
for metric_column in (
    'normalized-projection-volume',
    'projection-density',
    'projection-energy',
    'projection-intensity',
    'projection-volume',
):
    histgram(metric_column, df, metric_column, '', metric_column, None)
