In [26]:
# -*- coding: utf-8 -*-

# ライブラリインポート
!pip install jpholiday -q

import jpholiday
import pandas as pd
pd.set_option("display.max_columns", None)

In [27]:
"""ソフトバンクデータの読み込み・前処理を行う関数"""
def process_softbank_data(file_path):
    """Load the SoftBank attendance CSV and derive proper date columns.

    The CSV is read as UTF-8. Its ``Date`` column holds strings such as
    ``"3月27日(金)"``; the month and day are pulled out with a regex and
    combined with the ``Year`` column to build a real datetime.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV. It must provide ``Year`` and ``Date`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with two extra columns: ``FormattedDate``
        (datetime64) and ``Weekday`` (English day name).

    Raises
    ------
    ValueError
        If any ``Date`` value does not contain a ``<month>月<day>日`` pattern,
        or if the resulting year/month/day is not a valid calendar date.
    """
    softbank = pd.read_csv(file_path, encoding="utf-8")

    # One regex pass yields column 0 = month, column 1 = day.
    month_day = softbank["Date"].str.extract(r"(\d+)月(\d+)日")

    # Fail early with the offending values instead of letting a placeholder
    # such as "2015/0/0" reach to_datetime and fail with an opaque message.
    unparsed = month_day.isna().any(axis=1)
    if unparsed.any():
        bad_values = softbank.loc[unparsed, "Date"].unique().tolist()
        raise ValueError(
            f"Could not extract month/day from {int(unparsed.sum())} "
            f"'Date' value(s), e.g. {bad_values[:5]}"
        )

    softbank["FormattedDate"] = pd.to_datetime(
        softbank["Year"].astype(str) + "/" + month_day[0] + "/" + month_day[1],
        format="%Y/%m/%d",
    )

    # English weekday name (Monday ... Sunday).
    softbank["Weekday"] = softbank["FormattedDate"].dt.day_name()

    return softbank

In [28]:
def process_weather_data(file_path):
    """Read the weather CSV (Shift_JIS) and normalise its columns.

    The file's own header is replaced by four fixed English column names,
    and the first column is converted to datetime.

    Parameters
    ----------
    file_path : str or path-like
        Location of the weather CSV; it must have exactly four columns.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``yyyy/mm/dd`` (datetime64), temperature,
        precipitation and wind speed.
    """
    date_column = "yyyy/mm/dd"
    column_names = [
        date_column,
        "Average_Temperature (℃)",
        "Total_Precipitation (mm)",
        "Average_wind_speed(m/s)",
    ]

    weather = pd.read_csv(file_path, encoding="shift_jis")

    # Overwrite the original header with the fixed names above.
    weather.columns = column_names

    # Parse the date strings so the column can serve as a merge key.
    parsed_dates = pd.to_datetime(weather[date_column])
    weather[date_column] = parsed_dates

    return weather

In [29]:
def merge_datasets(softbank_df, weather_df):
    """Left-join the weather observations onto the game records.

    Every row of ``softbank_df`` is kept; weather columns are matched where
    ``FormattedDate`` equals the weather frame's ``yyyy/mm/dd`` and are
    left missing otherwise.
    """
    return softbank_df.merge(
        weather_df,
        how="left",
        left_on="FormattedDate",
        right_on="yyyy/mm/dd",
    )

In [30]:
def save_to_csv(df, output_path):
    """Write ``df`` to ``output_path`` as CSV without the index column.

    The file is encoded as UTF-8 with a byte-order mark (``utf-8-sig``).
    """
    csv_options = {"index": False, "encoding": "utf-8-sig"}
    df.to_csv(output_path, **csv_options)

In [31]:
# File locations, relative to the notebook's working directory.
# Forward slashes are used on purpose: they resolve on Windows, macOS and
# Linux, whereas a backslash-separated path is only valid on Windows.
softbank_file = "../data/softbank_audience_full_data.csv"
weather_file = "../data/weather.csv"
output_file = "../data/final_data.csv"

# Load and pre-process each source.
softbank_df = process_softbank_data(softbank_file)
weather_df = process_weather_data(weather_file)

# Attach the weather observations to every game row.
df = merge_datasets(softbank_df, weather_df)

In [32]:
# Preview the first three rows of the merged game/weather frame.
df.head(3)

Unnamed: 0,Year,Date,Audience,Result,Score,Opponent,Pitcher,GameTime,Venue,FormattedDate,Weekday,yyyy/mm/dd,Average_Temperature (℃),Total_Precipitation (mm),Average_wind_speed(m/s)
0,2015,3月27日(金),38500,●,1 - 3,ロッテ,攝津,3:17,ヤフオクドーム,2015-03-27,Friday,2015-03-27,12.2,0.0,1.7
1,2015,3月28日(土),37397,○,4 - 2,ロッテ,スタンリッジ,2:37,ヤフオクドーム,2015-03-28,Saturday,2015-03-28,16.1,0.0,2.2
2,2015,3月29日(日),38118,●,4 - 5,ロッテ,中田,3:21,ヤフオクドーム,2015-03-29,Sunday,2015-03-29,16.1,0.0,2.5


In [21]:
# Show the column dtypes and non-null counts of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Year                      714 non-null    int64         
 1   Date                      714 non-null    object        
 2   Audience                  714 non-null    int64         
 3   Result                    714 non-null    object        
 4   Score                     714 non-null    object        
 5   Opponent                  714 non-null    object        
 6   Pitcher                   704 non-null    object        
 7   GameTime                  704 non-null    object        
 8   Venue                     714 non-null    object        
 9   FormattedDate             714 non-null    datetime64[ns]
 10  Weekday                   714 non-null    object        
 11  yyyy/mm/dd                714 non-null    datetime64[ns]
 12  Average_Temperature (℃

In [33]:
def preprocess_data(df, venues=None, excluded_years=None):
    """Filter the merged game/weather frame and tidy its columns.

    Steps, in order:

    1. drop rows whose ``Score`` is ``'中止'`` (cancelled games) and rows
       whose ``Venue`` is not one of ``venues``;
    2. overwrite ``Date`` with the parsed ``FormattedDate`` and recompute
       ``Weekday`` from it;
    3. convert attendance and the three weather columns to numeric;
    4. remove the seasons listed in ``excluded_years``;
    5. reset the index and drop the helper / unused columns.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame containing at least ``Year``, ``Score``, ``Venue``,
        ``FormattedDate``, ``Audience``, ``yyyy/mm/dd``, ``Result``,
        ``Pitcher``, ``GameTime`` and the three weather columns.
    venues : iterable of str, optional
        Venue names to keep. Defaults to the three names used for the home
        dome: ``'ヤフオクドーム'``, ``'PayPayドーム'``, ``'みずほPayPay'``.
    excluded_years : iterable of int, optional
        Seasons to remove. Defaults to 2020, 2021 and 2022 (the COVID
        period). Pass an empty list to keep every season.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index and without the columns
        ``FormattedDate``, ``Score``, ``yyyy/mm/dd``, ``Venue``, ``Result``,
        ``Pitcher`` and ``GameTime``.

    Raises
    ------
    ValueError
        If a value in a numeric column cannot be parsed as a number.
    KeyError
        If a required column is missing.
    """
    if venues is None:
        venues = ("ヤフオクドーム", "PayPayドーム", "みずほPayPay")
    if excluded_years is None:
        excluded_years = (2020, 2021, 2022)

    # Keep only games that were actually played at the selected venue(s).
    played_at_venue = (df["Score"] != "中止") & df["Venue"].isin(list(venues))
    games = df.loc[played_at_venue].copy()  # copy so the caller's frame stays intact

    # Replace the raw Japanese date string with a real datetime.
    games["Date"] = pd.to_datetime(games["FormattedDate"])
    games["Weekday"] = games["Date"].dt.day_name()

    # Attendance, precipitation, temperature and wind speed as numbers.
    numeric_columns = (
        "Audience",
        "Total_Precipitation (mm)",
        "Average_Temperature (℃)",
        "Average_wind_speed(m/s)",
    )
    for column in numeric_columns:
        games[column] = pd.to_numeric(games[column])

    # Seasons are filtered only after the conversions above, so malformed
    # values in an excluded season still surface as an error.
    in_scope = ~games["Year"].isin(list(excluded_years))

    return (
        games.loc[in_scope]
        .reset_index(drop=True)
        .drop(
            columns=[
                "FormattedDate",
                "Score",
                "yyyy/mm/dd",
                "Venue",
                "Result",
                "Pitcher",
                "GameTime",
            ]
        )
    )

# Run the filtering / type-conversion step on the merged frame.
df_preprocessed = preprocess_data(df)

In [34]:
# Check the dtypes and non-null counts after preprocessing.
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Year                      458 non-null    int64         
 1   Date                      458 non-null    datetime64[ns]
 2   Audience                  458 non-null    int64         
 3   Opponent                  458 non-null    object        
 4   Weekday                   458 non-null    object        
 5   Average_Temperature (℃)   458 non-null    float64       
 6   Total_Precipitation (mm)  458 non-null    float64       
 7   Average_wind_speed(m/s)   458 non-null    float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(2)
memory usage: 28.8+ KB


In [35]:
# Summary statistics of the preprocessed columns.
df_preprocessed.describe()

Unnamed: 0,Year,Date,Audience,Average_Temperature (℃),Total_Precipitation (mm),Average_wind_speed(m/s)
count,458.0,458,458.0,458.0,458.0,458.0
mean,2018.820961,2019-04-27 12:40:52.401746688,36893.735808,23.849563,5.295852,2.81441
min,2015.0,2015-03-27 00:00:00,27871.0,10.2,0.0,1.1
25%,2016.0,2016-08-09 06:00:00,35103.75,20.3,0.0,2.2
50%,2018.0,2018-06-15 12:00:00,38047.0,23.9,0.0,2.5
75%,2023.0,2023-05-20 18:00:00,38585.0,28.35,3.5,3.2
max,2024.0,2024-10-04 00:00:00,40178.0,32.3,135.0,7.1
std,3.181359,,2828.405965,4.920267,13.778663,0.886792


In [36]:
# Persist the preprocessed frame. The forward-slash path resolves on Windows,
# macOS and Linux; a backslash-separated path is only valid on Windows.
save_to_csv(df_preprocessed, "../data/df_preprocessed.csv")

In [37]:
# Preview the first rows of the preprocessed frame.
df_preprocessed.head()

Unnamed: 0,Year,Date,Audience,Opponent,Weekday,Average_Temperature (℃),Total_Precipitation (mm),Average_wind_speed(m/s)
0,2015,2015-03-27,38500,ロッテ,Friday,12.2,0.0,1.7
1,2015,2015-03-28,37397,ロッテ,Saturday,16.1,0.0,2.2
2,2015,2015-03-29,38118,ロッテ,Sunday,16.1,0.0,2.5
3,2015,2015-03-31,30268,オリックス,Tuesday,17.1,0.0,2.8
4,2015,2015-04-01,31198,オリックス,Wednesday,16.1,3.5,3.2
