In [1]:
# -*- coding: utf-8 -*-

# ライブラリインポート
import pandas as pd

In [2]:
"""ソフトバンクデータの読み込み・前処理を行う関数"""
def process_softbank_data(file_path):
    # データの読み込み
    softbank = pd.read_csv(file_path, encoding="utf-8")

    # 年月日を適切な形式に変換
    softbank["FormattedDate"] = pd.to_datetime(
        softbank["Year"].astype(str) + "/" +
        softbank["Date"].str.extract(r"(\d+)月(\d+)日")[0].fillna('0') +
        "/" + softbank["Date"].str.extract(r"(\d+)月(\d+)日")[1].fillna('0'),
        format="%Y/%m/%d"
    )

    # 曜日を追加
    softbank["Weekday"] = softbank["FormattedDate"].dt.day_name()

    return softbank

In [3]:
def process_weather_data(file_path):
    """天気データの読み込み・前処理を行う関数"""
    # データの読み込み
    weather = pd.read_csv(file_path, encoding="shift_jis")

    # 必要な列のみを抽出
    weather = weather.iloc[:, [0, 1, 5, 8]]

    # 列名をリネーム
    weather.columns = [
        "yyyy/mm/dd",
        "Total_Precipitation (mm)",
        "Average_Temperature (℃)",
        "Average_CloudCover (%)"
    ]

    # Date列をdatetime型に変換
    weather["yyyy/mm/dd"] = pd.to_datetime(weather["yyyy/mm/dd"])

    return weather

In [4]:
def merge_datasets(softbank_df, weather_df):
    """ソフトバンクデータと天気データをマージする関数"""
    # マージ処理: 結合キーをdatetime型に合わせる
    merged_df = pd.merge(softbank_df, weather_df, how="left", left_on="FormattedDate", right_on="yyyy/mm/dd")
    return merged_df

In [5]:
def save_to_csv(df, output_path):
    """データフレームをCSVとして出力する関数"""
    df.to_csv(output_path, index=False, encoding='utf-8-sig')

In [10]:
# ファイルパスの設定
softbank_file = r"..\data\softbank_audience_full_data.csv"
weather_file = r"..\data\weather.csv"
output_file = r"..\data\final_data.csv"

# 前処理の実行
softbank_df = process_softbank_data(softbank_file)
weather_df = process_weather_data(weather_file)

# データのマージ
df = merge_datasets(softbank_df, weather_df)

In [26]:
df.tail(75)

Unnamed: 0,Year,Date,Audience,Result,Score,Opponent,Pitcher,GameTime,Venue,FormattedDate,Weekday,yyyy/mm/dd,Total_Precipitation (mm),Average_Temperature (℃),Average_CloudCover (%),Home_Score,Away_Score
639,2023,2023-10-02,37413,1,6 - 0,楽天,大関,3:03,PayPayドーム,2023-10-02,Monday,2023-10-02,0.0,23.0,5.8,6,0
640,2023,2023-10-03,38597,1,7 - 3,楽天,和田,3:03,PayPayドーム,2023-10-03,Tuesday,2023-10-03,0.0,22.3,10.0,7,3
641,2024,2024-04-02,39191,1,2 - 0,ロッテ,大関,2:31,PayPayドーム,2024-04-02,Tuesday,2024-04-02,9.5,18.1,,2,0
642,2024,2024-04-03,38797,0,2 - 4,ロッテ,東浜,4:15,PayPayドーム,2024-04-03,Wednesday,2024-04-03,62.0,17.3,,2,4
643,2024,2024-04-04,38969,1,8 - 1,ロッテ,大津,2:50,PayPayドーム,2024-04-04,Thursday,2024-04-04,1.5,14.8,,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,2024,2024-09-26,33333,0,2 - 4,西武,有原,3:21,PayPayドーム,2024-09-26,Thursday,2024-09-26,0.0,27.1,,2,4
710,2024,2024-09-30,34132,1,1 - 0,オリックス,モイネロ,2:35,PayPayドーム,2024-09-30,Monday,2024-09-30,0.0,26.3,,1,0
711,2024,2024-10-01,35002,1,8 - 6,オリックス,前田悠,3:09,PayPayドーム,2024-10-01,Tuesday,2024-10-01,0.0,26.4,,8,6
712,2024,2024-10-03,36777,1,4 - 2,楽天,有原,2:52,PayPayドーム,2024-10-03,Thursday,2024-10-03,4.0,20.8,,4,2


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Year                      714 non-null    int64         
 1   Date                      714 non-null    object        
 2   Audience                  714 non-null    int64         
 3   Result                    714 non-null    object        
 4   Score                     714 non-null    object        
 5   Opponent                  714 non-null    object        
 6   Pitcher                   704 non-null    object        
 7   GameTime                  704 non-null    object        
 8   Venue                     714 non-null    object        
 9   FormattedDate             714 non-null    datetime64[ns]
 10  Weekday                   714 non-null    object        
 11  yyyy/mm/dd                714 non-null    datetime64[ns]
 12  Total_Precipitation (m

In [16]:
def preprocess_data(df):
    # "中止"データを排除
    df = df[df['Score'] != '中止'].copy()  # copy()を追加して警告を回避

    # 日付の処理
    df['Date'] = pd.to_datetime(df['FormattedDate'])
    df['Weekday'] = pd.to_datetime(df['FormattedDate']).dt.day_name()

    # 結果の数値化
    df['Result'] = df['Result'].apply(lambda x: 0 if x == '●' else 1)

    # スコアの分割
    df[['Home_Score', 'Away_Score']] = df['Score'].str.split(' - ', expand=True)
    df['Home_Score'] = pd.to_numeric(df['Home_Score'])
    df['Away_Score'] = pd.to_numeric(df['Away_Score'])

    # 観客数の数値化
    df['Audience'] = pd.to_numeric(df['Audience'])

    # 降水量、気温、雲量の数値化
    df['Total_Precipitation (mm)'] = pd.to_numeric(df['Total_Precipitation (mm)'])
    df['Average_Temperature (℃)'] = pd.to_numeric(df['Average_Temperature (℃)'])
    df['Average_CloudCover (%)'] = pd.to_numeric(df['Average_CloudCover (%)'])

    # ゲーム時間の分に変換
    df['GameTime'] = df['GameTime'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))

    # 不要な列の削除（必要に応じて）
    df.drop(columns=['FormattedDate', 'Score'], inplace=True)

    return df

# 関数の使用例
# df = pd.read_csv('your_file.csv')
preprocess_data(df)


Unnamed: 0,Year,Date,Audience,Result,Opponent,Pitcher,GameTime,Venue,Weekday,yyyy/mm/dd,Total_Precipitation (mm),Average_Temperature (℃),Average_CloudCover (%),Home_Score,Away_Score
0,2015,2015-03-27,38500,1,ロッテ,攝津,197,ヤフオクドーム,Friday,2015-03-27,0.0,12.2,5.8,1,3
1,2015,2015-03-28,37397,1,ロッテ,スタンリッジ,157,ヤフオクドーム,Saturday,2015-03-28,0.0,16.1,7.3,4,2
2,2015,2015-03-29,38118,1,ロッテ,中田,201,ヤフオクドーム,Sunday,2015-03-29,0.0,16.1,8.3,4,5
3,2015,2015-03-31,30268,1,オリックス,大隣,151,ヤフオクドーム,Tuesday,2015-03-31,0.0,17.1,8.5,7,0
4,2015,2015-04-01,31198,1,オリックス,武田,216,ヤフオクドーム,Wednesday,2015-04-01,3.5,16.1,10.0,7,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,2024,2024-09-26,33333,1,西武,有原,201,PayPayドーム,Thursday,2024-09-26,0.0,27.1,,2,4
710,2024,2024-09-30,34132,1,オリックス,モイネロ,155,PayPayドーム,Monday,2024-09-30,0.0,26.3,,1,0
711,2024,2024-10-01,35002,1,オリックス,前田悠,189,PayPayドーム,Tuesday,2024-10-01,0.0,26.4,,8,6
712,2024,2024-10-03,36777,1,楽天,有原,172,PayPayドーム,Thursday,2024-10-03,4.0,20.8,,4,2


In [20]:
df[df["Average_CloudCover (%)"] == None]

Unnamed: 0,Year,Date,Audience,Result,Score,Opponent,Pitcher,GameTime,Venue,FormattedDate,Weekday,yyyy/mm/dd,Total_Precipitation (mm),Average_Temperature (℃),Average_CloudCover (%),Home_Score,Away_Score
