# Imports

In [173]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
import plotly.express as px

In [None]:
# Notebook display settings: show every column and up to 300 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 300

# Load

In [None]:
# Location of the parquet block that is loaded into `states` below.
file_path = 'E:/ML/datasets/mahjong/data/2019/block_5000.parquet'

In [None]:
# Load the whole parquet block into memory as a DataFrame.
states = pd.read_parquet(file_path)

In [None]:
# Convert every column label to str so that columns can be addressed by
# string keys such as '510' in the rest of the notebook.
states.columns = states.columns.map(
    str
)

## Trim unneeded columns to save on space

In [None]:
# Column labels '34' .. '67' (34 columns); summed per row below to detect
# states with more than one dora flag set.
dora_columns = [str(i) for i in range(34, 68)]

In [None]:
# Column labels '68' .. '101' (34 columns); read as the per-tile hand counts
# by the chiitoi checks below.
hand_columns = [str(i) for i in range(68, 102)]

In [None]:
# Build one key per (column '511', column '32', column '2') combination.
# The parts are joined with '_' because plain concatenation is ambiguous:
# e.g. (12, 3, 1) and (1, 23, 1) would both become '1231' and be merged into
# the same group by the later groupby.
states['group_id'] = (
    states['511'].astype(str)
    + '_' + states['32'].astype(str)
    + '_' + states['2'].astype(str)
)

In [None]:
# Columns kept for the rest of the notebook, in output order: the group key,
# the per-state metadata columns, the dora flags, the hand counts and
# column '510' (read later as the discard).
columns_of_interest = [
    'group_id',
    '0', '1', '2', '10', '11', '12', '13',
    *dora_columns,
    *hand_columns,
    '510',
]

In [None]:
# Drop every column that is not needed. The explicit .copy() makes `states`
# an independent frame, so the column assignments that follow do not trigger
# pandas' SettingWithCopyWarning on a frame derived from another one.
states = states[columns_of_interest].copy()

In [None]:
# Every column except the first one (the string group_id) is cast to int.
numeric_columns = states.columns[1:]
states[numeric_columns] = states[numeric_columns].astype(int)

In [None]:
# Display (rows, columns) of the trimmed frame as a quick sanity check.
states.shape

## Remove multi-dora states

In [None]:
# Per-row total of the dora flag columns.
states_dora_sum = states[dora_columns].sum(axis='columns')

In [None]:
# Keep only the rows whose dora columns sum to exactly 1. The .copy() detaches
# the result from the unfiltered frame so that adding the 'is_chiitoi' column
# later does not raise pandas' SettingWithCopyWarning.
states = states.loc[states_dora_sum == 1].copy()

In [None]:
def is_chiitoi_tenpai(state):
    """Return True when exactly six hand columns of *state* hold a value
    other than 0 or 1.

    *state* is a single row (Series) that contains every label listed in
    ``hand_columns``.
    """
    hand = state[hand_columns]
    # Entries equal to 0 or 1 are ignored; everything else counts once.
    below_pair = hand.isin([0, 1])
    return bool((~below_pair).sum() == 6)

In [None]:
# One bool per row of `states`: True when exactly six of the hand columns
# hold a value other than 0 or 1. This is the same test that
# is_chiitoi_tenpai(state) applies to a single row, evaluated for the whole
# frame at once instead of shipping every row to a worker process.
pair_kind_counts = (~states[hand_columns].isin([0, 1])).sum(axis=1)
# .tolist() keeps `chiitois` a plain list of Python bools, as before.
chiitois = pair_kind_counts.eq(6).tolist()

In [None]:
# Display how many states were flagged True versus False.
pd.Series(chiitois).value_counts()

In [None]:
# Integer flag column: 1 for the rows marked True in `chiitois`, 0 elsewhere.
states['is_chiitoi'] = np.where(chiitois, 1, 0).astype('int64')

# Generate Hand Groups

In [None]:
# Group the remaining states by their group_id key; each group is processed
# as one unit by the functions below.
hand_groups = states.groupby('group_id')

# Determine chiitoi tenpais in hand groups

In [None]:
def is_chiitoi_tenpai(hand):
    """Return True when exactly six entries of *hand* are neither 0 nor 1.

    *hand* is a Series of per-column counts; entries equal to 0 or 1 are
    ignored and every other entry counts once.
    """
    below_pair = hand.isin([0, 1])
    return bool((~below_pair).sum() == 6)

In [None]:
def generate_is_chiitoi_dict(hand_group):
    """Report whether a hand group reaches chiitoi tenpai and after how many rows.

    Parameters
    ----------
    hand_group : pandas.DataFrame
        The rows of one group; must contain every label in ``hand_columns``.

    Returns
    -------
    dict
        ``{'length': n, 'is_chiitoi': flag}``. When a row passes
        ``is_chiitoi_tenpai``, ``flag`` is 1 and ``n`` is the number of rows
        up to and including the first such row. Otherwise ``flag`` is 0 and
        ``n`` is the total number of rows in the group.
    """
    # Count rows from 1 so 'length' is a row count in both branches. A 0-based
    # position here made the tenpai lengths one smaller than the number of
    # rows player_round_to_discards keeps for a tenpai round (index + 1), and
    # a length of 0 is falsy there, which disables truncation.
    for rows_seen, (_, row) in enumerate(hand_group.iterrows(), start=1):
        if is_chiitoi_tenpai(row[hand_columns]):
            return {'length': rows_seen, 'is_chiitoi': 1}

    return {'length': hand_group.shape[0], 'is_chiitoi': 0}

In [None]:
# One result dict per hand group, computed on all available cores.
len_chiitois = Parallel(n_jobs=-1)(
    delayed(generate_is_chiitoi_dict)(group)
    for _key, group in tqdm(iter(hand_groups))
)

In [None]:
# Collect the per-group dicts into a frame with 'length' and 'is_chiitoi' columns.
len_chi_df = pd.DataFrame(len_chiitois)

In [None]:
# Display only the groups that were flagged as chiitoi tenpai.
len_chi_df[len_chi_df['is_chiitoi'].eq(1)]

# Determine discard count distribution for chiitoi tenpai hands

In [None]:
# How often each 'length' occurs among the chiitoi-tenpai groups, and the
# total number of such groups.
chiitoi_value_counts = len_chi_df.loc[len_chi_df['is_chiitoi'] == 1, 'length'].value_counts()
total_chiitoi_hands = chiitoi_value_counts.sum()

In [None]:
# Relative frequency of each length (index = length, values sum to 1);
# used below as sampling probabilities.
chiitoi_dist = chiitoi_value_counts.div(total_chiitoi_hands)

# Process hand groups

In [160]:
def id_to_tile(id):
    """Return *id* reduced modulo 34."""
    # The parameter name shadows the builtin `id`; it is kept so that keyword
    # callers keep working.
    modulus = 34
    return id % modulus

In [161]:
def player_round_to_discards(round_df, stop = None):
    """Flatten the rows of one hand group into a single list of ints.

    Parameters
    ----------
    round_df : pandas.DataFrame
        Rows of one group. Reads columns '0', '1', '2', '10'-'13', '510',
        the ``dora_columns`` and, when no stop is given, 'is_chiitoi'.
    stop : int, optional
        Number of leading rows to keep. When it is None or 0, the rows are
        instead cut right after the first row with ``is_chiitoi == 1``; if no
        row is flagged, all rows are kept.

    Returns
    -------
    list
        ``[wind, dealer, discard_num, is_riichi, dora, *discards]``. Every
        element is converted to a plain Python int, so the text written for
        this list is the same under any NumPy version (NumPy >= 2 prints
        NumPy scalars as ``np.int64(...)``).
    """
    round_df = round_df.reset_index(drop=True)
    # Truthiness is deliberate: stop=0 would leave no rows to read from, so it
    # falls through to the flag-based cut just like stop=None.
    if stop:
        round_df = round_df.iloc[:stop]
    else:
        chiitoi_hand = round_df.loc[round_df['is_chiitoi'] == 1]
        if chiitoi_hand.shape[0] != 0:
            # Keep everything up to and including the first flagged row.
            round_df = round_df.iloc[:chiitoi_hand.index[0] + 1]

    # Column '2' is used both for the dealer comparison and to pick one of
    # the four riichi columns. NOTE(review): presumably the player's seat.
    seat = round_df['2'].iloc[0]

    wind = [int(round_df['0'].iloc[0])]

    # 1 when columns '1' and '2' agree on the first row, else 0.
    dealer = [int(round_df['1'].iloc[0] == seat)]

    discard_num = [round_df.shape[0]]

    # Last row's riichi columns, re-indexed 0..3 so `seat` selects one of them.
    last_riichi = round_df.iloc[-1][['10', '11', '12', '13']].reset_index(drop=True)
    is_riichi = [int(last_riichi[seat])]

    # Label of the first dora column set to 1 on the first row, mapped through
    # id_to_tile.
    dora_series = round_df[dora_columns].iloc[0]
    dora = [id_to_tile(int(dora_series.loc[dora_series == 1].index[0]))]

    discards = round_df['510'].to_list()

    return wind + dealer + discard_num + is_riichi + dora + discards

In [162]:
def handle_hand_group(hand_group, dist_chiitois):
    """Turn one hand group into ``(sequence, label)``.

    Groups with at least one ``is_chiitoi`` row get label 1 and are cut by
    ``player_round_to_discards`` itself. All other groups get label 0 and are
    cut at a length drawn from *dist_chiitois* (index = candidate lengths,
    values = probabilities).
    """
    if hand_group['is_chiitoi'].sum() > 0:
        return player_round_to_discards(hand_group), 1

    # No tenpai row: sample a stop so the sequence length follows the same
    # distribution as the tenpai groups.
    sampled_stop = np.random.choice(dist_chiitois.index, p=dist_chiitois)
    return player_round_to_discards(hand_group, stop=sampled_stop), 0

In [163]:
def generate_hand_group_dict(hand_group, dist_chiitois):
    """Build the export record for one hand group.

    Returns a dict with the group's ``group_id`` (taken from its first row),
    the flattened sequence under ``'seq'`` and the 0/1 label under
    ``'is_chiitoi'``.
    """
    sequence, label = handle_hand_group(hand_group, dist_chiitois)
    record = {
        'group_id': hand_group['group_id'].iloc[0],
        'seq': sequence,
        'is_chiitoi': label,
    }
    return record

In [None]:
# One export record per hand group, computed on all available cores.
full_hands = Parallel(n_jobs=-1)(
    delayed(generate_hand_group_dict)(group, chiitoi_dist)
    for _key, group in tqdm(iter(hand_groups))
)

# Export

In [165]:
# Collect the per-group records into a frame with 'group_id', 'seq' and
# 'is_chiitoi' columns.
full_hands_df = pd.DataFrame(full_hands)

In [171]:
# Write the result to the working directory without the row index.
full_hands_df.to_csv('full_hands_df.csv', index=False)