In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
import plotly.express as px

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

In [None]:
file_path = 'E:/ML/datasets/mahjong/data/2019/block_5000.parquet'

In [None]:
states = pd.read_parquet(file_path)

In [None]:
load = states.copy()

In [None]:
len(states.loc[:,511].unique())

In [None]:
states.head(5)

In [None]:
states.loc[states[511] == '009379d9'][:5]

In [None]:
states[states.columns[:-1]] = states[states.columns[:-1]].astype(int)

In [None]:
states_dora_sum = states.loc[:, 34:67].sum(axis=1)

In [None]:
# Distribution of the number of set dora-indicator columns per state.
# Reuses the per-row sum computed in the previous cell: `dora_columns` is only
# defined much further down (and holds string labels, while the columns are
# still integer-labelled at this point), so referencing it here fails on a
# fresh kernel.
states_dora_sum.value_counts()

In [None]:
states = states.loc[states_dora_sum == 1]

In [None]:
states = states.reset_index(drop = True)

In [None]:
states.to_csv('single_dora_states.csv',index=False)

In [None]:
# Reload the filtered states. After the CSV round trip every column label is a
# string, so the id column is addressed by its label '511' rather than by the
# integer 511 (which would only match by position). Forcing `str` keeps ids
# that look numeric from being parsed as numbers.
states = pd.read_csv('single_dora_states.csv', dtype={'511': str})

# Identify Tenpai Chiitoi Hands

In [None]:
def is_chiitoi_tenpai(hand):
    """Return True when exactly six entries of ``hand`` hold a count other than 0 or 1.

    Parameters
    ----------
    hand : pandas.Series
        Per-tile counts for one hand (one entry per tile kind).

    Returns
    -------
    bool
        True if exactly six entries are neither 0 nor 1, otherwise False.

    Notes
    -----
    NOTE(review): an entry of 3 or 4 is counted as a single pair here, same as
    an entry of 2 -- confirm this matches the intended definition of tenpai.
    """
    holds_pair = ~hand.isin([0, 1])
    return int(holds_pair.sum()) == 6

In [None]:
all_hands = states[states.columns[68:102]]

In [None]:
# Vectorised equivalent of applying `is_chiitoi_tenpai` to every row: a row is
# flagged when exactly six of its tile counts are neither 0 nor 1. Doing this
# column-wise avoids spawning one joblib task per row through `iterrows`.
# `.tolist()` keeps the result a plain list of Python bools, one per row of
# `all_hands`, as before.
chiitois = (~all_hands.isin([0, 1])).sum(axis=1).eq(6).tolist()

In [None]:
all_hands

## Only want one chiitoi per hand (the first one)

In [None]:
# For each (game id, round) pair keep the index of the first flagged state.
# `states` was reloaded from CSV earlier, so its column labels are strings;
# grouping by the integers 511 / 32 raises KeyError on a fresh top-to-bottom run.
chiitoi_index = states[chiitois].groupby(['511', '32']).apply(lambda group: group.index[0])

In [None]:
pd.Series(chiitois).value_counts()

In [None]:
chiitoi_index.to_csv('ids_of_chitoii_tenpais.csv')

In [None]:
chiitoi_index = pd.read_csv('ids_of_chitoii_tenpais.csv')

In [None]:
states['group_id'] = states['511'].astype(str) + states['32'].astype(str) + states['2'].astype(str)

In [None]:
grouped_states = states.groupby('group_id')

In [None]:
total_hands = len(grouped_states.size())

In [None]:
total_hands

# Transform each chiitoi record into the needed format

We'll only focus on chiitois with one dora.

[round_number, wind, dora, discard, ...]

In [None]:
def id_to_tile(id):
    """Map a numeric id onto the range 0-33 by reducing it modulo 34.

    In this notebook it is used to turn a dora column label (34-67) into a
    zero-based position inside that 34-column block.
    """
    block_width = 34
    return id % block_width

In [None]:
# Value of column '2' for the state row that the sample chiitoi record points to.
# NOTE(review): `test_chiitoi` is only assigned in the following cell, so this
# cell raises NameError on a fresh kernel unless that cell has been run first.
states.iloc[test_chiitoi.iloc[2]]['2']

In [None]:
test_chiitoi = chiitoi_index.iloc[3]
# Get game, round, and player number
turns = states.loc[(states['511'] == test_chiitoi.iloc[0]) & (states['32'] == test_chiitoi.iloc[1]) & (states['2'] == states.iloc[test_chiitoi.iloc[2]]['2'])]

In [None]:
columns_of_interest = ['0','1'] + turns.columns[10:14].to_list() + turns.columns[34:68].to_list() + ['510']

In [None]:
turns[columns_of_interest]

## Round Wind

In [None]:
turns['0'].iloc[0]

## is Dealer

In [None]:
int((turns['1'].iloc[0] - turns['2'].iloc[0]) == 0)

## Step count

In [None]:
turns['33'].iloc[-1]

## is Riichi

In [None]:
last_riichi = turns.iloc[-1][10:14].reset_index(drop=True)

In [None]:
last_riichi[turns['2'].iloc[0]]

## Dora

In [None]:
dora_series = turns.iloc[0,34:68]

In [None]:
id_to_tile(int(dora_series.loc[dora_series==1].index[0]))

## Discard column

In [None]:
turns[columns_of_interest]['510'].to_list()

## Full Function

In [None]:
# String labels '34' .. '67' of the 34 dora-indicator columns.
dora_columns = [str(label) for label in range(34, 68)]

In [None]:
def player_round_to_discards(round_df, stop = None):
    """Flatten one player's round into a single list of features and discards.

    Parameters
    ----------
    round_df : pandas.DataFrame
        Rows (turns) of one player in one round. Must provide the columns
        '0', '1', '2', '10'-'13', '510' and every label in the module-level
        ``dora_columns``; 'is_chiitoi' is also required when ``stop`` is falsy.
    stop : int, optional
        When truthy, only the first ``stop`` rows are kept. When falsy
        (``None`` or 0), the rows are instead cut right after the first row
        whose 'is_chiitoi' equals 1, if there is one; otherwise all rows stay.

    Returns
    -------
    list
        ``[wind, dealer, n_rows, riichi, dora, *discards]`` where
        wind is column '0' of the first row, dealer is 1 when columns '1' and
        '2' of the first row are equal (else 0), n_rows is the number of rows
        kept, riichi is the last row's value among columns '10'-'13' at the
        position given by column '2', dora is ``id_to_tile`` of the first dora
        column label set to 1 in the first row, and discards are the kept
        values of column '510'.

    Raises
    ------
    KeyError
        If a required column is missing.
    IndexError
        If no rows remain, or no dora column equals 1 in the first row.
    """
    kept = round_df.reset_index(drop=True)
    if stop:
        kept = kept.iloc[:stop]
    else:
        flagged = kept.loc[kept['is_chiitoi'] == 1]
        if flagged.shape[0] != 0:
            # Index was reset above, so the label is also the row position.
            kept = kept.iloc[:flagged.index[0] + 1]

    round_wind = kept['0'].iloc[0]

    col1_first = kept['1'].iloc[0]
    seat = kept['2'].iloc[0]
    dealer_flag = int((col1_first - seat) == 0)

    n_rows = kept.shape[0]

    # Re-number the four riichi columns 0..3 so they can be looked up by seat.
    riichi_by_seat = kept.iloc[-1][['10', '11', '12', '13']].reset_index(drop=True)
    riichi_flag = riichi_by_seat[seat]

    first_row_dora = kept[dora_columns].iloc[0]
    dora_label = first_row_dora.loc[first_row_dora == 1].index[0]
    dora_tile = id_to_tile(int(dora_label))

    header = [round_wind, dealer_flag, n_rows, riichi_flag, dora_tile]
    return header + kept['510'].to_list()

In [None]:
# Spot-check the truncation rule on a single group.
# NOTE(review): `hand_groups` and the 'is_chiitoi' column are both created in
# cells further down the notebook, so this cell fails on a fresh kernel when
# the notebook is run top to bottom.
some_group = hand_groups.get_group('001ada9412').reset_index(drop=True)

# Rows of this group that are flagged with is_chiitoi == 1.
chiitoi_hand = some_group.loc[some_group['is_chiitoi']==1]

print(chiitoi_hand.shape[0])

# If any row is flagged, show the rows up to and including the first flagged one.
if chiitoi_hand.shape[0] != 0:
    print(some_group.iloc[:chiitoi_hand.index[0]+1])

In [None]:

some_group.loc[some_group['is_chiitoi']==1]

In [None]:
chiitoi_index

In [None]:
player_round_to_discards(turns)

In [None]:
turns.shape

## Get chiitoi rounds

In [None]:
states

In [None]:
chiitoi_index = chiitoi_index.merge(states['2'], how='left', left_on=['0'], right_index=True)

In [None]:
chiitoi_index

In [None]:
turns_frame = chiitoi_index.merge(states, how='left', on=['511','32','2'])

In [None]:
turns_frame = turns_frame.rename(columns={'0_x':'state_index', '0_y': '0'})

In [None]:
columns_of_interest = ['511','32','2'] + ['0', '1', '33', '10','11','12','13'] + dora_columns + ['510']

In [None]:
turns_frame[columns_of_interest]

In [None]:
turns_frame

In [None]:
turns_frame['group_id'] = turns_frame['511'].astype(str) + turns_frame['32'].astype(str) + turns_frame['2'].astype(str)

In [None]:
turn_groups = turns_frame.groupby('group_id')

## Parallel Run

In [None]:
# One record per group of `turn_groups`: the group's id plus its flattened
# sequence from `player_round_to_discards`, computed across all cores.
chiitoi_round_sequences = Parallel(n_jobs=-1)(
    delayed(
        lambda player_round: {
            'group_id': player_round['group_id'].iloc[-1],
            'seq': player_round_to_discards(player_round),
        }
    )(group_frame)
    for _group_key, group_frame in tqdm(iter(turn_groups))
)

In [None]:
chiitoi_df = pd.DataFrame(chiitoi_round_sequences)

# Determine the needed distribution of round numbers for the non chiitoi hands
We also want to pull a percentage of hands that is equal to the percentage of chiitois we have overall.

In [None]:
chiitoi_steps = chiitoi_df['seq'].apply(lambda x: x[2])

In [None]:
chiitoi_steps.max()

In [None]:
chiitoi_dist = chiitoi_steps.value_counts()/chiitoi_steps.shape[0]

In [None]:
chiitoi_dist

In [None]:
fig = px.histogram(chiitoi_steps)

In [None]:
fig.show()

# Split states into hand groups

## Add is chiitoi to states

In [None]:
states['is_chiitoi'] = 0

In [None]:
states.loc[chiitois,'is_chiitoi'] = 1

In [None]:
states['is_chiitoi'].value_counts()

In [None]:
def handle_hand_group(hand_group, dist_chiitois, rng=None):
    """Convert one hand group to a sequence and report whether it is flagged.

    Parameters
    ----------
    hand_group : pandas.DataFrame
        Rows of one group; must contain 'is_chiitoi' plus the columns that
        ``player_round_to_discards`` reads.
    dist_chiitois : pandas.Series
        Sampling distribution for the cut-off: the index holds the candidate
        stop values and the values hold their probabilities (must sum to 1).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for the cut-off draw. Defaults to NumPy's global
        random state (the previous behaviour); pass a seeded generator to make
        the draw reproducible.

    Returns
    -------
    tuple
        ``(sequence, label)``. For a group with at least one flagged row the
        sequence is ``player_round_to_discards(hand_group)`` and label is 1.
        Otherwise the group is cut at a stop value drawn from
        ``dist_chiitois`` and label is 0.
    """
    # First, determine whether this hand group contains a flagged row.
    has_chiitoi_tenpai = hand_group['is_chiitoi'].sum() > 0

    if has_chiitoi_tenpai:
        return player_round_to_discards(hand_group), 1

    # Both the legacy module-level API and Generator/RandomState expose
    # `.choice(a, p=...)`, so the call is the same whichever source is used.
    sampler = np.random if rng is None else rng
    stop_val = sampler.choice(
        np.asarray(dist_chiitois.index),
        p=np.asarray(dist_chiitois, dtype=float),
    )
    return player_round_to_discards(hand_group, stop=stop_val), 0

In [None]:
def generate_hand_group_dict(hand_group, dist_chiitois):
    """Build the output record for one hand group.

    Delegates to ``handle_hand_group`` and returns a dict with the keys
    'group_id' (taken from the group's first row), 'seq' (the flattened
    sequence) and 'is_chiitoi' (the 0/1 label).
    """
    sequence, label = handle_hand_group(hand_group, dist_chiitois)

    record = {'group_id': hand_group['group_id'].iloc[0]}
    record['seq'] = sequence
    record['is_chiitoi'] = label
    return record

In [None]:
np.random.choice(chiitoi_dist.index, p=chiitoi_dist)

In [None]:
needed_columns = ['group_id', '0', '1', '2', '33', '10', '11', '12', '13', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '510', 'is_chiitoi']

In [None]:
hand_groups = states[needed_columns].groupby('group_id')

In [None]:
hand_groups

# Full clean data

In [None]:
# Build one record per hand group across all cores; each task receives the
# group's frame together with the cut-off distribution.
full_hands = Parallel(n_jobs=-1)(
    delayed(generate_hand_group_dict)(group_frame, chiitoi_dist)
    for _group_key, group_frame in tqdm(iter(hand_groups))
)

In [None]:
full_df = pd.DataFrame(full_hands)

In [None]:
# Average sequence length among the records labelled is_chiitoi == 0.
full_df.loc[full_df['is_chiitoi'] == 0, 'seq'].map(len).mean()

Need to write a more intelligent data truncator.

In [None]:
full_df

# Data Split

# Train neural net

# Test neural net

# Repeat for various thicknesses

# Attempt to explain the thinnest-effective model