In [1]:
import pandas as pd
import sys
from sklearn.pipeline import Pipeline
import numpy as np
import string
import os
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



In [2]:
games = pd.read_csv('../data/games.csv', index_col='game_id')
train = pd.read_csv('../data/train.csv', index_col='game_id')
turns = pd.read_csv('../data/turns.csv', index_col='game_id')

In [3]:
os.path.abspath('..')
sys.path.append(os.path.abspath('..'))


In [65]:
import functions
import transformers
import processors
import importlib
import builders

importlib.reload(functions)
importlib.reload(transformers)
importlib.reload(processors)
importlib.reload(builders)

import searcher
import searcher_results_orgenizer

from searcher import Searcher
from searcher_results_orgenizer import SearcherResultsOrgenizer

from transformers.columns_setter_transformer import ColumnsSetterTransformer
from transformers.extract_set_column_transformer import ExtractSetColumnsTransformer
from transformers.name_dropper_transformer import NameDropperTransformer
from transformers.select_transformer import SelectTransformer
from transformers.select_rows_transformer import SelectRowsTransformer
from transformers.series_from_group_transformer import SeriesFromGroupTransformer
from transformers.map_set_transformer import MapSetTransformer
from transformers.one_hot_encoder_transformer import OneHotEncoderTransformer
from transformers.add_to_dict_transformer import AddToDictTransformer
from transformers.get_from_dict_transformer import GetFromDictTransformer
from transformers.select_pos_rows_transformer import SelectPosRowsTransformer


from functions.bot_extractor import BotExtarctor
from functions.is_bot_extractor import IsBotExtarctor

from processors.basic_pre_processor import BasicPreProcessor

from builders.preprocessor_builder import PreprocessorBuilder
from builders.reg_pipe_builder import RegPipeBuilder


importlib.reload(functions.is_bot_extractor)
importlib.reload(functions.bot_extractor)

importlib.reload(transformers.columns_setter_transformer)
importlib.reload(transformers.extract_set_column_transformer)
importlib.reload(transformers.name_dropper_transformer)
importlib.reload(transformers.select_transformer)
importlib.reload(transformers.select_rows_transformer)
importlib.reload(transformers.series_from_group_transformer)
importlib.reload(transformers.map_set_transformer)
importlib.reload(transformers.one_hot_encoder_transformer)
importlib.reload(transformers.add_to_dict_transformer)
importlib.reload(transformers.get_from_dict_transformer)
importlib.reload(transformers.select_pos_rows_transformer)


importlib.reload(processors.basic_pre_processor)

importlib.reload(builders.preprocessor_builder)
importlib.reload(builders.reg_pipe_builder)

importlib.reload(searcher)
importlib.reload(searcher_results_orgenizer)







<module 'searcher_results_orgenizer' from 'c:\\Users\\Bina4\\Desktop\\Guy_hafifa\\scrabble\\searcher_results_orgenizer.py'>

In [None]:
def get_n_first(n: int, col: str, df: pd.DataFrame):
    df[col].groupby()
    

In [9]:
G_NAME = 'games'
T_NAME = 'turns'
DATA_NAME = 'train'

names = ['BetterBot', 'STEEBot', 'HastyBot']

prePipe = Pipeline([('get_relavent_turns', SelectRowsTransformer(train.index, target=T_NAME)),
                     ('get_relavent_games',  SelectRowsTransformer(train.index.unique(), target=G_NAME)),
                     ('train_set_is_player', ExtractSetColumnsTransformer({'is_player': IsBotExtarctor(names, 'nickname', True)},
                                                                           src=DATA_NAME, dest=DATA_NAME)),
                    ('turns_set_is_player', ExtractSetColumnsTransformer({'is_player': IsBotExtarctor(names, 'nickname', True)},
                                                                           src=T_NAME, dest=T_NAME)),  
                     ('get_bot_rating', ExtractSetColumnsTransformer({'bot_rating': lambda train: train[~train['is_player']]['rating']},
                                                                      src=DATA_NAME, dest=G_NAME)),
                     ('get_bots_names', ExtractSetColumnsTransformer({'bot_name': lambda train: train[~train['is_player']]['nickname']},
                                                                     src=DATA_NAME, dest=G_NAME)),
                    ('data_drop_bot_rating', SelectPosRowsTransformer('is_player', target=DATA_NAME)),
                    ])

featureTransformers = [('extract_first', ColumnsSetterTransformer({'is_bot_first': IsBotExtarctor(names, name_col='first')}, target=G_NAME)),
                       ('turns_word_info_mappers', MapSetTransformer({'move_len': (lambda x: len(x) if type(x) == str else np.NAN, 'move'),
                                                                      'reused_num': (lambda x: x.count('.') if type(x) == str else np.NAN, 'move'),
                                                                    # 'jokers_num': (lambda x: sum(1 for c in x if c.islower()) if (type(x) == str) and (x not in ['(challenge)', '(time)']) else np.NAN, 'move')
                                                                     }, target=T_NAME)),
                       ('points_moments', ExtractSetColumnsTransformer({'p_points_1_moment': 
                                                                      (lambda turns: turns[turns['is_player']]['points'].groupby('game_id').mean()),
                                                                        'p_points_std':
                                                                      (lambda turns: turns[turns['is_player']]['points'].groupby('game_id').std()),
                                                                        'a_poits_1_moment': 
                                                                        lambda turns: turns['points'].groupby('game_id').mean()
                                                                       }, src=T_NAME, dest=G_NAME)),
                       ('from_turns', ExtractSetColumnsTransformer({'moves_avg': 
                                                                    lambda turns: turns[turns['is_player']]['move_len'].groupby('game_id').mean(),
                                                                    'reuded_sum': 
                                                                    lambda turns: turns[turns['is_player']]['move_len'].groupby('game_id').sum()
                                                                   }, src=T_NAME, dest=G_NAME)),
                       ('hot', OneHotEncoderTransformer({'time_control_name': games['time_control_name'].unique(),
                                                          'game_end_reason':  games['game_end_reason'].unique(),
                                                          'lexicon':  games['lexicon'].unique(),
                                                          'rating_mode':  games['rating_mode'].unique(),
                                                          'bot_name': names}
                                                        , target=G_NAME)),
                       ('drops', NameDropperTransformer(['first', 'created_at', 'time_control_name', 'game_end_reason', 'lexicon', 'rating_mode', 'bot_name'], target=G_NAME)),                                              
                      ]


In [10]:
preprocessor = PreprocessorBuilder(games, G_NAME, turns, T_NAME, prePipe).build()
n_games, n_turns, n_ratings = preprocessor.process(train, DATA_NAME)


In [11]:
reg_pipe = RegPipeBuilder(n_games, G_NAME, n_turns, T_NAME, featureTransformers, RandomForestRegressor(random_state=0)).build()

In [20]:
cv_res = cross_validate(reg_pipe, n_games, n_ratings, cv=5,
                         scoring=('neg_root_mean_squared_error'),
                         return_train_score=True,)


In [21]:
cv_res

{'fit_time': array([61.85449243, 57.80915213, 54.07170725, 48.74675083, 49.76511073]),
 'score_time': array([4.77567911, 4.16277671, 3.96476746, 3.92584729, 3.58136368]),
 'test_score': array([-105.12417386, -106.42150451, -107.10097048, -106.60122195,
        -105.098516  ]),
 'train_score': array([-39.74436997, -39.51698433, -39.47580448, -39.33310896,
        -39.65112949])}

In [34]:
param_grid = {   
    'estimator__max_depth': [5, 7],
    'estimator__min_samples_split': [2]
}
search = GridSearchCV(reg_pipe, param_grid, n_jobs=-1, scoring=('neg_root_mean_squared_error'))
search.fit(n_games, n_ratings)

In [12]:
param_grid = {   
    'estimator__max_depth': [3, 5],
}
search = GridSearchCV(reg_pipe, param_grid, n_jobs=2, scoring=('r2', 'neg_root_mean_squared_error'), refit=False)
search.fit(n_games, n_ratings)

In [14]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__max_depth,params,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,...,std_test_r2,rank_test_r2,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,split2_test_neg_root_mean_squared_error,split3_test_neg_root_mean_squared_error,split4_test_neg_root_mean_squared_error,mean_test_neg_root_mean_squared_error,std_test_neg_root_mean_squared_error,rank_test_neg_root_mean_squared_error
0,12.87894,0.616863,4.56653,0.707787,3,{'estimator__max_depth': 3},0.578397,0.584362,0.575229,0.588308,...,0.004786,2,-149.613615,-147.978877,-149.054538,-147.429262,-148.000286,-148.415316,0.79744,2
1,19.334101,0.475307,4.032127,0.294967,5,{'estimator__max_depth': 5},0.671575,0.669422,0.666284,0.673461,...,0.003134,1,-132.049809,-131.971171,-132.116199,-131.30004,-130.964385,-131.680321,0.46223,1


In [22]:
grid = {
    'random_forest_1':
    {
    'estimator': RandomForestRegressor(random_state=0),
    'param_grid':
        {
        'max_depth': [4, 6],
        'max_features': ['sqrt', 'log2']
        },
    'scores': ['neg_root_mean_squared_error', 'r2']
    },
    'random_forest_2':
    {
    'estimator': RandomForestRegressor(random_state=0),
    'param_grid':
        {
        'max_depth': [2, 3],
        'max_features': ['sqrt', None]
        },
    'scores': ['neg_root_mean_squared_error']
    },
}

In [23]:
my_searcher = Searcher(reg_pipe[:-1], grid, 2)
my_searcher.search(n_games, n_ratings)

Here for each scroe we train a new model: 

In [67]:
SearcherResultsOrgenizer().orgenize(my_searcher.get_results())

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,max_features,max_depth
model_name,scorer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
random_forest_1,neg_root_mean_squared_error,6.986713,0.27692,4.263135,0.618311,-145.691808,-142.559752,-142.664066,-142.497111,-143.120901,-143.306728,1.212403,sqrt,4
random_forest_1,neg_root_mean_squared_error,5.907468,0.429673,4.088441,0.491282,-145.691808,-142.559752,-142.664066,-142.497111,-143.120901,-143.306728,1.212403,log2,4
random_forest_1,neg_root_mean_squared_error,7.125662,0.628841,4.188896,0.571498,-133.154428,-131.142826,-131.518018,-130.356627,-130.801026,-131.394585,0.959722,sqrt,6
random_forest_1,neg_root_mean_squared_error,8.321817,0.758674,5.509997,0.787103,-133.154428,-131.142826,-131.518018,-130.356627,-130.801026,-131.394585,0.959722,log2,6
random_forest_1,r2,8.069553,0.343546,5.195816,1.099059,0.60021,0.614247,0.610871,0.615393,0.612184,0.610581,0.005418,sqrt,4
random_forest_1,r2,6.384278,0.325241,4.458323,0.435902,0.60021,0.614247,0.610871,0.615393,0.612184,0.610581,0.005418,log2,4
random_forest_1,r2,7.699252,0.530351,4.170309,0.582863,0.666057,0.673559,0.669299,0.678137,0.676077,0.672626,0.004413,sqrt,6
random_forest_1,r2,7.470199,0.499251,4.410187,0.363213,0.666057,0.673559,0.669299,0.678137,0.676077,0.672626,0.004413,log2,6
random_forest_2,neg_root_mean_squared_error,6.714017,0.65823,5.083888,0.866621,-166.872265,-163.992401,-163.751527,-163.876581,-164.675848,-164.633725,1.164102,sqrt,2
random_forest_2,neg_root_mean_squared_error,11.561503,0.736187,4.23452,0.227863,-160.675893,-158.119858,-159.207012,-157.547383,-158.437539,-158.797537,1.081446,,2


Here we dont train a new model for each score:

In [36]:
my_searcher2 = Searcher(reg_pipe[:-1], grid, 2, redo_scores=False)
my_searcher2.search(n_games, n_ratings)

In [66]:
SearcherResultsOrgenizer(redo_scores=False).orgenize(my_searcher2.get_results())

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_neg_root_mean_squared_error,split1_test_neg_root_mean_squared_error,split2_test_neg_root_mean_squared_error,split3_test_neg_root_mean_squared_error,split4_test_neg_root_mean_squared_error,mean_test_neg_root_mean_squared_error,...,split0_test_r2,split1_test_r2,split2_test_r2,split3_test_r2,split4_test_r2,mean_test_r2,std_test_r2,rank_test_r2,max_features,max_depth
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
random_forest_1,6.576219,0.523663,4.103623,0.632437,-145.691808,-142.559752,-142.664066,-142.497111,-143.120901,-143.306728,...,0.60021,0.614247,0.610871,0.615393,0.612184,0.610581,0.005418,3.0,sqrt,4
random_forest_1,6.070327,0.478665,4.215966,0.655233,-145.691808,-142.559752,-142.664066,-142.497111,-143.120901,-143.306728,...,0.60021,0.614247,0.610871,0.615393,0.612184,0.610581,0.005418,3.0,log2,4
random_forest_1,7.647053,0.849659,4.494783,0.490558,-133.154428,-131.142826,-131.518018,-130.356627,-130.801026,-131.394585,...,0.666057,0.673559,0.669299,0.678137,0.676077,0.672626,0.004413,1.0,sqrt,6
random_forest_1,7.910349,0.32269,4.352757,0.527192,-133.154428,-131.142826,-131.518018,-130.356627,-130.801026,-131.394585,...,0.666057,0.673559,0.669299,0.678137,0.676077,0.672626,0.004413,1.0,log2,6
random_forest_2,5.853223,0.503367,4.803546,0.496307,-166.872265,-163.992401,-163.751527,-163.876581,-164.675848,-164.633725,...,,,,,,,,,sqrt,2
random_forest_2,12.094376,1.050423,4.534772,0.547301,-160.675893,-158.119858,-159.207012,-157.547383,-158.437539,-158.797537,...,,,,,,,,,,2
random_forest_2,6.987271,0.534861,4.508625,0.311038,-154.065204,-151.047536,-150.998049,-150.708186,-151.246658,-151.613127,...,,,,,,,,,sqrt,3
random_forest_2,14.462478,1.057209,4.91496,0.836035,-149.613615,-147.978877,-149.054538,-147.429262,-148.000286,-148.415316,...,,,,,,,,,,3
