# In-game data prediction experiments (supportive regression on init dataset)

In [1]:
import os

import pandas
import psycopg2

# Lookup table: rank label -> two-element list [lower, upper] of MMR bounds.
# transform_to_mmr() averages the two bounds to get one MMR value per rank.
# NOTE(review): every "<tier> IW" entry has identical lower and upper bounds;
# what the "IW" suffix denotes is not evident from this notebook - confirm
# against the data source.
# NOTE(review): most neighbouring ranks share a boundary value (e.g. 154, 308),
# but Herald V ends at 769 while Guardian starts at 770, and Divine V ends at
# 5620 while Immortal IW / Immortal I start at 5621 / 5622 - verify these are
# intentional.
player_ranks_to_mmr_dict = {
  "Herald IW": [1, 1],
  "Herald I": [1, 154],
  "Herald II": [154, 308],
  "Herald III": [308, 462],
  "Herald IV": [462, 616],
  "Herald V": [616, 769],
  "Guardian IW": [770, 770],
  "Guardian I": [770, 924],
  "Guardian II": [924, 1078],
  "Guardian III": [1078, 1232],
  "Guardian IV": [1232, 1386],
  "Guardian V": [1386, 1540],
  "Crusader IW": [1540, 1540],
  "Crusader I": [1540, 1694],
  "Crusader II": [1694, 1848],
  "Crusader III": [1848, 2002],
  "Crusader IV": [2002, 2156],
  "Crusader V": [2156, 2310],
  "Archon IW": [2310, 2310],
  "Archon I": [2310, 2464],
  "Archon II": [2464, 2618],
  "Archon III": [2618, 2772],
  "Archon IV": [2772, 2926],
  "Archon V": [2926, 3080],
  "Legend IW": [3080, 3080],
  "Legend I": [3080, 3234],
  "Legend II": [3234, 3388],
  "Legend III": [3388, 3542],
  "Legend IV": [3542, 3696],
  "Legend V": [3696, 3850],
  "Ancient IW": [3850, 3850],
  "Ancient I": [3850, 4004],
  "Ancient II": [4004, 4158],
  "Ancient III": [4158, 4312],
  "Ancient IV": [4312, 4466],
  "Ancient V": [4466, 4620],
  "Divine IW": [4620, 4620],
  "Divine I": [4620, 4820],
  "Divine II": [4820, 5020],
  "Divine III": [5020, 5220],
  "Divine IV": [5220, 5420],
  "Divine V": [5420, 5620],
  "Immortal IW": [5621, 5621],
  "Immortal I": [5622, 5820],
  "Immortal II": [5820, 6020],
  "Immortal III": [6020, 6320],
  "Immortal IV": [6320, 6620]
}

def transform_to_mmr(player_rank):
    """Convert a rank label into a single MMR figure.

    Looks up the ``[lower, upper]`` bounds stored for ``player_rank`` in
    ``player_ranks_to_mmr_dict`` and returns their arithmetic mean as a float.

    Raises:
        KeyError: if ``player_rank`` is not a key of the lookup table.
    """
    lower_bound, upper_bound = player_ranks_to_mmr_dict[player_rank]
    return (lower_bound + upper_bound) / 2

# SQL used to build the working dataset: every column of match_stats (ms),
# joined with the per-hero pick/win rate columns from heroes (h) and the
# match-level columns from matches (m, aliased with a "match_" prefix).
# Both joins are inner joins, so match_stats rows without a matching hero or
# match row are not returned.
query = """
    SELECT
    ms.*,
    h.name_local as hero_name_local,
    h.hero_pickrate_average,
    h.hero_winrate_average,
    h.hero_pickrate_up_to_crusader,
    h.hero_winrate_up_to_crusader,
    h.hero_pickrate_archon,
    h.hero_winrate_archon,
    h.hero_pickrate_legend,
    h.hero_winrate_legend,
    h.hero_pickrate_ancient,
    h.hero_winrate_ancient,
    h.hero_pickrate_divine_immortal,
    h.hero_winrate_divine_immortal,
    m.datetime as match_datetime,
    m.radiant_win as match_radiant_win,
    m.duration as match_duration,
    m.radiant_score as match_radiant_score,
    m.dire_score as match_dire_score
FROM
    match_stats ms
JOIN
    heroes h ON ms.hero_id = h.id
JOIN
    matches m ON ms.match_id = m.id;

"""

# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# are the original (redacted) placeholder values, so behaviour is unchanged
# when no variable is set.
conn = psycopg2.connect(
    dbname=os.environ.get('DOTA_DB_NAME', 'dota_ai_od'),
    user=os.environ.get('DOTA_DB_USER', 'limited_user'),
    password=os.environ.get('DOTA_DB_PASSWORD', '*removed*'),
    host=os.environ.get('DOTA_DB_HOST', '*azure*'),
    port=os.environ.get('DOTA_DB_PORT', '5432'),
)

try:
    # Load the joined match_stats / heroes / matches rows into a DataFrame.
    df = pandas.read_sql_query(query, conn)
finally:
    # Always release the connection, even when the query raises.
    conn.close()

df['player_side'] = df['player_side'].astype('category')

  df = pandas.read_sql_query(query, conn)


In [2]:
# Target dtype for every column whose type is pinned explicitly.
column_dtypes = {
    'match_id': 'int64',
    'player_nickname': str,
    'player_side': 'category',
    'player_lasthits': 'int64',
    'player_denies': 'int64',
    'hero_name_local': str,
    'player_match_rank_initial': 'category',
    'player_id': 'int64',
}

# Cast column by column, in the order listed above.
for column_name, target_dtype in column_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

In [3]:
# Numeric MMR per row: midpoint of the MMR range of the player's rank label.
rank_midpoints = df['player_match_rank_initial'].apply(transform_to_mmr)
df['player_match_rank_initial_mmr'] = rank_midpoints.astype('float64')

# Winrate / pickrate columns whose name does not mention "hero".
non_hero_columns = [name for name in df.columns if 'hero' not in name.lower()]
winrate_columns = [name for name in non_hero_columns if 'winrate' in name.lower()]
pickrate_columns = [name for name in non_hero_columns if 'pickrate' in name.lower()]

# Scale the selected columns by 100 (presumably fraction -> percent; confirm
# against the database). The scaling is applied in place, so re-running this
# cell without reloading `df` multiplies the values again.
for rate_columns in (winrate_columns, pickrate_columns):
    df[rate_columns] *= 100

# This column contains "hero" in its name, so it is scaled separately.
df['player_hero_winrate_overall'] *= 100

df.head(100)

Unnamed: 0,match_id,player_id,hero_id,player_q_mmr_diff,player_hero_winrate_overall,player_hero_total_matches_played,dire_winrate_all_time,dire_games_played_all_time,radiant_winrate_all_time,radiant_games_played_all_time,...,hero_pickrate_ancient,hero_winrate_ancient,hero_pickrate_divine_immortal,hero_winrate_divine_immortal,match_datetime,match_radiant_win,match_duration,match_radiant_score,match_dire_score,player_match_rank_initial_mmr
0,7398770906,363104336,54,-154,50.000000,4,52.890176,346,50.381680,393,...,19.0338,54.9159,19.3351,53.8362,2023-10-25 14:14:53,False,3175,55,54,1925.0
1,7398770906,204634639,100,154,42.553192,47,48.453608,291,52.365930,317,...,6.1267,47.2815,7.5739,48.2371,2023-10-25 14:14:53,False,3175,55,54,2233.0
2,7398770906,144595206,36,-462,52.252250,333,45.172307,3221,51.850670,3134,...,8.2786,49.4327,5.0972,47.9513,2023-10-25 14:14:53,False,3175,55,54,1617.0
3,7398770906,167763814,84,-308,57.425743,101,47.641712,2311,51.565146,2364,...,9.6141,50.8513,6.1195,49.6469,2023-10-25 14:14:53,False,3175,55,54,1771.0
4,7398770906,1514889200,85,0,58.695650,46,44.047618,504,56.448203,473,...,6.5329,52.2018,6.5435,50.8980,2023-10-25 14:14:53,False,3175,55,54,2079.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7418150617,385421629,76,-1371,30.434780,23,43.255815,645,48.826292,639,...,6.7161,50.0457,6.1592,49.4761,2023-11-04 17:03:13,True,1616,39,17,692.5
96,7418150617,879201523,138,15,59.375000,64,47.456214,1199,54.567310,1248,...,3.5713,45.5668,3.6903,47.0845,2023-11-04 17:03:13,True,1616,39,17,2079.0
97,7418150617,136311244,137,1401,62.068963,29,46.631017,1870,53.201133,1765,...,5.2408,47.5735,7.0162,49.2753,2023-11-04 17:03:13,True,1616,39,17,3465.0
98,7418150617,136561541,99,477,28.571430,21,47.729987,1674,52.736900,1699,...,5.7045,50.0346,5.3149,50.0514,2023-11-04 17:03:13,True,1616,39,17,2541.0


In [4]:
def get_hero_pickrate_winrate_for_rank(group):
    """Add rank-bracket-specific hero pick/win rate columns to one match group.

    The row whose ``player_q_mmr_diff`` is closest to zero is taken as the
    reference player. The first word of that player's
    ``player_match_rank_initial`` label (the tier) selects which pair of
    ``hero_pickrate_<bracket>`` / ``hero_winrate_<bracket>`` columns is copied,
    for every row of the group, into ``hero_pickrate_for_rank`` and
    ``hero_winrate_for_rank``.

    Tier to bracket mapping:
        * Herald / Guardian / Crusader -> ``up_to_crusader``
        * Divine / Immortal            -> ``divine_immortal``
        * any other tier               -> the lower-cased tier name
          (``archon``, ``legend``, ``ancient``)

    Args:
        group: DataFrame holding the rows of a single group (as passed by
            ``DataFrame.groupby(...).apply``). It must contain
            ``player_q_mmr_diff``, ``player_match_rank_initial`` and the
            per-bracket hero rate columns.

    Returns:
        A new DataFrame with the same rows plus the two ``*_for_rank``
        columns. The input frame is not modified.

    Raises:
        KeyError: if the bracket columns for the resolved tier are missing.
    """
    closest_player_idx = group['player_q_mmr_diff'].abs().idxmin()
    rank_label = group.loc[closest_player_idx, 'player_match_rank_initial']
    tier = rank_label.split(' ')[0].lower()

    # The hero statistics are bucketed more coarsely than the rank tiers: the
    # three lowest tiers share one bucket and so do the two highest ones.
    # Without the divine/immortal mapping the lookup below would ask for
    # 'hero_pickrate_divine' / 'hero_pickrate_immortal', which do not exist.
    if tier in ('herald', 'guardian', 'crusader'):
        bracket = 'up_to_crusader'
    elif tier in ('divine', 'immortal'):
        bracket = 'divine_immortal'
    else:
        bracket = tier

    # assign() returns a new frame, so the group handed in by groupby.apply is
    # left untouched.
    return group.assign(
        hero_pickrate_for_rank=group[f'hero_pickrate_{bracket}'],
        hero_winrate_for_rank=group[f'hero_winrate_{bracket}'],
    )

# Per match, attach the hero pick/win rates of the bracket chosen by
# get_hero_pickrate_winrate_for_rank.
df_with_pickrate = df.groupby('match_id').apply(get_hero_pickrate_winrate_for_rank)

# Combined pick-confidence score: allies + enemies.
df_with_pickrate['player_heroes_pick_confidence_score_total'] = (
    df_with_pickrate['player_heroes_pick_confidence_score_allies']
    + df_with_pickrate['player_heroes_pick_confidence_score_enemies']
)

# Share of a player's matches that were abandoned, in percent.
df_with_pickrate['player_matches_abandonment_rate'] = (
    df_with_pickrate['player_matches_abandoned']
    / df_with_pickrate['player_all_matches_played_number']
) * 100

# Replace whatever index groupby.apply produced with a plain 0..n-1 index
# before grouping again below (rebinding instead of mutating in place).
df_with_pickrate = df_with_pickrate.reset_index(drop=True)

# Broadcast the first player_has_won value of every (match, side) pair to all
# rows of that pair. 'player_side' is categorical, so observed=True is passed
# explicitly: only combinations present in the data are grouped, which avoids
# relying on the deprecated default and leaves the transform result unchanged.
df_with_pickrate['has_team_won'] = (
    df_with_pickrate
    .groupby(['match_id', 'player_side'], observed=True)['player_has_won']
    .transform('first')
)

  df_with_pickrate = df.groupby('match_id').apply(get_hero_pickrate_winrate_for_rank)
  df_with_pickrate['has_team_won'] = df_with_pickrate.groupby(['match_id', 'player_side'])['player_has_won'].transform('first')


In [5]:
# Preview the first 100 rows of the enriched frame, including the
# hero_*_for_rank, confidence-total, abandonment-rate and has_team_won columns.
df_with_pickrate.head(100)

Unnamed: 0,match_id,player_id,hero_id,player_q_mmr_diff,player_hero_winrate_overall,player_hero_total_matches_played,dire_winrate_all_time,dire_games_played_all_time,radiant_winrate_all_time,radiant_games_played_all_time,...,match_radiant_win,match_duration,match_radiant_score,match_dire_score,player_match_rank_initial_mmr,hero_pickrate_for_rank,hero_winrate_for_rank,player_heroes_pick_confidence_score_total,player_matches_abandonment_rate,has_team_won
0,7393139706,168352898,103,-631,0.000000,5,47.937410,1406,52.932550,1364,...,False,3368,41,48,2233.0,1.3819,50.2281,0,1.752190,False
1,7393139706,299921151,74,139,44.000000,25,57.120000,1250,57.441860,1290,...,False,3368,41,48,3003.0,17.7897,46.7979,0,1.633554,False
2,7393139706,356518944,7,-785,65.517240,29,57.670456,352,55.466664,375,...,False,3368,41,48,2079.0,11.1720,49.9146,0,0.434153,False
3,7393139706,132549973,104,139,66.666670,39,49.499545,1099,54.900180,1102,...,False,3368,41,48,3003.0,14.1323,49.7264,0,5.283505,False
4,7393139706,1034923441,70,601,59.375000,32,49.844238,1284,52.788390,1309,...,False,3368,41,48,3465.0,7.4670,49.4052,0,0.319872,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7393596207,318590696,21,-770,68.421054,57,54.566210,438,57.049894,461,...,True,2894,54,50,2079.0,11.8661,46.7467,0,0.455927,False
96,7393596207,1225720384,54,462,44.444445,9,48.876405,712,56.133336,750,...,True,2894,54,50,3311.0,16.2930,53.9691,0,0.392157,False
97,7393596207,407552781,2,154,63.157890,19,47.140960,1504,55.423373,1429,...,True,2894,54,50,3003.0,18.7621,53.5622,1,0.937346,False
98,7393596207,156932492,83,0,65.372170,309,48.115440,4643,53.328866,4476,...,True,2894,54,50,2849.0,4.4050,52.3476,2,3.575990,False


In [6]:
# Columns removed before modelling: identifiers, labels, timestamps and
# match-level result columns.
non_feature_columns = [
    'match_datetime',
    'hero_id',
    'player_nickname',
    'player_match_rank_initial',
    'average_match_mmr',
    'match_id',
    'player_id',
    'player_has_won',
    'match_radiant_win',
    'match_duration',
    'match_radiant_score',
    'match_dire_score',
    'player_side',
    'hero_name_local',
    'has_team_won',
]

# Per-bracket hero rates are removed as well; hero_pickrate_for_rank and
# hero_winrate_for_rank are the columns that stay in the frame.
per_bracket_hero_columns = [
    'hero_pickrate_up_to_crusader',
    'hero_winrate_up_to_crusader',
    'hero_pickrate_archon',
    'hero_winrate_archon',
    'hero_pickrate_legend',
    'hero_winrate_legend',
    'hero_pickrate_ancient',
    'hero_winrate_ancient',
    'hero_pickrate_divine_immortal',
    'hero_winrate_divine_immortal',
]

df = df_with_pickrate.drop(columns=non_feature_columns + per_bracket_hero_columns)

df.describe()

Unnamed: 0,player_q_mmr_diff,player_hero_winrate_overall,player_hero_total_matches_played,dire_winrate_all_time,dire_games_played_all_time,radiant_winrate_all_time,radiant_games_played_all_time,player_heroes_pick_confidence_score_allies,player_heroes_pick_confidence_score_enemies,player_time_played_all_matches,...,player_hero_damage_average_all_matches,player_tower_damage_average_all_matches,player_hero_healing_average_all_matches,hero_pickrate_average,hero_winrate_average,player_match_rank_initial_mmr,hero_pickrate_for_rank,hero_winrate_for_rank,player_heroes_pick_confidence_score_total,player_matches_abandonment_rate
count,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,...,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0
mean,-0.002993,51.282409,112.025677,47.608343,1626.302138,53.308479,1625.710974,0.060285,0.068931,8061033.0,...,23356.802936,2682.087038,869.251196,10.528974,49.841891,2655.78639,11.008307,49.937926,0.129216,1.432761
std,664.032867,11.766337,215.277311,2.63263,1180.214991,2.664362,1179.080342,0.81418,1.359543,5854762.0,...,6465.87459,1071.18397,518.474158,5.291055,2.197483,737.244701,5.921863,2.626255,1.718393,1.689259
min,-2635.0,0.0,1.0,16.666667,6.0,28.57143,7.0,-4.0,-6.0,31807.0,...,8510.794,289.05884,0.0,1.27128,43.8774,77.5,0.8105,41.8409,-8.0,0.0
25%,-416.0,46.0,22.0,46.086619,738.0,51.829972,736.0,0.0,-1.0,3632462.0,...,20210.30825,1911.763125,567.622202,6.68138,47.88934,2233.0,6.8363,47.7176,-1.0,0.53538
50%,-31.0,51.68539,51.0,47.330353,1369.0,53.106576,1369.0,0.0,0.0,6784253.0,...,23146.551,2545.68335,758.883455,9.50872,49.92124,2695.0,9.915,50.0384,0.0,1.008742
75%,416.0,57.14286,116.0,48.818317,2236.0,54.545456,2241.0,0.0,1.0,11103770.0,...,26223.82825,3285.86875,1024.871325,13.17598,51.45366,3157.0,14.1847,52.1246,1.0,1.769088
max,3172.0,100.0,7250.0,81.81818,15156.0,81.50685,15377.0,5.0,9.0,76614880.0,...,932722.44,9858.976,10920.932,28.75738,54.9073,5621.0,29.9327,55.5378,14.0,71.830986


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Per-match player stat columns removed from the frame before modelling
# (presumably in-game outcomes that would leak the target - TODO confirm).
columns_to_drop = [
    'player_xpm',
    'player_denies',
    'player_lasthits',
    'player_assists_number',
    'player_kills_number',
    'player_net',
    'hero_lvl',
    'player_deaths_number',
]

# Keep only the first 10000 rows, then remove the columns listed above.
first_rows = df.head(10000)
temp_df = first_rows.drop(columns=columns_to_drop)

temp_df.head(10)

Unnamed: 0,player_q_mmr_diff,player_hero_winrate_overall,player_hero_total_matches_played,dire_winrate_all_time,dire_games_played_all_time,radiant_winrate_all_time,radiant_games_played_all_time,player_heroes_pick_confidence_score_allies,player_heroes_pick_confidence_score_enemies,player_time_played_all_matches,...,player_hero_damage_average_all_matches,player_tower_damage_average_all_matches,player_hero_healing_average_all_matches,hero_pickrate_average,hero_winrate_average,player_match_rank_initial_mmr,hero_pickrate_for_rank,hero_winrate_for_rank,player_heroes_pick_confidence_score_total,player_matches_abandonment_rate
0,-631,0.0,5,47.93741,1406,52.93255,1364,0,0,6606301,...,27282.379,3840.0867,189.48804,1.65384,50.79214,2233.0,1.3819,50.2281,0,1.75219
1,139,44.0,25,57.12,1250,57.44186,1290,0,0,6144606,...,27830.834,3779.7305,665.0613,16.13098,47.44248,3003.0,17.7897,46.7979,0,1.633554
2,-785,65.51724,29,57.670456,352,55.466664,375,0,0,1838111,...,15426.104,890.8892,1380.3566,11.20572,50.5111,2079.0,11.172,49.9146,0,0.434153
3,139,66.66667,39,49.499545,1099,54.90018,1102,0,0,5247812,...,26371.805,4444.607,1363.435,12.08786,49.25336,3003.0,14.1323,49.7264,0,5.283505
4,601,59.375,32,49.844238,1284,52.78839,1309,0,0,6457179,...,22130.459,2527.6013,812.3255,8.01404,49.80836,3465.0,7.467,49.4052,0,0.319872
5,-477,62.26415,106,46.742857,875,52.662724,845,1,0,4258488,...,25070.113,2954.2983,1113.5082,16.88724,49.62226,2387.0,16.4077,49.5248,1,1.498929
6,-323,27.272728,11,48.48,1250,51.56647,1181,1,0,5937946,...,28541.14,2986.3167,588.9391,8.46242,49.90816,2541.0,8.4002,50.3744,1,1.272149
7,755,41.17647,34,50.42392,2241,51.577955,2123,1,0,10538873,...,25304.502,3413.4014,1398.3538,9.50872,52.0656,3619.0,8.5947,52.3047,1,0.800971
8,601,50.68493,73,47.90287,3171,52.80899,3204,1,0,15674693,...,25427.8,2425.9143,961.92395,9.74936,47.3517,3465.0,8.804,46.922,1,1.35064
9,-15,55.6701,97,51.09489,1918,51.021504,1860,0,1,9263806,...,26473.541,3892.5752,754.53485,17.28832,53.7735,2849.0,16.293,53.9691,1,1.644737


In [8]:
# Target is player_gpm; every remaining column of temp_df is a feature.
target_column = 'player_gpm'
y = temp_df[target_column]
X = temp_df.drop(columns=[target_column])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and training are chained.
rf_model = RandomForestRegressor(n_estimators=80, random_state=42).fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Evaluate on the held-out 20%.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for metric_label, metric_value in (('Mean Squared Error:', mse), ('R-squared:', r2)):
    print(metric_label, metric_value)

Mean Squared Error: 14800.631810781251
R-squared: 0.30094965159054066


## Conclusion

The LinearRegression-based experiments conducted within the scope of this notebook (one of which is presented above) were unsuccessful.
Using a more advanced model might have given somewhat better results on this non-linear data, but that approach would go beyond the scope of my research.

Additionally, I doubt that the results, even with more advanced models, would be any better than around 50% accuracy.