In [53]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Dataset
total_data = pd.read_csv('../data/interim/player_match_data.csv')
total_data.head()

Unnamed: 0,adr,assists,deaths,fkdiff,hs,kdratio,kills,rating,match_id,player_id,...,team_2_score,team_name,data_unix,map,hour,day,week,month,year,weekday
0,163.2,3,10,1,10,90.0%,32,2.44,32227,5736,...,16,g3x,1467476700000,Train,13,2,26,7,2016,5
1,81.0,3,6,1,5,75.0%,17,1.55,32227,2532,...,16,g3x,1467476700000,Train,13,2,26,7,2016,5
2,77.6,3,10,1,11,75.0%,16,1.41,32227,7382,...,16,g3x,1467476700000,Train,13,2,26,7,2016,5
3,77.0,2,10,-1,6,85.0%,14,1.38,32227,5698,...,16,g3x,1467476700000,Train,13,2,26,7,2016,5
4,61.2,4,12,4,4,85.0%,10,1.16,32227,10563,...,16,g3x,1467476700000,Train,13,2,26,7,2016,5


In [54]:
    # Variable 1: player_team_id 

# 'player_team_id' function
def find_most_common_team_number(group):
        all_team_ids = pd.concat([group['team_1_id'], group['team_2_id']])
        team_numbers = all_team_ids.dropna().astype(int)
        most_common_team_number = np.argmax(np.bincount(team_numbers))
        return most_common_team_number

# Applying function to dataset
total_data['player_team_id'] = total_data.groupby('team_name').apply(find_most_common_team_number).reindex(total_data['team_name']).values

# ----------------------------------------------------------------------------------------------------------------------------

    # Variable 2: winning_team

# 'winning_team' function
def get_winning_team(team_1_score, team_2_score):
        if team_1_score == team_2_score: return 0
        elif team_1_score > team_2_score: return 1
        else: return 2

# Aapplying function to dataset
total_data['winning_team'] = total_data.apply(lambda row: get_winning_team(row['team_1_score'], row['team_2_score']), axis=1)

# ----------------------------------------------------------------------------------------------------------------------------

    # Variable 3: winning_team_id 

# Creating column called 'winning_team_id' (doesn't need function)
total_data['winning_team_id'] = np.where(total_data['winning_team'] == 1, total_data['team_1_id'],
                                np.where(total_data['winning_team'] == 2, total_data['team_2_id'], 0))

# ----------------------------------------------------------------------------------------------------------------------------

    # Variable 4: player_has_won 

# Creating column called 'player_has_won' (doesn't need function)
total_data['player_has_won'] = np.where(total_data['winning_team_id'] == total_data['player_team_id'], 1, 0)

# ----------------------------------------------------------------------------------------------------------------------------

# Seleccting target variables

target_1 = 'winning_team'
target_2 = 'player_has_won'

# Show small overview of the dataset
total_data.head(3)


Unnamed: 0,adr,assists,deaths,fkdiff,hs,kdratio,kills,rating,match_id,player_id,...,hour,day,week,month,year,weekday,player_team_id,winning_team,winning_team_id,player_has_won
0,163.2,3,10,1,10,90.0%,32,2.44,32227,5736,...,13,2,26,7,2016,5,6621,2,6621,1
1,81.0,3,6,1,5,75.0%,17,1.55,32227,2532,...,13,2,26,7,2016,5,6621,2,6621,1
2,77.6,3,10,1,11,75.0%,16,1.41,32227,7382,...,13,2,26,7,2016,5,6621,2,6621,1


In [55]:
unique_teams_1 = total_data[['team_1_id', 'team_1_name']].rename(columns={'team_1_id': 'team_id', 'team_1_name': 'team_name'})
unique_teams_2 = total_data[['team_2_id', 'team_2_name']].rename(columns={'team_2_id': 'team_id', 'team_2_name': 'team_name'})
unique_teams = pd.concat([unique_teams_1, unique_teams_2]).drop_duplicates().reset_index(drop=True)
unique_teams.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5256 entries, 0 to 5255
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   team_id    5256 non-null   int64 
 1   team_name  5256 non-null   object
dtypes: int64(1), object(1)
memory usage: 82.3+ KB


In [56]:
total_data = pd.merge(total_data, unique_teams, left_on='team_name', right_on='team_name', how='left')


total_data = total_data.drop('team_name', axis=1)
total_data['team_id'] = total_data['team_id'].fillna(-1).astype(int)
total_data.head()

Unnamed: 0,adr,assists,deaths,fkdiff,hs,kdratio,kills,rating,match_id,player_id,...,day,week,month,year,weekday,player_team_id,winning_team,winning_team_id,player_has_won,team_id
0,163.2,3,10,1,10,90.0%,32,2.44,32227,5736,...,2,26,7,2016,5,6621,2,6621,1,6621
1,81.0,3,6,1,5,75.0%,17,1.55,32227,2532,...,2,26,7,2016,5,6621,2,6621,1,6621
2,77.6,3,10,1,11,75.0%,16,1.41,32227,7382,...,2,26,7,2016,5,6621,2,6621,1,6621
3,77.0,2,10,-1,6,85.0%,14,1.38,32227,5698,...,2,26,7,2016,5,6621,2,6621,1,6621
4,61.2,4,12,4,4,85.0%,10,1.16,32227,10563,...,2,26,7,2016,5,6621,2,6621,1,6621


In [57]:
total_data = total_data[total_data['team_id'] != -1]

In [58]:
total_data["adr"] = pd.to_numeric(total_data["adr"], errors="coerce")
total_data["kdratio"] = (
    pd.to_numeric(total_data["kdratio"].str.rstrip("%"), errors="coerce") / 100.0
)

total_data['adr'] = total_data.groupby('player_id')['adr'].transform(lambda x: x.fillna(x[x.notna()].mean()))
total_data['kdratio'] = total_data.groupby('player_id')['kdratio'].transform(lambda x: x.fillna(x[x.notna()].mean()))


null_rows = total_data.loc[total_data.isnull().any(axis=1)]

# Deleting rows with null values
total_data = total_data.dropna()


In [59]:

total_data.drop(['hour', 'day', 'week', 'month', 'weekday', 'year', 'rating', 'team_1_name', 'team_2_name'], axis=1, inplace=True)
total_data.drop(['team_1_score', 'team_2_score', 'winning_team_id', 'player_has_won'], axis=1, inplace=True)
total_data.head()

Unnamed: 0,adr,assists,deaths,fkdiff,hs,kdratio,kills,match_id,player_id,team_1_id,team_2_id,data_unix,map,player_team_id,winning_team,team_id
0,163.2,3,10,1,10,0.9,32,32227,5736,6619,6621,1467476700000,Train,6621,2,6621
1,81.0,3,6,1,5,0.75,17,32227,2532,6619,6621,1467476700000,Train,6621,2,6621
2,77.6,3,10,1,11,0.75,16,32227,7382,6619,6621,1467476700000,Train,6621,2,6621
3,77.0,2,10,-1,6,0.85,14,32227,5698,6619,6621,1467476700000,Train,6621,2,6621
4,61.2,4,12,4,4,0.85,10,32227,10563,6619,6621,1467476700000,Train,6621,2,6621


In [60]:
# Group by MatchID and TeamID, then calculate average stats and collect PlayerIDs
team_stats = total_data.groupby(['match_id', 'team_id']).agg({
    'player_id': lambda x: frozenset(x),
    'adr': 'mean',
    'assists': 'sum',
    'deaths': 'sum',
    'fkdiff': 'sum',
    'hs': 'sum',
    'kdratio': 'mean',
    'kills': 'sum',
}).reset_index()

# Merge with the original DataFrame to get PlayerIDs
total_data = pd.merge(total_data, team_stats, on=['match_id', 'team_id'], suffixes=('_player', '_team'))

# # Drop unnecessary columns and duplicates
total_data.drop(
    [
        'adr_player', 
        'assists_player',
        'deaths_player',
        'fkdiff_player',
        'hs_player',
        'kdratio_player',
        'kills_player',
        'player_id_player'
        ], 
    axis=1,
    inplace=True
    )
total_data = total_data.groupby(['match_id', 'team_id']).first().reset_index()



total_data.head(10)

Unnamed: 0,match_id,team_id,team_1_id,team_2_id,data_unix,map,player_team_id,winning_team,player_id_team,adr_team,assists_team,deaths_team,fkdiff_team,hs_team,kdratio_team,kills_team
0,12838,4411,4411,4443,1347562800000,Inferno_se,4411,1,"(884, 7148, 29, 39)",73.369221,0,38,6,0,0.691815,73
1,12838,4443,4411,4443,1347562800000,Inferno_se,4443,1,(7150),43.525,0,19,-7,0,0.495,5
2,12839,4411,4411,4443,1347562800000,Mirage_ce,4411,1,"(884, 7148, 29, 39)",73.369221,0,30,11,0,0.691815,64
3,12839,4443,4411,4443,1347562800000,Mirage_ce,4443,1,(7150),43.525,0,17,-5,0,0.495,10
4,12840,4444,4444,4445,1347562800000,Inferno_se,7105,1,"(6796, 7154, 7156, 7158)",74.894429,0,26,3,0,0.67792,63
5,12840,4445,4444,4445,1347562800000,Inferno_se,4445,1,"(7160, 7161, 7162, 7163)",65.499279,0,65,-2,0,0.66103,33
6,12840,7105,4444,4445,1347562800000,Inferno_se,7105,1,"(6796, 7154, 7156, 7158)",74.894429,0,26,3,0,0.67792,63
7,12841,4444,4445,4444,1347562912000,Dust2_se,7105,2,"(6796, 7154, 7156, 7158)",74.894429,0,43,8,0,0.67792,73
8,12841,4445,4445,4444,1347562912000,Dust2_se,4445,2,"(7160, 7161, 7162, 7163)",65.499279,0,78,-7,0,0.66103,47
9,12841,7105,4445,4444,1347562912000,Dust2_se,7105,2,"(6796, 7154, 7156, 7158)",74.894429,0,43,8,0,0.67792,73


In [61]:
print(total_data.size)

# Add a new column with the size of frozenset
total_data['team_size'] = total_data['player_id_team'].apply(len)

# Sort the DataFrame by the size of frozenset
total_data = total_data.sort_values(by='team_size')

match_ids = total_data[total_data['team_size'] == 5]['match_id'].to_list()

total_data = total_data[total_data['match_id'].isin(match_ids)]

# Drop the temporary 'team_size' column if needed
total_data.drop(columns=['team_size'], inplace=True)

total_data.size


3554400


3472672

In [62]:
# # Rename columns for clarity
# total_data = total_data.rename(columns={'player_id_team': 'team_members', 'Headshots_team': 'Avg_Headshots', 'PlayerID': 'TeamMembers'})
total_data.sort_values(by='match_id')
total_data.head(10)

Unnamed: 0,match_id,team_id,team_1_id,team_2_id,data_unix,map,player_team_id,winning_team,player_id_team,adr_team,assists_team,deaths_team,fkdiff_team,hs_team,kdratio_team,kills_team
1873,13850,4603,4500,4532,1361656800000,Dust2_se,4500,2,(7264),56.886364,3,19,1,6,0.67375,16
34026,27347,6559,6560,6559,1450024200000,Cache,6559,1,(10512),75.527273,1,17,0,7,0.676909,15
1871,13850,4500,4500,4532,1361656800000,Dust2_se,4500,2,(7264),56.886364,3,19,1,6,0.67375,16
34991,27939,6712,5974,6712,1455483600000,Cobblestone,5974,1,(10615),76.25,1,20,-2,5,0.7235,16
23281,22356,6114,4445,6114,1439838000000,Cache,6114,1,(7605),70.99,4,18,-3,4,0.6915,7
4251,14727,4448,4532,4914,1369764000000,Dust2_se,4914,1,(7708),84.098462,3,20,3,6,0.729538,19
4253,14727,4683,4532,4914,1369764000000,Dust2_se,4914,1,(7708),84.098462,3,20,3,6,0.729538,19
4257,14727,5789,4532,4914,1369764000000,Dust2_se,4914,1,(7708),84.098462,3,20,3,6,0.729538,19
4255,14727,4914,4532,4914,1369764000000,Dust2_se,4914,1,(7708),84.098462,3,20,3,6,0.729538,19
4254,14727,4901,4532,4914,1369764000000,Dust2_se,4914,1,(7708),84.098462,3,20,3,6,0.729538,19


# Model Training

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate

# Assume you have a DataFrame with a time series dataset
# df is your DataFrame with 10 columns, and you want to predict 'target_column'
# 'feature1' and 'feature2' are the primary input features for prediction
# 'other_feature1' through 'other_feature7' are additional columns to be included

# Split the data into training and testing sets
train_data, test_data = train_test_split(total_data, test_size=0.2, shuffle=False)

# Standardize the data
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Separate features and target variable
X_train_main = train_data_scaled[:, :2]          # Primary input features: feature1, feature2
X_train_additional = train_data_scaled[:, 2:8]   # Additional input features: other_feature1 through other_feature7
y_train = train_data_scaled[:, 8]                # Target variable: target_column

X_test_main = test_data_scaled[:, :2]            # Primary input features for testing
X_test_additional = test_data_scaled[:, 2:8]     # Additional input features for testing
y_test = test_data_scaled[:, 8]                  # True values for testing

# Build the Keras model with multiple inputs
main_input = Input(shape=(2,), name='main_input')
additional_input = Input(shape=(6,), name='additional_input')

# Main pathway
x = Dense(64, activation='relu')(main_input)
x = Dense(32, activation='relu')(x)

# Additional pathway
y = Dense(32, activation='relu')(additional_input)

# Concatenate the main and additional pathways
merged = Concatenate()([x, y])

# Output layer
output_layer = Dense(1, activation='linear', name='output')(merged)

# Create the model
model = Model(inputs=[main_input, additional_input], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit([X_train_main, X_train_additional], y_train, epochs=50, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
loss = model.evaluate([X_test_main, X_test_additional], y_test)
print(f'Mean Squared Error on Test Set: {loss}')

# Make predictions
predictions = model.predict([X_test_main, X_test_additional])

# You can inverse transform the predictions if you want them in the original scale
predictions_original_scale = scaler.inverse_transform(np.hstack((X_test_main, X_test_additional, predictions)))

2024-01-31 17:38:30.577896: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-31 17:38:30.895600: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-31 17:38:30.895643: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-31 17:38:30.952213: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-31 17:38:31.063880: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-31 17:38:31.066079: I tensorflow/core/platform/cpu_feature_guard.cc:1