In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive
drive.mount('/content/drive')

# Load data from Google Drive
match_data = pd.read_csv('/content/drive/MyDrive/match_data.csv')
immortal_vs_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_vs_all_winrates.csv')
immortal_with_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_with_all_winrates.csv')

# Filter matches with duration < 15 minutes and remove matches with leavers
match_data = match_data[(match_data['durationSeconds'] >= 900) &
                        (match_data['leaverStatus'] == 'NONE')]

# Initialize lists for storing features
radiant_features_sparse = []
dire_features_sparse = []
match_index = {}
synergy_features = []
counter_synergy_features = []

# Iterate through each match
for idx, (match_id, group) in enumerate(match_data.groupby('match_id')):
    match_index[match_id] = idx


    radiant_heroes = np.zeros(138)
    dire_heroes = np.zeros(138)

    # Separate Radiant and Dire heroes
    for _, row in group.iterrows():
        if row['isRadiant']:
            radiant_heroes[row['heroId'] - 1] = 1
        else:
            dire_heroes[row['heroId'] - 1] = 1


    radiant_features_sparse.append(radiant_heroes)
    dire_features_sparse.append(dire_heroes)

    # Calculate synergy within Radiant and Dire teams
    radiant_synergy = 0
    dire_synergy = 0
    radiant_hero_ids = group[group['isRadiant']]['heroId'].values
    dire_hero_ids = group[~group['isRadiant']]['heroId'].values

    for i in range(len(radiant_hero_ids)):
        for j in range(i + 1, len(radiant_hero_ids)):
            synergy = immortal_with_all_winrates[(immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[i]) &
                                                 (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[j])]['WinRate']
            if not synergy.empty:
                radiant_synergy += synergy.values[0]

    for i in range(len(dire_hero_ids)):
        for j in range(i + 1, len(dire_hero_ids)):
            synergy = immortal_with_all_winrates[(immortal_with_all_winrates['HeroId1'] == dire_hero_ids[i]) &
                                                 (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[j])]['WinRate']
            if not synergy.empty:
                dire_synergy += synergy.values[0]

    # Calculate counter synergy between Radiant and Dire teams
    counter_synergy = 0
    for radiant_hero in radiant_hero_ids:
        for dire_hero in dire_hero_ids:
            counter = immortal_vs_all_winrates[(immortal_vs_all_winrates['HeroId1'] == radiant_hero) &
                                               (immortal_vs_all_winrates['HeroId2'] == dire_hero)]['WinRate']
            if not counter.empty:
                counter_synergy += counter.values[0]

    synergy_features.append(radiant_synergy - dire_synergy)
    counter_synergy_features.append(counter_synergy)


radiant_df = pd.DataFrame(radiant_features_sparse, columns=[f'radiant_hero_{i}' for i in range(1, 139)])
dire_df = pd.DataFrame(dire_features_sparse, columns=[f'dire_hero_{i}' for i in range(1, 139)])
synergy_df = pd.DataFrame({
    'match_id': list(match_index.keys()),
    'synergy': synergy_features,
    'counter_synergy': counter_synergy_features
})

final_df = pd.concat([radiant_df, dire_df], axis=1)
final_df['match_id'] = list(match_index.keys())
final_df = final_df.merge(synergy_df, on='match_id')


target = match_data.groupby('match_id').first()['didRadiantWin'].reset_index()
final_df = final_df.merge(target[['match_id', 'didRadiantWin']], on='match_id')


X = final_df.drop(columns=['didRadiantWin', 'match_id'])
y = final_df['didRadiantWin']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f'Training Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')
print(classification_report(y_test, y_pred_test))


Mounted at /content/drive
Training Accuracy: 0.6849925120201782
Test Accuracy: 0.6590479192938209
              precision    recall  f1-score   support

       False       0.66      0.63      0.65      3132
        True       0.66      0.69      0.67      3212

    accuracy                           0.66      6344
   macro avg       0.66      0.66      0.66      6344
weighted avg       0.66      0.66      0.66      6344



In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from google.colab import drive
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive
drive.mount('/content/drive')

# Load data from Google Drive
match_data = pd.read_csv('/content/drive/MyDrive/match_data.csv')
immortal_vs_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_vs_all_winrates.csv')
immortal_with_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_with_all_winrates.csv')

# Filter matches with duration < 15 minutes and remove matches with leavers
match_data = match_data[(match_data['durationSeconds'] >= 900) &
                        (match_data['leaverStatus'] == 'NONE')]

# Initialize lists for storing features
radiant_features_sparse = []
dire_features_sparse = []
match_index = {}
synergy_features = []
counter_synergy_features = []

# Iterate through each match
for idx, (match_id, group) in enumerate(match_data.groupby('match_id')):
    match_index[match_id] = idx

    # Initialize feature vectors for Radiant and Dire teams
    radiant_heroes = np.zeros(138)
    dire_heroes = np.zeros(138)

    # Separate Radiant and Dire heroes
    for _, row in group.iterrows():
        if row['isRadiant']:
            radiant_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing
        else:
            dire_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing

    # Append the features to the list
    radiant_features_sparse.append(radiant_heroes)
    dire_features_sparse.append(dire_heroes)

    # Calculate synergy within Radiant and Dire teams
    radiant_synergy = 0
    dire_synergy = 0
    radiant_hero_ids = group[group['isRadiant']]['heroId'].values
    dire_hero_ids = group[~group['isRadiant']]['heroId'].values

    for i in range(len(radiant_hero_ids)):
        for j in range(i + 1, len(radiant_hero_ids)):
            synergy = immortal_with_all_winrates[(immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[i]) &
                                                 (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[j])]['WinRate']
            if not synergy.empty:
                radiant_synergy += synergy.values[0]

    for i in range(len(dire_hero_ids)):
        for j in range(i + 1, len(dire_hero_ids)):
            synergy = immortal_with_all_winrates[(immortal_with_all_winrates['HeroId1'] == dire_hero_ids[i]) &
                                                 (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[j])]['WinRate']
            if not synergy.empty:
                dire_synergy += synergy.values[0]

    # Calculate counter synergy between Radiant and Dire teams
    counter_synergy = 0
    for radiant_hero in radiant_hero_ids:
        for dire_hero in dire_hero_ids:
            counter = immortal_vs_all_winrates[(immortal_vs_all_winrates['HeroId1'] == radiant_hero) &
                                               (immortal_vs_all_winrates['HeroId2'] == dire_hero)]['WinRate']
            if not counter.empty:
                counter_synergy += counter.values[0]

    synergy_features.append(radiant_synergy - dire_synergy)  # Net synergy
    counter_synergy_features.append(counter_synergy)

# Convert hero features and synergy features to DataFrames
radiant_df = pd.DataFrame(radiant_features_sparse, columns=[f'radiant_hero_{i}' for i in range(1, 139)])
dire_df = pd.DataFrame(dire_features_sparse, columns=[f'dire_hero_{i}' for i in range(1, 139)])
synergy_df = pd.DataFrame({
    'match_id': list(match_index.keys()),
    'synergy': synergy_features,
    'counter_synergy': counter_synergy_features
})

# Combine Radiant and Dire features with synergy features
final_df = pd.concat([radiant_df, dire_df], axis=1)
final_df['match_id'] = list(match_index.keys())
final_df = final_df.merge(synergy_df, on='match_id')

# Add the target variable
target = match_data.groupby('match_id').first()['didRadiantWin'].reset_index()
final_df = final_df.merge(target[['match_id', 'didRadiantWin']], on='match_id')

# Separate features and target variable
X = final_df.drop(columns=['didRadiantWin', 'match_id'])
y = final_df['didRadiantWin']

# Define the model
model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000)

# Define the hyperparameter grid, including train-test split ratios
param_grid = {
    'C': [  0.1, 1.0, 10.0 ],  # Broad range of C values
    'l1_ratio': [ 0.25, 0.5, 1.0],   # Different L1 ratio values
    'split_ratio': [ 0.6, 0.65, 0.7]        # Train-test split ratios
}

# Custom train-test split function to be used in GridSearchCV
class CustomTrainTestSplit:
    def __init__(self, split_ratio):
        self.split_ratio = split_ratio

    def split(self, X, y):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1-self.split_ratio, random_state=42)
        for train_index, test_index in sss.split(X, y):
            yield train_index, test_index

# Extend GridSearchCV to include the split ratio
def grid_search_with_split_ratio(X, y, param_grid, model):
    best_score = -np.inf
    best_params = None
    best_model = None
    best_split_ratio = None  # Track the best split ratio

    for split_ratio in param_grid['split_ratio']:
        splitter = CustomTrainTestSplit(split_ratio)
        for train_index, test_index in splitter.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            grid_search = GridSearchCV(estimator=model,
                                       param_grid={k: v for k, v in param_grid.items() if k != 'split_ratio'},
                                       cv=5,
                                       scoring='accuracy')
            grid_search.fit(X_train, y_train)

            if grid_search.best_score_ > best_score:
                best_score = grid_search.best_score_
                best_params = grid_search.best_params_
                best_model = grid_search.best_estimator_
                best_split_ratio = split_ratio  # Store the best split ratio

    return best_model, best_params, best_score, best_split_ratio

# Perform the grid search including train-test split ratios
best_model, best_params, best_score, best_split_ratio = grid_search_with_split_ratio(X, y, param_grid, model)

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score}")
print(f"Best Train-Test Split Ratio: {best_split_ratio}")  # Print the best split ratio

# Evaluate the best model on the test data
y_pred_test = best_model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(classification_report(y_test, y_pred_test))



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Assuming you've already done the preprocessing and have X and y
# Initialize the model with the best parameters
final_model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000, C=1.0, l1_ratio=1.0)

# Use the best Train-Test Split Ratio
split_ratio = 0.65
sss = StratifiedShuffleSplit(n_splits=1, test_size=1-split_ratio, random_state=42)

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Train the final model on the training data
final_model.fit(X_train, y_train)

# Evaluate the final model on the test data
y_pred_test = final_model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(classification_report(y_test, y_pred_test))

# Save the model using joblib
joblib.dump(final_model, '/content/drive/MyDrive/final_logistic_model.pkl')

print("Model saved successfully.")


Test Accuracy: 0.6635741307872456
              precision    recall  f1-score   support

       False       0.66      0.62      0.64      5362
        True       0.67      0.70      0.68      5740

    accuracy                           0.66     11102
   macro avg       0.66      0.66      0.66     11102
weighted avg       0.66      0.66      0.66     11102

Model saved successfully.


In [None]:
import numpy as np

# Assuming `model` is your trained model and you have loaded any required data (e.g., synergy data)

def prepare_input_vector(radiant_hero_ids, dire_hero_ids, immortal_with_all_winrates, immortal_vs_all_winrates):
    #  Initialize empty hero vectors
    radiant_heroes = np.zeros(138)
    dire_heroes = np.zeros(138)

    #  Update vectors based on input hero IDs
    for hero_id in radiant_hero_ids:
        radiant_heroes[hero_id - 1] = 1  # -1 for 0-based indexing

    for hero_id in dire_hero_ids:
        dire_heroes[hero_id - 1] = 1  # -1 for 0-based indexing

    #  Calculate synergy and counter synergy
    radiant_synergy = 0
    dire_synergy = 0
    counter_synergy = 0

    # Calculate Radiant synergy
    for i in range(len(radiant_hero_ids)):
        for j in range(i + 1, len(radiant_hero_ids)):
            synergy = immortal_with_all_winrates[(immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[i]) &
                                                 (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[j])]['WinRate']
            if not synergy.empty:
                radiant_synergy += synergy.values[0]

    # Calculate Dire synergy
    for i in range(len(dire_hero_ids)):
        for j in range(i + 1, len(dire_hero_ids)):
            synergy = immortal_with_all_winrates[(immortal_with_all_winrates['HeroId1'] == dire_hero_ids[i]) &
                                                 (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[j])]['WinRate']
            if not synergy.empty:
                dire_synergy += synergy.values[0]

    # Calculate counter synergy
    for radiant_hero in radiant_hero_ids:
        for dire_hero in dire_hero_ids:
            counter = immortal_vs_all_winrates[(immortal_vs_all_winrates['HeroId1'] == radiant_hero) &
                                               (immortal_vs_all_winrates['HeroId2'] == dire_hero)]['WinRate']
            if not counter.empty:
                counter_synergy += counter.values[0]

    # Net synergy
    net_synergy = radiant_synergy - dire_synergy

    #  Combine all features into a single input vector
    input_vector = np.concatenate((radiant_heroes, dire_heroes, [net_synergy, counter_synergy]))

    return input_vector

# Example usage:
radiant_hero_ids = [1, 2, 3, 4, 5]  # Example Radiant hero IDs input by the user
dire_hero_ids = [6, 7, 8, 9, 10]    # Example Dire hero IDs input by the user

# Prepare the input vector
input_vector = prepare_input_vector(radiant_hero_ids, dire_hero_ids, immortal_with_all_winrates, immortal_vs_all_winrates)

# Reshape the input vector to match the model's expected input shape
input_vector = input_vector.reshape(1, -1)

# Step 5: Predict the outcome using the trained model
prediction = model.predict(input_vector)
predicted_probability = model.predict_proba(input_vector)

# Print the results
print(f"Prediction (1 = Radiant win, 0 = Dire win): {prediction[0]}")
print(f"Probability of Radiant win: {predicted_probability[0][1]:.2f}")



Prediction (1 = Radiant win, 0 = Dire win): True
Probability of Radiant win: 0.72




In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from google.colab import drive
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Mount Google Drive
drive.mount('/content/drive')

# Load data from Google Drive
match_data = pd.read_csv('/content/drive/MyDrive/match_data.csv')
immortal_vs_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_vs_all_winrates.csv')
immortal_with_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_with_all_winrates.csv')

# Filter matches with duration < 15 minutes and remove matches with leavers
match_data = match_data[(match_data['durationSeconds'] >= 900) &
                        (match_data['leaverStatus'] == 'NONE')]

# Initialize lists for storing features
radiant_features_sparse = []
dire_features_sparse = []
match_index = {}
synergy_features = []
counter_synergy_features = []

# Initialize counters for missing pairings
missing_synergy_pairings = []
missing_counter_synergy_pairings = []

# Iterate through each match
for idx, (match_id, group) in enumerate(match_data.groupby('match_id')):
    match_index[match_id] = idx

    # Initialize feature vectors for Radiant and Dire teams
    radiant_heroes = np.zeros(138)
    dire_heroes = np.zeros(138)

    # Separate Radiant and Dire heroes
    for _, row in group.iterrows():
        if row['isRadiant']:
            radiant_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing
        else:
            dire_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing

    # Append the features to the list
    radiant_features_sparse.append(radiant_heroes)
    dire_features_sparse.append(dire_heroes)

    # Calculate synergy within Radiant and Dire teams
    radiant_synergy = 0
    dire_synergy = 0
    radiant_hero_ids = group[group['isRadiant']]['heroId'].values
    dire_hero_ids = group[~group['isRadiant']]['heroId'].values

    for i in range(len(radiant_hero_ids)):
        for j in range(i + 1, len(radiant_hero_ids)):
            synergy = immortal_with_all_winrates[
                (immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[i]) &
                (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[j])
            ]['WinRate']

            if synergy.empty:
                # Check the reversed pairing
                synergy = immortal_with_all_winrates[
                    (immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[j]) &
                    (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[i])
                ]['WinRate']

            if synergy.empty:
                missing_synergy_pairings.append((radiant_hero_ids[i], radiant_hero_ids[j]))
            else:
                radiant_synergy += synergy.values[0]

    for i in range(len(dire_hero_ids)):
        for j in range(i + 1, len(dire_hero_ids)):
            synergy = immortal_with_all_winrates[
                (immortal_with_all_winrates['HeroId1'] == dire_hero_ids[i]) &
                (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[j])
            ]['WinRate']

            if synergy.empty:
                # Check the reversed pairing
                synergy = immortal_with_all_winrates[
                    (immortal_with_all_winrates['HeroId1'] == dire_hero_ids[j]) &
                    (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[i])
                ]['WinRate']

            if synergy.empty:
                missing_synergy_pairings.append((dire_hero_ids[i], dire_hero_ids[j]))
            else:
                dire_synergy += synergy.values[0]

    # Calculate counter synergy between Radiant and Dire teams
    counter_synergy = 0
    for radiant_hero in radiant_hero_ids:
        for dire_hero in dire_hero_ids:
            counter = immortal_vs_all_winrates[
                (immortal_vs_all_winrates['HeroId1'] == radiant_hero) &
                (immortal_vs_all_winrates['HeroId2'] == dire_hero)
            ]['WinRate']

            if counter.empty:
                # Check the reversed pairing
                counter = immortal_vs_all_winrates[
                    (immortal_vs_all_winrates['HeroId1'] == dire_hero) &
                    (immortal_vs_all_winrates['HeroId2'] == radiant_hero)
                ]['WinRate']

                if not counter.empty:
                    counter_synergy += 100 - counter.values[0]  # Complement the win rate

            if counter.empty:
                missing_counter_synergy_pairings.append((radiant_hero, dire_hero))
            else:
                counter_synergy += counter.values[0]

    synergy_features.append(radiant_synergy - dire_synergy)  # Net synergy
    counter_synergy_features.append(counter_synergy)


# Convert hero features and synergy features to DataFrames
radiant_df = pd.DataFrame(radiant_features_sparse, columns=[f'radiant_hero_{i}' for i in range(1, 139)])
dire_df = pd.DataFrame(dire_features_sparse, columns=[f'dire_hero_{i}' for i in range(1, 139)])
synergy_df = pd.DataFrame({
    'match_id': list(match_index.keys()),
    'synergy': synergy_features,
    'counter_synergy': counter_synergy_features
})

# Combine Radiant and Dire features with synergy features
final_df = pd.concat([radiant_df, dire_df], axis=1)
final_df['match_id'] = list(match_index.keys())
final_df = final_df.merge(synergy_df, on='match_id')

# Add the target variable
target = match_data.groupby('match_id').first()['didRadiantWin'].reset_index()
final_df = final_df.merge(target[['match_id', 'didRadiantWin']], on='match_id')

# Separate features and target variable
X = final_df.drop(columns=['didRadiantWin', 'match_id'])
y = final_df['didRadiantWin']

# Define the model with the best parameters
model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000, C=1.0, l1_ratio=1.0)

# Best split ratio from the grid search
split_ratio = 0.65

# Split the data using the best split ratio
sss = StratifiedShuffleSplit(n_splits=1, test_size=1-split_ratio, random_state=42)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Train the model on the best split
model.fit(X_train, y_train)

# Evaluate the model on the test data
y_pred_test = model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(classification_report(y_test, y_pred_test))
# At the end of your processing, you can print or log the missing pairings
print(f"Missing synergy pairings: {len(missing_synergy_pairings)} pairs")
print(f"Missing counter-synergy pairings: {len(missing_counter_synergy_pairings)} pairs")
# Save the trained model
model_filename = 'dota2_model.pkl'
joblib.dump(model,'/content/drive/MyDrive/dota2_mode.pkl')

print(f"Model saved to {model_filename}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Test Accuracy: 0.6569987389659521
              precision    recall  f1-score   support

       False       0.67      0.57      0.62      5362
        True       0.65      0.74      0.69      5740

    accuracy                           0.66     11102
   macro avg       0.66      0.65      0.65     11102
weighted avg       0.66      0.66      0.65     11102

Missing synergy pairings: 0 pairs
Missing counter-synergy pairings: 0 pairs
Model saved to dota2_model.pkl




In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from google.colab import drive
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive
drive.mount('/content/drive')

# Load data from Google Drive
match_data = pd.read_csv('/content/drive/MyDrive/match_data.csv')
immortal_vs_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_vs_all_winrates.csv')
immortal_with_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_with_all_winrates.csv')

# Filter matches with duration < 15 minutes and remove matches with leavers
match_data = match_data[(match_data['durationSeconds'] >= 900) &
                        (match_data['leaverStatus'] == 'NONE')]

# Initialize lists for storing features
radiant_features_sparse = []
dire_features_sparse = []
match_index = {}
synergy_features = []
counter_synergy_features = []

# Initialize counters for missing pairings
missing_synergy_pairings = []
missing_counter_synergy_pairings = []

# Iterate through each match
for idx, (match_id, group) in enumerate(match_data.groupby('match_id')):
    match_index[match_id] = idx

    # Initialize feature vectors for Radiant and Dire teams
    radiant_heroes = np.zeros(138)
    dire_heroes = np.zeros(138)

    # Separate Radiant and Dire heroes
    for _, row in group.iterrows():
        if row['isRadiant']:
            radiant_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing
        else:
            dire_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing

    # Append the features to the list
    radiant_features_sparse.append(radiant_heroes)
    dire_features_sparse.append(dire_heroes)

    # Calculate synergy within Radiant and Dire teams
    radiant_synergy = 0
    dire_synergy = 0
    radiant_hero_ids = group[group['isRadiant']]['heroId'].values
    dire_hero_ids = group[~group['isRadiant']]['heroId'].values

    for i in range(len(radiant_hero_ids)):
        for j in range(i + 1, len(radiant_hero_ids)):
            synergy = immortal_with_all_winrates[
                (immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[i]) &
                (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[j])
            ]['WinRate']

            if synergy.empty:
                # Check the reversed pairing
                synergy = immortal_with_all_winrates[
                    (immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[j]) &
                    (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[i])
                ]['WinRate']

            if synergy.empty:
                missing_synergy_pairings.append((radiant_hero_ids[i], radiant_hero_ids[j]))
            else:
                radiant_synergy += synergy.values[0]

    for i in range(len(dire_hero_ids)):
        for j in range(i + 1, len(dire_hero_ids)):
            synergy = immortal_with_all_winrates[
                (immortal_with_all_winrates['HeroId1'] == dire_hero_ids[i]) &
                (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[j])
            ]['WinRate']

            if synergy.empty:
                # Check the reversed pairing
                synergy = immortal_with_all_winrates[
                    (immortal_with_all_winrates['HeroId1'] == dire_hero_ids[j]) &
                    (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[i])
                ]['WinRate']

            if synergy.empty:
                missing_synergy_pairings.append((dire_hero_ids[i], dire_hero_ids[j]))
            else:
                dire_synergy += synergy.values[0]

    # Calculate counter synergy between Radiant and Dire teams
    counter_synergy = 0
    for radiant_hero in radiant_hero_ids:
        for dire_hero in dire_hero_ids:
            counter = immortal_vs_all_winrates[
                (immortal_vs_all_winrates['HeroId1'] == radiant_hero) &
                (immortal_vs_all_winrates['HeroId2'] == dire_hero)
            ]['WinRate']

            if counter.empty:
                # Check the reversed pairing
                counter = immortal_vs_all_winrates[
                    (immortal_vs_all_winrates['HeroId1'] == dire_hero) &
                    (immortal_vs_all_winrates['HeroId2'] == radiant_hero)
                ]['WinRate']

                if not counter.empty:
                    counter_synergy += 100 - counter.values[0]  # Complement the win rate

            if counter.empty:
                missing_counter_synergy_pairings.append((radiant_hero, dire_hero))
            else:
                counter_synergy += counter.values[0]

    synergy_features.append(radiant_synergy - dire_synergy)  # Net synergy
    counter_synergy_features.append(counter_synergy)

# Convert hero features and synergy features to DataFrames
radiant_df = pd.DataFrame(radiant_features_sparse, columns=[f'radiant_hero_{i}' for i in range(1, 139)])
dire_df = pd.DataFrame(dire_features_sparse, columns=[f'dire_hero_{i}' for i in range(1, 139)])
synergy_df = pd.DataFrame({
    'match_id': list(match_index.keys()),
    'synergy': synergy_features,
    'counter_synergy': counter_synergy_features
})

# Combine Radiant and Dire features with synergy features
final_df = pd.concat([radiant_df, dire_df], axis=1)
final_df['match_id'] = list(match_index.keys())
final_df = final_df.merge(synergy_df, on='match_id')

# Add the target variable
target = match_data.groupby('match_id').first()['didRadiantWin'].reset_index()
final_df = final_df.merge(target[['match_id', 'didRadiantWin']], on='match_id')

# Separate features and target variable
X = final_df.drop(columns=['didRadiantWin', 'match_id'])
y = final_df['didRadiantWin']

# Define the model
model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000)

# Define the hyperparameter grid, including train-test split ratios
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear'], 'gamma': [0.001, 0.0001], }
 ]

# Custom train-test split function to be used in GridSearchCV
class CustomTrainTestSplit:
    def __init__(self, split_ratio):
        self.split_ratio = split_ratio

    def split(self, X, y):
        sss = StratifiedShuffleSplit(n_splits=1, test_size=1-self.split_ratio, random_state=42)
        for train_index, test_index in sss.split(X, y):
            yield train_index, test_index

# Extend GridSearchCV to include the split ratio
def grid_search_with_split_ratio(X, y, param_grid, model):
    best_score = -np.inf
    best_params = None
    best_model = None

    for split_ratio in param_grid['split_ratio']:
        splitter = CustomTrainTestSplit(split_ratio)
        for train_index, test_index in splitter.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            grid_search = GridSearchCV(estimator=model,
                                       param_grid={k: v for k, v in param_grid.items() if k != 'split_ratio'},
                                       cv=5,
                                       scoring='accuracy')
            grid_search.fit(X_train, y_train)

            if grid_search.best_score_ > best_score:
                best_score = grid_search.best_score_
                best_params = grid_search.best_params_
                best_model = grid_search.best_estimator_

    return best_model, best_params, best_score

# Perform the grid search including train-test split ratios
best_model, best_params, best_score = grid_search_with_split_ratio(X, y, param_grid, model)

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score}")

# Evaluate the best model on the test data
y_pred_test = best_model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}")
print(classification_report(y_test, y_pred_test))


Mounted at /content/drive


TypeError: list indices must be integers or slices, not str

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import joblib

# Mount Google Drive
drive.mount('/content/drive')

# Load data from Google Drive
match_data = pd.read_csv('/content/drive/MyDrive/match_data.csv')
immortal_vs_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_vs_all_winrates.csv')
immortal_with_all_winrates = pd.read_csv('/content/drive/MyDrive/immortal_with_all_winrates.csv')

# Filter matches with duration < 15 minutes and remove matches with leavers
match_data = match_data[(match_data['durationSeconds'] >= 900) &
                        (match_data['leaverStatus'] == 'NONE')]

# Initialize lists for storing features
radiant_features_sparse = []
dire_features_sparse = []
match_index = {}
synergy_features = []
counter_synergy_features = []

# Initialize counters for missing pairings
missing_synergy_pairings = []
missing_counter_synergy_pairings = []

# Iterate through each match
for idx, (match_id, group) in enumerate(match_data.groupby('match_id')):
    match_index[match_id] = idx

    # Initialize feature vectors for Radiant and Dire teams
    radiant_heroes = np.zeros(138)
    dire_heroes = np.zeros(138)

    # Separate Radiant and Dire heroes
    for _, row in group.iterrows():
        if row['isRadiant']:
            radiant_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing
        else:
            dire_heroes[row['heroId'] - 1] = 1  # HeroId adjusted for 0-based indexing

    # Append the features to the list
    radiant_features_sparse.append(radiant_heroes)
    dire_features_sparse.append(dire_heroes)

    # Calculate synergy within Radiant and Dire teams
    radiant_synergy = 0
    dire_synergy = 0
    radiant_hero_ids = group[group['isRadiant']]['heroId'].values
    dire_hero_ids = group[~group['isRadiant']]['heroId'].values

    for i in range(len(radiant_hero_ids)):
        for j in range(i + 1, len(radiant_hero_ids)):
            synergy = immortal_with_all_winrates[
                (immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[i]) &
                (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[j])
            ]['WinRate']

            if synergy.empty:
                # Check the reversed pairing
                synergy = immortal_with_all_winrates[
                    (immortal_with_all_winrates['HeroId1'] == radiant_hero_ids[j]) &
                    (immortal_with_all_winrates['HeroId2'] == radiant_hero_ids[i])
                ]['WinRate']

            if synergy.empty:
                missing_synergy_pairings.append((radiant_hero_ids[i], radiant_hero_ids[j]))
            else:
                radiant_synergy += synergy.values[0]

    for i in range(len(dire_hero_ids)):
        for j in range(i + 1, len(dire_hero_ids)):
            synergy = immortal_with_all_winrates[
                (immortal_with_all_winrates['HeroId1'] == dire_hero_ids[i]) &
                (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[j])
            ]['WinRate']

            if synergy.empty:
                # Check the reversed pairing
                synergy = immortal_with_all_winrates[
                    (immortal_with_all_winrates['HeroId1'] == dire_hero_ids[j]) &
                    (immortal_with_all_winrates['HeroId2'] == dire_hero_ids[i])
                ]['WinRate']

            if synergy.empty:
                missing_synergy_pairings.append((dire_hero_ids[i], dire_hero_ids[j]))
            else:
                dire_synergy += synergy.values[0]

    # Calculate counter synergy between Radiant and Dire teams
    counter_synergy = 0
    for radiant_hero in radiant_hero_ids:
        for dire_hero in dire_hero_ids:
            counter = immortal_vs_all_winrates[
                (immortal_vs_all_winrates['HeroId1'] == radiant_hero) &
                (immortal_vs_all_winrates['HeroId2'] == dire_hero)
            ]['WinRate']

            if counter.empty:
                # Check the reversed pairing
                counter = immortal_vs_all_winrates[
                    (immortal_vs_all_winrates['HeroId1'] == dire_hero) &
                    (immortal_vs_all_winrates['HeroId2'] == radiant_hero)
                ]['WinRate']

                if not counter.empty:
                    counter_synergy += 100 - counter.values[0]

            if counter.empty:
                missing_counter_synergy_pairings.append((radiant_hero, dire_hero))
            else:
                counter_synergy += counter.values[0]

    synergy_features.append(radiant_synergy - dire_synergy)
    counter_synergy_features.append(counter_synergy)

# Convert hero features and synergy features to DataFrames
radiant_df = pd.DataFrame(radiant_features_sparse, columns=[f'radiant_hero_{i}' for i in range(1, 139)])
dire_df = pd.DataFrame(dire_features_sparse, columns=[f'dire_hero_{i}' for i in range(1, 139)])
synergy_df = pd.DataFrame({
    'match_id': list(match_index.keys()),
    'synergy': synergy_features,
    'counter_synergy': counter_synergy_features
})

# Combine Radiant and Dire features with synergy features
final_df = pd.concat([radiant_df, dire_df], axis=1)
final_df['match_id'] = list(match_index.keys())
final_df = final_df.merge(synergy_df, on='match_id')

# Add the target variable
target = match_data.groupby('match_id').first()['didRadiantWin'].reset_index()
final_df = final_df.merge(target[['match_id', 'didRadiantWin']], on='match_id')

# Separate features and target variable
X = final_df.drop(columns=['didRadiantWin', 'match_id'])
y = final_df['didRadiantWin']

# Standardize the synergy and counter-synergy features
scaler = StandardScaler()
X[['synergy', 'counter_synergy']] = scaler.fit_transform(X[['synergy', 'counter_synergy']])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Define the model
model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000 , l1_ratio=0.5)

# Train the logistic regression model
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f'Training Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')
print(classification_report(y_test, y_pred_test))

# Save the model
model_filename = 'lastmodel.pkl'
joblib.dump(model, '/content/drive/MyDrive/lastmodel.pkl')
print(f"Model saved to {model_filename}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training Accuracy: 0.6922293364377182
Test Accuracy: 0.6612322104125383
              precision    recall  f1-score   support

       False       0.66      0.63      0.65      5420
        True       0.66      0.69      0.68      5682

    accuracy                           0.66     11102
   macro avg       0.66      0.66      0.66     11102
weighted avg       0.66      0.66      0.66     11102

Model saved to lastmodel.pkl


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = {
    'penalty': ['l1', 'l2'],  # Types of regularization
    'C': [0.01, 0.1, 1, 10],           # Inverse of regularization strength
    'l1_ratio': [0.0, 0.5, 1.0],  # Only used if penalty is 'elasticnet'
    'solver': ['saga'],                     # 'saga' is needed for l1 or elasticnet
    'max_iter': [1000]                      # Number of iterations
}
# Initialize the model
model = LogisticRegression()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Get the best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred_train_best = best_model.predict(X_train)
y_pred_test_best = best_model.predict(X_test)

# Evaluate the model
train_accuracy_best = accuracy_score(y_train, y_pred_train_best)
test_accuracy_best = accuracy_score(y_test, y_pred_test_best)

print(f'Training Accuracy (Best Model): {train_accuracy_best}')
print(f'Test Accuracy (Best Model): {test_accuracy_best}')
print(classification_report(y_test, y_pred_test_best))


Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best parameters: {'C': 0.01, 'l1_ratio': 0.0, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Training Accuracy (Best Model): 0.6835952658129608
Test Accuracy (Best Model): 0.6634840569266799
              precision    recall  f1-score   support

       False       0.67      0.62      0.64      5420
        True       0.66      0.71      0.68      5682

    accuracy                           0.66     11102
   macro avg       0.66      0.66      0.66     11102
weighted avg       0.66      0.66      0.66     11102

