**LOGISIC REGRESSION**

Let's begin by importing necessary libaries

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

Let's load the data and take a look at the beautiful X matrix

In [15]:
df = pd.read_csv('masterfile done.csv')

X = df.iloc[:, 1:-2]
y = df['future result']
X

Unnamed: 0,team kpm,dragons,barons,towers,dpm,vspm,earned gpm,monsterkills_pm,cspm,goldat15
0,0.076987,0.028571,-0.090476,-0.695238,-134.718128,-0.185134,-27.306101,-0.968975,-1.398766,-872.666667
1,0.046773,0.000000,-0.200000,-1.266667,-157.622987,-0.214460,-59.177227,-1.279145,-1.704927,-851.666667
2,-0.003039,-0.025000,-0.295833,-2.016667,-211.525307,-0.393220,-94.854949,-1.253363,-1.796630,-1027.604167
3,-0.019650,0.000000,-1.000000,2.000000,705.006500,5.300050,93.700150,2.777322,3.850050,154.000000
4,0.479667,1.000000,1.000000,6.333333,187.260467,1.957733,441.377333,3.706456,3.117767,1349.333333
...,...,...,...,...,...,...,...,...,...,...
1451,0.004700,-1.500000,-0.500000,-2.500000,302.183200,-0.090650,-32.767300,0.195975,-1.532400,11.500000
1452,0.701100,5.500000,2.000000,14.500000,1698.845950,5.274850,900.492850,10.928177,9.999650,7614.500000
1453,-0.035333,-2.333333,-0.333333,-1.333333,-126.338950,-1.799233,-85.683450,-0.548742,-1.591583,273.833333
1454,-0.124610,-3.523810,-0.285714,-0.190476,-556.476652,-1.759019,-121.471481,-0.745499,-2.804424,-222.809524


Then, we need to divide the data into the training and test sets. Adittionaly I used the StandardScaler function to boost the performance of the regression.

The fit_transform and transform methods are used to make sure that there is no data leakage

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify= y, random_state = 1)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Now, I set up the cross-validation and hyper parameter tuning functions

In [17]:
kf = KFold(n_splits=10, shuffle=True, random_state=1)
param_grid = {'C': np.arange(0.01, 2, 200),
            'penalty': ['l1', 'l2', 'elasticnet', 'none'],
            'solver': ['liblinear', 'lbfgs', 'saga'],
            'fit_intercept': [True, False]}

The rest is just fitting the model

In [18]:
logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, param_grid=param_grid, cv=kf)
logreg_cv.fit(X_train_scaled, y_train)
score = logreg_cv.best_score_
print("Accuracy: ", score)
print("Best hyperparameters: ", logreg_cv.best_params_)

Accuracy:  0.676959917477159
Best hyperparameters:  {'C': np.float64(0.01), 'fit_intercept': True, 'penalty': 'l1', 'solver': 'liblinear'}


140 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Jakub\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Jakub\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Jakub\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(s

Turns out that the model achives quite high accuracy of about 68% with the the best possible combination of hyperparameters. The errors above are just the result of testing every solver with every type of penalty. The lbfgs solver doesn't support l2 type of regularisation

Now let's test if by throwing out some predictor the model's performance can be increased

In [19]:
def powerset(s):
    """funcion that spits out every subset of a given set"""
    all_sets = []
    x = len(s)
    for i in range(1 << x):
        #print([s[j] for j in range(x) if (i & (1 << j))])
        subsets = [s[j] for j in range(x) if (i & (1 << j))]
        all_sets.append(subsets)
    return(all_sets)   

In [None]:
sets = powerset([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
list = []
df = pd.read_csv('masterfile done.csv')
for i in range(1, (len(sets))):
    X = df.iloc[:, sets[i]]
    y = df['future result']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify= y, random_state = 1)
    kf = KFold(n_splits=10, shuffle=True, random_state=1)
    param_grid = {'C': np.arange(0.001, 2, 200),
            'penalty': ['l1', 'l2', 'elasticnet', 'none'],
            'solver': ['liblinear', 'lbfgs', 'saga'],
            'fit_intercept': [True, False]}
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    logreg = LogisticRegression()
    logreg_cv = GridSearchCV(logreg, param_grid=param_grid, cv=kf)
    logreg_cv.fit(X_train_scaled, y_train)
    score = logreg_cv.best_score_
    list.append([score, sets[i], logreg_cv.best_params_])
df_r = pd.DataFrame(list)
df_r.to_csv('logreg_result_done.csv')