# importing a library https://archive.ics.uci.edu/dataset/367/dota2+games+results

In [119]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# importing a first dataset

In [74]:
df = pd.read_excel('dota_test.xlsx',engine='openpyxl')

In [75]:
columns = df.columns

In [76]:
list_col = ['Who_won', 'cluster_id', 'game_mode', 'game_type']
for col in  range(1, 114):
    list_col.append(f'Heroes{col}')
df.columns = list_col
df = df.drop(columns=['cluster_id']) 
df

Unnamed: 0,Who_won,game_mode,game_type,Heroes1,Heroes2,Heroes3,Heroes4,Heroes5,Heroes6,Heroes7,...,Heroes104,Heroes105,Heroes106,Heroes107,Heroes108,Heroes109,Heroes110,Heroes111,Heroes112,Heroes113
0,-1,8,2,0,-1,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,1,8,2,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
2,-1,2,2,1,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,2,2,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,3,0,0,0,-1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10289,1,2,2,0,0,0,0,0,0,1,...,0,-1,0,0,0,0,0,0,0,0
10290,1,9,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10291,1,9,2,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10292,1,2,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# CV f1 + accuracy_score and f1_score

In [87]:
def f1(method, x, y, grid_params):
    grid_acc = GridSearchCV(method, grid_params, cv=5, scoring='f1', return_train_score=False, verbose=1)
    grid_acc.fit(x, y)
    best_estimator_ = grid_acc.best_estimator_
    best_estimator_.fit(x,y)
    itog = {
        'best_params': grid_acc.best_params_,
        'accuracy score' : accuracy_score(y, best_estimator_.predict(x)),
        'f1 score' : sklearn.metrics.f1_score(y, best_estimator_.predict(x))
    }
    return itog

# CV accuracy + accuracy_score and f1_score

In [88]:
def f_acc(method, x, y, grid_params):
    grid_acc = GridSearchCV(method, grid_params, cv=5, scoring='accuracy', return_train_score=False, verbose=1)
    grid_acc.fit(x, y)
    best_estimator_ = grid_acc.best_estimator_
    best_estimator_.fit(x,y)
    itog = {
        'best_params': grid_acc.best_params_,
        'accuracy score' : accuracy_score(y, best_estimator_.predict(x)),
        'f1 score' : sklearn.metrics.f1_score(y, best_estimator_.predict(x))
    }
    return itog

# KNN

In [89]:
f_acc(KNeighborsClassifier(),df.drop('Who_won', axis=1), df['Who_won'], {'n_neighbors': [20, 100, 300, 1000, 1500, 2000]} )


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.6min finished


{'best_params': {'n_neighbors': 100},
 'accuracy score': 0.584903827472314,
 'f1 score': 0.6615981626673001}

In [90]:
f1(KNeighborsClassifier(),df.drop('Who_won', axis=1), df['Who_won'], {'n_neighbors': [20, 100, 300, 1000, 1500, 2000]} )

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.6min finished


{'best_params': {'n_neighbors': 2000},
 'accuracy score': 0.5441033611812707,
 'f1 score': 0.6968150397312488}

# RandomForest

In [91]:
f_acc(
    RandomForestClassifier(),
    df.drop('Who_won', axis=1), 
    df['Who_won'],
    {'n_estimators':[10, 50, 100, 200], 'max_depth':[1, 2, 3,  5], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.5min finished


{'best_params': {'max_depth': 5, 'max_leaf_nodes': 10, 'n_estimators': 10},
 'accuracy score': 0.5539149018845929,
 'f1 score': 0.6961355214399154}

In [92]:
f1(
    RandomForestClassifier(),
    df.drop('Who_won', axis=1), 
    df['Who_won'],
    {'n_estimators':[10, 50, 100, 200], 'max_depth':[1, 2, 3,  5], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.5min finished


{'best_params': {'max_depth': 3, 'max_leaf_nodes': None, 'n_estimators': 200},
 'accuracy score': 0.5370118515640179,
 'f1 score': 0.6976272046694582}

# LogisticRegression

In [102]:
LogisticRegression().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [111]:
f_acc(
    LogisticRegression(),
    df.drop('Who_won', axis=1), 
    df['Who_won'],
    {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.00001, 10],'solver':['liblinear']}
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.7s finished


{'best_params': {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'},
 'accuracy score': 0.6094812512142996,
 'f1 score': 0.6561163387510692}

In [112]:
f1(
    LogisticRegression(),
    df.drop('Who_won', axis=1), 
    df['Who_won'],
    {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.00001, 10],'solver':['liblinear']}
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.8s finished


{'best_params': {'C': 1e-05, 'penalty': 'l2', 'solver': 'liblinear'},
 'accuracy score': 0.5344861084126675,
 'f1 score': 0.6966320587490503}

# DecisionTree

In [117]:
f_acc(
    DecisionTreeClassifier(),
    df.drop('Who_won', axis=1), 
    df['Who_won'],
    {'max_depth':[ 3,  5, 7, 10], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.9s finished


{'best_params': {'max_depth': 10, 'max_leaf_nodes': 10},
 'accuracy score': 0.5667379055760637,
 'f1 score': 0.6662675845555223}

In [118]:
f1(
    DecisionTreeClassifier(),
    df.drop('Who_won', axis=1), 
    df['Who_won'],
    {'max_depth':[ 3,  5, 7, 10], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.9s finished


{'best_params': {'max_depth': 3, 'max_leaf_nodes': 2},
 'accuracy score': 0.5344861084126675,
 'f1 score': 0.6966320587490503}

# importing a second dataset https://www.kaggle.com/datasets/ulrikthygepedersen/speed-dating

In [162]:
df2 = pd.read_csv('speeddating.csv', sep=',', usecols=['gender','age','age_o','d_age','pref_o_intelligence','pref_o_funny','pref_o_ambitious','pref_o_shared_interests','intelligence_o','funny_o','ambitous_o','shared_interests_o','like','guess_prob_liked', 'met','match'])
df2['gender'] = np.where(df2['gender'] == "b'female'", 0, np.where(df2['gender'] == "b'male'", 1, -1))
df2['match'] = np.where(df2['match'] == "b'0'", 0, np.where(df2['match'] == "b'1'", 1, -1))
df2 = df2.fillna(0)
df2

Unnamed: 0,gender,age,age_o,d_age,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,intelligence_o,funny_o,ambitous_o,shared_interests_o,like,guess_prob_liked,met,match
0,0,21.0,27.0,6.0,20.0,20.0,0.0,5.0,8.0,8.0,8.0,6.0,7.0,6.0,0.0,0
1,0,21.0,22.0,1.0,0.0,40.0,0.0,0.0,10.0,7.0,7.0,5.0,7.0,5.0,1.0,0
2,0,21.0,22.0,1.0,19.0,18.0,14.0,12.0,10.0,10.0,10.0,10.0,7.0,0.0,1.0,1
3,0,21.0,23.0,2.0,15.0,40.0,5.0,5.0,9.0,8.0,9.0,8.0,7.0,6.0,0.0,1
4,0,21.0,24.0,3.0,20.0,10.0,10.0,20.0,9.0,6.0,9.0,7.0,6.0,6.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8373,1,25.0,26.0,1.0,30.0,20.0,10.0,15.0,3.0,2.0,6.0,5.0,2.0,5.0,0.0,0
8374,1,25.0,24.0,1.0,10.0,5.0,10.0,5.0,7.0,3.0,7.0,2.0,4.0,4.0,0.0,0
8375,1,25.0,29.0,4.0,30.0,10.0,10.0,0.0,2.0,2.0,2.0,1.0,6.0,5.0,0.0,0
8376,1,25.0,22.0,3.0,25.0,10.0,10.0,20.0,5.0,5.0,3.0,6.0,5.0,5.0,0.0,0


# Knn

In [152]:
f_acc(KNeighborsClassifier(),df2.drop('match', axis=1), df2['match'], {'n_neighbors': [20, 100, 300, 1000, 1500, 2000]} )

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   17.2s finished


{'best_params': {'n_neighbors': 100},
 'accuracy score': 0.8363571258056816,
 'f1 score': 0.020014295925661185}

In [155]:
f1(KNeighborsClassifier(),df2.drop('match', axis=1), df2['match'], {'n_neighbors': [20, 100, 300, 1000, 1500, 2000]} )

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   17.2s finished


{'best_params': {'n_neighbors': 20},
 'accuracy score': 0.8465027452852709,
 'f1 score': 0.16925064599483206}

# RandomForest

In [156]:
f_acc(
    RandomForestClassifier(),
    df2.drop('match', axis=1), df2['match'],
    {'n_estimators':[10, 50, 100, 200], 'max_depth':[1, 2, 3,  5], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.3min finished


{'best_params': {'max_depth': 5, 'max_leaf_nodes': None, 'n_estimators': 50},
 'accuracy score': 0.8438768202434949,
 'f1 score': 0.13147410358565736}

In [157]:
f1(
    RandomForestClassifier(),
    df2.drop('match', axis=1), df2['match'],
    {'n_estimators':[10, 50, 100, 200], 'max_depth':[1, 2, 3,  5], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  1.3min finished


{'best_params': {'max_depth': 5, 'max_leaf_nodes': None, 'n_estimators': 10},
 'accuracy score': 0.8493673907853903,
 'f1 score': 0.2209876543209877}

# LogisticRegression

In [102]:
LogisticRegression().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [158]:
f_acc(
    LogisticRegression(),
    df2.drop('match', axis=1), df2['match'],
    {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.00001, 10],'solver':['liblinear']}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.0s finished


{'best_params': {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'},
 'accuracy score': 0.8503222726187635,
 'f1 score': 0.3184782608695652}

In [159]:
f1(
    LogisticRegression(),
    df2.drop('match', axis=1), df2['match'],
    {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.00001, 10],'solver':['liblinear']}
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.1s finished


{'best_params': {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'},
 'accuracy score': 0.8499641919312485,
 'f1 score': 0.33667546174142476}

# DecisionTree

In [160]:
f_acc(
    DecisionTreeClassifier(),
    df2.drop('match', axis=1), df2['match'],
    {'max_depth':[ 3,  5, 7, 10], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished


{'best_params': {'max_depth': 5, 'max_leaf_nodes': 10},
 'accuracy score': 0.8476963475769873,
 'f1 score': 0.44084136722173534}

In [161]:
f1(
    DecisionTreeClassifier(),
    df2.drop('match', axis=1), df2['match'],
    {'max_depth':[ 3,  5, 7, 10], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.2s finished


{'best_params': {'max_depth': 10, 'max_leaf_nodes': None},
 'accuracy score': 0.9108379088087849,
 'f1 score': 0.6957230142566192}

# importing a second dataset https://www.kaggle.com/datasets/houcembenmansour/predict-diabetes-based-on-diagnostic-measures 

In [187]:
df3 = pd.read_csv('diabetes.csv', sep=',', usecols=['cholesterol','glucose','hdl_chol','chol_hdl_ratio','age','gender','height','weight','bmi','systolic_bp','diastolic_bp','waist','hip','diabetes'])
df3['gender'] = np.where(df3['gender'] == 'female', 0, np.where(df3['gender'] == 'male', 1, -1))
df3['diabetes'] = np.where(df3['diabetes'] == 'Diabetes', 1, np.where(df3['diabetes'] == 'No diabetes', 0, -1))
df3['chol_hdl_ratio'] = df3['chol_hdl_ratio'].str.replace(',', '.').astype(float)
df3['bmi'] = df3['bmi'].str.replace(',', '.').astype(float)
df3

Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,diabetes
0,193,77,49,3.9,19,0,61,119,22.5,118,70,32,38,0
1,146,79,41,3.6,19,0,60,135,26.4,108,58,33,40,0
2,217,75,54,4.0,20,0,67,187,29.3,110,72,40,45,0
3,226,97,70,3.2,20,0,64,114,19.6,122,64,31,39,0
4,164,91,67,2.4,20,0,70,141,20.2,122,86,32,39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,227,105,44,5.2,83,0,59,125,25.2,150,90,35,40,0
386,226,279,52,4.3,84,0,60,192,37.5,144,88,41,48,1
387,301,90,118,2.6,89,0,61,115,21.7,218,90,31,41,0
388,232,184,114,2.0,91,0,61,127,24.0,170,82,35,38,1


# Knn

In [189]:
f_acc(KNeighborsClassifier(),df3.drop('diabetes', axis=1), df3['diabetes'], {'n_neighbors': [10, 20, 30, 100, 150]} )

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished


{'best_params': {'n_neighbors': 20},
 'accuracy score': 0.9205128205128205,
 'f1 score': 0.6804123711340206}

In [190]:
f1(KNeighborsClassifier(),df3.drop('diabetes', axis=1), df3['diabetes'], {'n_neighbors': [10, 20, 30, 100, 150]} )

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.1s finished


{'best_params': {'n_neighbors': 20},
 'accuracy score': 0.9205128205128205,
 'f1 score': 0.6804123711340206}

# RandomForest

In [191]:
f_acc(
    RandomForestClassifier(),
    df3.drop('diabetes', axis=1), df3['diabetes'],
    {'n_estimators':[10, 50, 100, 200], 'max_depth':[1, 2, 3,  5], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   42.0s finished


{'best_params': {'max_depth': 2, 'max_leaf_nodes': 10, 'n_estimators': 50},
 'accuracy score': 0.8871794871794871,
 'f1 score': 0.4358974358974359}

In [192]:
f1(
    RandomForestClassifier(),
    df3.drop('diabetes', axis=1), df3['diabetes'],
    {'n_estimators':[10, 50, 100, 200], 'max_depth':[1, 2, 3,  5], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   40.9s finished


{'best_params': {'max_depth': 5, 'max_leaf_nodes': None, 'n_estimators': 100},
 'accuracy score': 0.9666666666666667,
 'f1 score': 0.8785046728971964}

# LogisticRegression

In [193]:
f_acc(
    LogisticRegression(),
    df3.drop('diabetes', axis=1), df3['diabetes'],
    {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.00001, 10],'solver':['liblinear']}
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.3s finished


{'best_params': {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'},
 'accuracy score': 0.9230769230769231,
 'f1 score': 0.7058823529411764}

In [194]:
f1(
    LogisticRegression(),
    df3.drop('diabetes', axis=1), df3['diabetes'],
    {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.00001, 10],'solver':['liblinear']}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.3s finished


{'best_params': {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'},
 'accuracy score': 0.9230769230769231,
 'f1 score': 0.7058823529411764}

# DecisionTree

In [195]:
f_acc(
    DecisionTreeClassifier(),
    df3.drop('diabetes', axis=1), df3['diabetes'],
    {'max_depth':[ 3,  5, 7, 10], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


{'best_params': {'max_depth': 3, 'max_leaf_nodes': 3},
 'accuracy score': 0.9282051282051282,
 'f1 score': 0.7358490566037735}

In [196]:
f1(
    DecisionTreeClassifier(),
    df3.drop('diabetes', axis=1), df3['diabetes'],
    {'max_depth':[ 3,  5, 7, 10], 'max_leaf_nodes':[None, 2, 3, 5, 10]}
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


{'best_params': {'max_depth': 5, 'max_leaf_nodes': 5},
 'accuracy score': 0.9461538461538461,
 'f1 score': 0.832}