In [1]:
import pandas as pd
from tabulate import tabulate

# Build the 2022 modelling table: keep one season of the raw stats, derive the
# two binary columns (ChampionsLeague, GVB) and persist the selected features
# to 'data/teams-stats.csv' for the following cells.
file_path = 'data/teams-stats.ds.csv'
df = pd.read_csv(file_path)

# Clubs flagged with ChampionsLeague = 1 (matched by exact name).
champions_league_clubs = [
    'Real Madrid', 'Eintracht Frankfurt', 'Manchester City', 'Liverpool',
    'Chelsea', 'Tottenham Hotspur', 'Barcelona', 'Atlético Madrid', 'Sevilla',
    'Milan', 'Inter Milan', 'Napoli', 'Juventus', 'Bayern Munich',
    'Borussia Dortmund', 'Bayer Leverkusen', 'RB Leipzig', 'Paris Saint-Germain',
    'Marseille', 'Porto', 'Sporting CP', 'Ajax', 'Club Brugge', 'Red Bull Salzburg',
    'Celtic', 'Shakhtar Donetsk', 'Trabzonspor', 'Copenhagen'
]

# Rows whose 'Overall' is at or above this value get GVB = 1, all others 0.
gvb_overall_threshold = 68
# Only this many rows are echoed; printing the whole frame floods the notebook.
preview_rows = 10

# .copy() so the new columns are added to an independent frame, not a view of df.
df_2022 = df[df['Year'] == 2022].copy()

# Vectorised equivalents of the per-row lambdas (missing values map to 0).
df_2022['ChampionsLeague'] = df_2022['Club'].isin(champions_league_clubs).astype('int64')
df_2022['GVB'] = (df_2022['Overall'] >= gvb_overall_threshold).astype('int64')

selected_columns = [
    'Club', 'IntReputation', 'Age', 'SkillMoves', 'Crossing',
    'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
    'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'ShotPower',
    'Jumping', 'LongShots', 'Aggression', 'Interceptions', 'Positioning',
    'Vision', 'Penalties', 'Composure', 'StandingTackle', 'SlidingTackle',
    'ChampionsLeague', 'GVB'
]

# Preview only; the complete table is written to disk below.
print(tabulate(df_2022[selected_columns].head(preview_rows), headers='keys', tablefmt='pretty'))

output_file_path = 'data/teams-stats.csv'
df_2022[selected_columns].to_csv(output_file_path, index=False)


+------+---------------------------------------+--------------------+--------------------+------------+----------+-----------+--------------------+-------------------+--------------------+-----------+-------------+--------------+-------------+---------+-----------+---------+-----------+------------+--------------------+-------------+--------+-----------+-----------+--------------------+--------------------+-----------------+-----+
|      |                 Club                  |   IntReputation    |        Age         | SkillMoves | Crossing | Finishing |  HeadingAccuracy   |   ShortPassing    |      Volleys       | Dribbling | BallControl | Acceleration | SprintSpeed | Agility | ShotPower | Jumping | LongShots | Aggression |   Interceptions    | Positioning | Vision | Penalties | Composure |   StandingTackle   |   SlidingTackle    | ChampionsLeague | GVB |
+------+---------------------------------------+--------------------+--------------------+------------+----------+-----------+----

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Balance the GVB classes with SMOTE, standardise the features and write the
# result to 'data/teams-stats-standard.csv'.
#
# NOTE(review): oversampling and scaling are both fit on the complete table,
# before any train/test split. Any cell that later splits the saved file will
# have synthetic rows and scaling statistics shared between train and test,
# which makes the reported scores optimistic. Confirm whether this is intended.
df = pd.read_csv('data/teams-stats.csv')

# Report missing values per column (SMOTE below is run on the data as-is).
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("Brakujące wartości w zbiorze danych: Brak brakujących wartości")
else:
    print("Brakujące wartości:\n", missing_values)

class_balance = df['GVB'].value_counts()
print("Rozkład klasy docelowej GVB:\n", class_balance)

# 'Club' is an identifier and 'GVB' the target; everything else is a feature.
X = df.drop(columns=['Club', 'GVB'])
y = df['GVB']
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
new_class_balance = y_resampled.value_counts()
print("Rozkład klasy docelowej GVB po nadpróbkowaniu:\n", new_class_balance)

# Synthetic rows are taken to be everything after the first len(X) rows. The
# previous version relied on the same layout through an index set difference,
# but iterated an unordered set; a positional slice keeps the row order stable.
# NOTE(review): assumes fit_resample appends new samples after the originals.
n_original = len(X)
n_synthetic = len(X_resampled) - n_original
new_samples = X_resampled.iloc[n_original:]
print("Nowe próbki wygenerowane przez SMOTE:\n", new_samples)

# Real rows keep their real club name; only synthetic rows get a placeholder.
# Previously every row, including the real ones, was renamed to "Club_<n>".
new_clubs = df['Club'].tolist() + [
    f"Club_{i}" for i in range(n_original, n_original + n_synthetic)
]

scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)

X_resampled_scaled_df = pd.DataFrame(X_resampled_scaled, columns=X.columns)

X_resampled_scaled_df['Club'] = new_clubs

X_resampled_scaled_df['GVB'] = y_resampled
X_resampled_scaled_df.to_csv('data/teams-stats-standard.csv', index=False)
print("Dane zostały zapisane do 'teams-stats-standard.csv'")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Exploratory plots on the raw stats file (all rows, no year filter):
#   1. mean technical skill ratings per GVB class,
#   2. row counts per ChampionsLeague flag, split by GVB class.
file_path = 'data/teams-stats.ds.csv'
df = pd.read_csv(file_path)

champions_league_clubs = [
    'Real Madrid', 'Eintracht Frankfurt', 'Manchester City', 'Liverpool',
    'Chelsea', 'Tottenham Hotspur', 'Barcelona', 'Atlético Madrid', 'Sevilla',
    'Milan', 'Inter Milan', 'Napoli', 'Juventus', 'Bayern Munich',
    'Borussia Dortmund', 'Bayer Leverkusen', 'RB Leipzig', 'Paris Saint-Germain',
    'Marseille', 'Porto', 'Sporting CP', 'Ajax', 'Club Brugge', 'Red Bull Salzburg',
    'Celtic', 'Shakhtar Donetsk', 'Trabzonspor', 'Copenhagen'
]

# Binary flags: 1 when the club is in the list / when Overall is at least 68.
df['ChampionsLeague'] = df['Club'].isin(champions_league_clubs).astype('int64')
df['GVB'] = (df['Overall'] >= 68).astype('int64')

skills = ['Dribbling', 'Crossing', 'Finishing', 'BallControl']
df_means = df.groupby('GVB')[skills].mean()

# Transposed so that skills sit on the x axis and each GVB class is one bar series.
ax_skills = df_means.T.plot(kind='bar', figsize=(10, 6), color=['#1f77b4', '#ff7f0e'])
ax_skills.set_title('Średnie umiejętności techniczne dla dobrych i słabszych klubów')
ax_skills.set_xlabel('Umiejętności')
ax_skills.set_ylabel('Średnia ocena')
plt.xticks(rotation=45)
ax_skills.legend(['Słabszy klub (GVB=0)', 'Dobry klub (GVB=1)'])
plt.tight_layout()
plt.show()

fig_counts, ax_counts = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=df,
    x='ChampionsLeague',
    hue='GVB',
    palette=['#1f77b4', '#ff7f0e'],
    ax=ax_counts,
)
ax_counts.set_title('Liczba klubów z i bez udziału w Lidze Mistrzów dla różnych klas GVB')
ax_counts.set_xlabel('Udział w Lidze Mistrzów')
ax_counts.set_ylabel('Liczba klubów')
plt.xticks([0, 1], ['Bez Ligi Mistrzów', 'Z Ligą Mistrzów'])
ax_counts.legend(['Słabszy klub (GVB=0)', 'Dobry klub (GVB=1)'])
plt.show()


In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Baseline linear SVM on the standardised table, plus a 2-D PCA view of the
# data and a bar chart of the linear coefficients.
#
# The path matches the location the preprocessing cell writes to ('data/...').
df = pd.read_csv('data/teams-stats-standard.csv')

X = df.drop(columns=['Club', 'GVB'])
y = df['GVB']

# Split first, then fit the scaler on the training rows only, so no statistic
# of the test rows influences the features the model is trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# All rows in their original order, used only for the plots below.
X_scaled = scaler.transform(X)

svm_model = SVC(kernel='linear')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

print("Dokładność modelu SVM:", accuracy_score(y_test, y_pred))
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Macierz pomyłek - SVM')
plt.show()

# PCA is fit on the training rows and applied to the same train/test rows the
# labels belong to. The previous version sliced the first len(X_train) rows of
# the unshuffled matrix and paired them with the shuffled y_train, so features
# and labels did not refer to the same samples.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# Projection of every row (original order, aligned with y) for the scatter plot.
X_pca = pca.transform(X_scaled)

svm_model_pca = SVC(kernel='linear')
svm_model_pca.fit(X_train_pca, y_train)

y_pred_pca = svm_model_pca.predict(X_test_pca)

# Scatter of all samples in PCA space coloured by the true class.
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='coolwarm', style=y, s=100, alpha=0.7)
plt.title("SVM Decision Boundary in 2D Space (PCA)")
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

sns.pairplot(df, hue="GVB", vars=['IntReputation', 'Age', 'SkillMoves', 'Crossing', 'Finishing', 'HeadingAccuracy'])
plt.show()

# One coefficient per input feature for the linear-kernel model fit on X_train.
coefficients = svm_model.coef_.flatten()
features = X.columns

plt.figure(figsize=(14, 8))
plt.bar(features, coefficients)
plt.title("SVM Model Coefficients (Linear Kernel)")
plt.xlabel("Features")
plt.ylabel("Coefficient Value")
plt.xticks(rotation=90)
plt.show()


In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Grid-searched SVM on the standardised table, with the same diagnostics as the
# baseline SVM cell (confusion matrix, PCA scatter, pair plot, coefficients).
#
# The path matches the location the preprocessing cell writes to ('data/...').
df = pd.read_csv('data/teams-stats-standard.csv')

X = df.drop(columns=['Club', 'GVB'])
y = df['GVB']

# Split first, then fit the scaler on the training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# All rows in their original order, used only for the plots below.
X_scaled = scaler.transform(X)

# 'gamma' has no effect on the linear kernel, so it is searched for 'rbf' only;
# a single flat grid would refit identical linear models once per gamma value.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.1, 1]},
]

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Najlepsze parametry: ", grid_search.best_params_)

best_svm_model = grid_search.best_estimator_

y_pred_opt = best_svm_model.predict(X_test)

print("Dokładność modelu SVM (optymalizowanego):", accuracy_score(y_test, y_pred_opt))
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred_opt))

cm_opt = confusion_matrix(y_test, y_pred_opt)
plt.figure(figsize=(12, 8))
sns.heatmap(cm_opt, annot=True, fmt="d", cmap="Blues", xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Macierz pomyłek - SVM optymalizowany')
plt.show()

# PCA is fit on the training rows and applied to the same rows the labels
# belong to (slicing the unshuffled matrix would pair features with the wrong
# shuffled labels).
pca_opt = PCA(n_components=2)
X_train_pca_opt = pca_opt.fit_transform(X_train)
X_test_pca_opt = pca_opt.transform(X_test)
# Projection of every row (original order, aligned with y) for the scatter plot.
X_pca_opt = pca_opt.transform(X_scaled)

# A fresh estimator with the tuned hyper-parameters. Re-using best_svm_model
# here would refit it on two PCA components and overwrite the full-feature
# model, leaving coef_ with 2 entries for the coefficient plot below.
svm_model_pca_opt = SVC(**grid_search.best_params_)
svm_model_pca_opt.fit(X_train_pca_opt, y_train)

y_pred_pca_opt = svm_model_pca_opt.predict(X_test_pca_opt)

# Scatter of all samples in PCA space coloured by the true class.
plt.figure(figsize=(12, 8))
sns.scatterplot(x=X_pca_opt[:, 0], y=X_pca_opt[:, 1], hue=y, palette='coolwarm', style=y, s=100, alpha=0.7)
plt.title("SVM Decision Boundary in 2D Space (Optimized SVM)")
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

sns.pairplot(df, hue="GVB", vars=['IntReputation', 'Age', 'SkillMoves', 'Crossing', 'Finishing', 'HeadingAccuracy'])
plt.show()

# coef_ is only available for the linear kernel.
if grid_search.best_params_['kernel'] == 'linear':
    coefficients_opt = best_svm_model.coef_.flatten()
    features_opt = X.columns

    plt.figure(figsize=(14, 8))
    plt.bar(features_opt, coefficients_opt)
    plt.title("SVM Model Coefficients (Optimized Linear Kernel)")
    plt.xlabel("Features")
    plt.ylabel("Coefficient Value")
    plt.xticks(rotation=90)
    plt.show()


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Decision tree with default hyper-parameters on the standardised table.
#
# The path matches the location the preprocessing cell writes to ('data/...');
# the previous '../data/...' pointed at a different directory.
df = pd.read_csv('data/teams-stats-standard.csv')

# 'Club' is an identifier and 'GVB' the target; everything else is a feature.
X = df.drop(columns=['Club', 'GVB'])
y = df['GVB']

# 70/30 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

dt_model_default = DecisionTreeClassifier(random_state=42)

dt_model_default.fit(X_train, y_train)

y_pred_default = dt_model_default.predict(X_test)

print("Dokładność modelu drzewa decyzyjnego (domyślne parametry):", accuracy_score(y_test, y_pred_default))
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred_default))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm_default = confusion_matrix(y_test, y_pred_default)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_default, annot=True, fmt="d", cmap="Blues", xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Macierz pomyłek - Drzewo decyzyjne')
plt.show()


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Decision tree tuned with a 5-fold grid search on the standardised table.
#
# The path matches the location the preprocessing cell writes to ('data/...');
# the previous '../data/...' pointed at a different directory.
df = pd.read_csv('data/teams-stats-standard.csv')

# 'Club' is an identifier and 'GVB' the target; everything else is a feature.
X = df.drop(columns=['Club', 'GVB'])
y = df['GVB']

# 70/30 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Search space: tree depth (None = unlimited) and the two minimum-sample limits.
param_grid = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# The search only sees the training split; the test split is kept for the final score.
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Najlepsze parametry modelu drzewa decyzyjnego:", best_params)

# predict() on the search object delegates to the refitted best estimator.
y_pred_optimized = grid_search.predict(X_test)

print("Dokładność modelu drzewa decyzyjnego (optymalizowane parametry):", accuracy_score(y_test, y_pred_optimized))
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred_optimized))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm_optimized = confusion_matrix(y_test, y_pred_optimized)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_optimized, annot=True, fmt="d", cmap="Blues", xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Macierz pomyłek - Drzewo decyzyjne (po optymalizacji)')
plt.show()


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Random forest with default hyper-parameters on the standardised table.
#
# The path matches the location the preprocessing cell writes to ('data/...');
# the previous '../data/...' pointed at a different directory.
df = pd.read_csv('data/teams-stats-standard.csv')

# 'Club' is an identifier and 'GVB' the target; everything else is a feature.
X = df.drop(columns=['Club', 'GVB'])
y = df['GVB']

# 70/30 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seeded so the bootstrap samples and feature sub-sampling are repeatable.
rf_model_default = RandomForestClassifier(random_state=1)

rf_model_default.fit(X_train, y_train)

y_pred_default = rf_model_default.predict(X_test)

print("Dokładność modelu lasu losowego (domyślne parametry):", accuracy_score(y_test, y_pred_default))
print("Raport klasyfikacji (domyślne parametry):\n", classification_report(y_test, y_pred_default))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm_default = confusion_matrix(y_test, y_pred_default)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_default, annot=True, fmt="d", cmap="Blues", xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Macierz pomyłek - Las losowy')
# Saved before show(), while the figure is still the current one.
plt.savefig("rf_matrix_default.png")
plt.show()


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Random forest with hand-picked hyper-parameters on the standardised table,
# followed by a feature-importance chart.
#
# The path matches the location the preprocessing cell writes to ('data/...');
# the previous '../data/...' pointed at a different directory.
df = pd.read_csv('data/teams-stats-standard.csv')

# 'Club' is an identifier and 'GVB' the target; everything else is a feature.
X = df.drop(columns=['Club', 'GVB'])
y = df['GVB']

# 70/30 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_model_optimized = RandomForestClassifier(
    n_estimators=10000,
    max_depth=50,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    # Without a seed every run produced a different forest and different scores.
    random_state=42,
    # Trees are independent; use all cores so 10000 of them fit in reasonable time.
    n_jobs=-1,
)

rf_model_optimized.fit(X_train, y_train)

y_pred_optimized = rf_model_optimized.predict(X_test)

print("Dokładność modelu lasu losowego (optymalizacja):", accuracy_score(y_test, y_pred_optimized))
print("Raport klasyfikacji (optymalizacja):\n", classification_report(y_test, y_pred_optimized))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm_optimized = confusion_matrix(y_test, y_pred_optimized)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_optimized, annot=True, fmt="d", cmap="Blues", xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Macierz pomyłek - Las losowy (po optymalizacji)')
# Saved before show(), while the figure is still the current one.
plt.savefig("rf_matrix_aggressive_optimized.png")
plt.show()

# One importance value per feature column of X, in column order.
feature_importances = rf_model_optimized.feature_importances_

plt.figure(figsize=(10, 6))
plt.barh(X.columns, feature_importances, color='skyblue')
plt.xlabel('Ważność cech')
plt.ylabel('Cechy')
plt.title('Ważność cech w modelu lasu losowego')
plt.tight_layout()

plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Hard-voting ensemble (random forest + gradient boosting + logistic
# regression) on the standardised table.
#
# The path matches the location the preprocessing cell writes to ('data/...');
# the previous '../data/...' pointed at a different directory.
df = pd.read_csv('data/teams-stats-standard.csv')
X, y = df.drop(columns=['Club', 'GVB']), df['GVB']
# 70/30 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The tree ensembles are seeded; unseeded, each run gave different scores.
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
lr_model = LogisticRegression()

# 'hard' voting: the predicted class is the majority vote of the three members.
voting_model = VotingClassifier(
    estimators=[('rf', rf_model), ('gb', gb_model), ('lr', lr_model)],
    voting='hard'
)

voting_model.fit(X_train, y_train)

y_pred_voting = voting_model.predict(X_test)
print(f"Dokładność (Voting Classifier): {accuracy_score(y_test, y_pred_voting)}")
print("Raport klasyfikacji (Voting Classifier):\n", classification_report(y_test, y_pred_voting))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm_voting = confusion_matrix(y_test, y_pred_voting)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_voting, annot=True, fmt="d", cmap="Blues", xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"])
plt.xlabel('Predykcja')
plt.ylabel('Rzeczywiste')
plt.title('Macierz pomyłek - Voting Classifier')
# Saved before show(), while the figure is still the current one.
plt.savefig("voting_matrix.png")
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Soft-voting ensemble of individually tuned models, with a per-model accuracy
# comparison on the held-out split.
#
# The path matches the location the preprocessing cell writes to ('data/...');
# the previous '../data/...' pointed at a different directory.
df = pd.read_csv('data/teams-stats-standard.csv')
X, y = df.drop(columns=['Club', 'GVB']), df['GVB']
# 70/30 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random forest: 10 randomly sampled combinations out of the grid below.
rf_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# random_state fixes WHICH combinations are sampled; without it the selected
# model, and therefore every downstream number, changed from run to run.
rf_grid = RandomizedSearchCV(
    RandomForestClassifier(random_state=42), rf_params,
    cv=5, n_jobs=-1, verbose=0, n_iter=10, random_state=42
)
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_

# Gradient boosting: exhaustive search.
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=5, n_jobs=-1, verbose=0)
gb_grid.fit(X_train, y_train)
gb_best = gb_grid.best_estimator_

# SVM: probability=True is required so it can take part in soft voting.
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_grid = GridSearchCV(SVC(probability=True, random_state=42), svm_params, cv=5, n_jobs=-1, verbose=0)
svm_grid.fit(X_train, y_train)
svm_best = svm_grid.best_estimator_

knn_params = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, n_jobs=-1, verbose=0)
knn_grid.fit(X_train, y_train)
knn_best = knn_grid.best_estimator_

# Logistic regression is used untuned; it is fit here so that it can be scored
# on its own in the comparison below.
lr_best = LogisticRegression(max_iter=1000, random_state=42)
lr_best.fit(X_train, y_train)

# 'soft' voting averages predicted class probabilities using the given weights.
voting_model = VotingClassifier(
    estimators=[('rf', rf_best), ('gb', gb_best), ('svm', svm_best), ('knn', knn_best), ('lr', lr_best)],
    voting='soft',
    weights=[2, 2, 1, 1, 0.5]
)
voting_model.fit(X_train, y_train)

# Predict once with the ensemble and reuse the result for both the accuracy
# list and the confusion matrix.
y_pred_voting = voting_model.predict(X_test)

model_names = ["Random Forest", "Gradient Boosting", "SVM", "kNN", "Logistic Regression", "Voting Classifier"]
accuracies = [
    accuracy_score(y_test, rf_best.predict(X_test)),
    accuracy_score(y_test, gb_best.predict(X_test)),
    accuracy_score(y_test, svm_best.predict(X_test)),
    accuracy_score(y_test, knn_best.predict(X_test)),
    accuracy_score(y_test, lr_best.predict(X_test)),
    accuracy_score(y_test, y_pred_voting)
]

# Confusion matrix with the same Bad/Good tick labels as the other model cells.
cm_voting = confusion_matrix(y_test, y_pred_voting)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm_voting, annot=True, fmt="d", cmap="Blues", cbar=False,
    xticklabels=["Bad", "Good"], yticklabels=["Bad", "Good"]
)
plt.title("Macierz pomyłek - Voting Classifier optymalizowany")
plt.xlabel("Przewidywane etykiety")
plt.ylabel("Rzeczywiste etykiety")
plt.show()

# Zoom the y axis to [0.8, 1] when every model clears 0.8; otherwise widen it
# so that a weaker model's bar is not cut off below the visible range.
lowest_accuracy = min(accuracies)
y_axis_floor = 0.8 if lowest_accuracy >= 0.8 else max(0.0, lowest_accuracy - 0.05)

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies, palette="viridis")
plt.xlabel("Model")
plt.ylabel("Dokładność")
plt.title("Porównanie dokładności modeli")
plt.xticks(rotation=45)
plt.ylim(y_axis_floor, 1)
plt.tight_layout()
plt.show()

for name, accuracy in zip(model_names, accuracies):
    print(f"{name} - Dokładność: {accuracy:.4f}")
