In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf

<h1> Data source: https://www.kaggle.com/datasets/maso0dahmed/football-players-data </h1>

In [6]:
dataframe = pd.read_csv("fifa_players.csv")

print(dataframe)
print(dataframe.info())

dataframe.dropna(inplace=True)
print(dataframe)

                 name                         full_name birth_date  age  \
0            L. Messi    Lionel Andrés Messi Cuccittini  6/24/1987   31   
1          C. Eriksen      Christian  Dannemann Eriksen  2/14/1992   27   
2            P. Pogba                        Paul Pogba  3/15/1993   25   
3          L. Insigne                   Lorenzo Insigne   6/4/1991   27   
4        K. Koulibaly                 Kalidou Koulibaly  6/20/1991   27   
...               ...                               ...        ...  ...   
17949     R. McKenzie                    Rory  McKenzie  10/7/1993   25   
17950       M. Sipľak                     Michal Sipľak   2/2/1996   23   
17951      J. Bekkema                       Jan Bekkema   4/9/1996   22   
17952      A. Al Yami               Abdulrahman Al Yami  6/19/1997   21   
17953  Júnior Brumado  José Francisco dos Santos Júnior  5/15/1999   19   

       height_cm  weight_kgs  positions   nationality  overall_rating  \
0         170.18        72

<h1>Губим само 9 записа след като махнем nan записите</h1>

<h1>Проверяваме зависимостите в различните колони</h1>

In [3]:
numerical_columns = dataframe.select_dtypes(include=['float64', 'int64'])
correlation = numerical_columns.corr()['value_euro'].sort_values(ascending=False)
print(correlation)

value_euro                       1.000000
release_clause_euro              0.994147
wage_euro                        0.858674
overall_rating                   0.828327
potential                        0.770785
reactions                        0.742834
international_reputation(1-5)    0.731106
national_rating                  0.591612
composure                        0.564779
vision                           0.432182
short_passing                    0.348208
long_passing                     0.331346
ball_control                     0.323684
volleys                          0.323286
penalties                        0.312280
finishing                        0.308051
freekick_accuracy                0.303380
curve                            0.296017
long_shots                       0.292872
skill_moves(1-5)                 0.283953
positioning                      0.278042
dribbling                        0.277897
shot_power                       0.275448
agility                          0

<h1>Ще тренираме моделите които ще изпробвам с тези колони:
wage_euro (0.858674)
overall_rating (0.828327)
potential (0.770785)
international_reputation(1-5) (0.731106)
reactions (0.742834)
composure (0.564779)
vision (0.432182)</h1>

<h1>Първо ще ползвам sklearn защото e подходящ за по малак dataset като моя.
Ще ползвам първо дървета</h1>

<h1>1.Избирам полезните колони</h1>

In [4]:
data = dataframe[['wage_euro', 'overall_rating', 'potential', 'international_reputation(1-5)', 'reactions', 'composure', 'vision', 'value_euro']]

<h1>Преобразуване на категориални данни</h1>

In [5]:
label_encoder = LabelEncoder()
data['international_reputation(1-5)'] = label_encoder.fit_transform(data['international_reputation(1-5)'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['international_reputation(1-5)'] = label_encoder.fit_transform(data['international_reputation(1-5)'])


<h1>Разделяне на данните на входни и изходни</h1>

In [6]:
X = data.drop('value_euro', axis=1)
y = data['value_euro']

<h1>Разделяне на данните на обучаваща и тестова част</h1>

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h1>Създаване и обучение на модела</h1>

In [8]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

<h1>Оценка на модела</h1>

In [9]:
y_pred = model.predict(X_test)

<h1>Изчисляване на метрики</h1>

In [10]:
mse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 2484678.733760162
Mean Absolute Error: 1470750.0
R-squared: 0.9750647898621769


<h1>Метриките показват че модела е сравнително добър</h1>

In [11]:
def predict_player_value(wage_euro, overall, potential, international_reputation, reactions, composure, vision):
    input_data = pd.DataFrame({
        'wage_euro': [wage_euro],
        'overall_rating': [overall],
        'potential': [potential],
        'international_reputation(1-5)': [label_encoder.transform([international_reputation])[0]],
        'reactions': [reactions],
        'composure': [composure],
        'vision': [vision]
    })
    return model.predict(input_data)[0]

# Пример за предсказване
player_value = predict_player_value(565000, 94, 94, 5, 95, 96, 94)
print(f'Прогнозна стойност на футболиста: {player_value} евро')

Прогнозна стойност на футболиста: 106685000.0 евро


<h1>Примерните данни са за Меси неговата цена е 110500000</h1>

<h1 style="color: red;">2.Ще ползвам deep learning този път: регресионен модел на pytorch</h1>

<h1>Преобразуваме на данните в тензори</h1>

In [12]:
X = data.drop('value_euro', axis=1).values
y = data['value_euro'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test)

<h1>Създаваме модел</h1>

In [13]:
class RegressionModel(nn.Module):
    def __init__(self):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

model = RegressionModel()

<h1>Определяme загубаta и оптимизатор</h1>

In [14]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

<h1>Обучаваме модела</h1>

In [15]:
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor.view(-1, 1))
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/100], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 439172686938112.0000
Epoch [20/100], Loss: 438906734510080.0000
Epoch [30/100], Loss: 438601489842176.0000
Epoch [40/100], Loss: 438206319296512.0000
Epoch [50/100], Loss: 437677367230464.0000
Epoch [60/100], Loss: 436983629348864.0000
Epoch [70/100], Loss: 436089940606976.0000
Epoch [80/100], Loss: 434957310754816.0000
Epoch [90/100], Loss: 433543897415680.0000
Epoch [100/100], Loss: 431806113382400.0000


<h1>Оценка на модела</h1>

In [16]:
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    y_pred = y_pred_tensor.numpy()

<h1>Изчисляваме метриките</h1>

In [25]:
mse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 6390535.557453341
Mean Absolute Error: 4565171.631329114
R-squared: 0.8350519985089928


<h1>Метриките показват че модела не работи</h1>

In [18]:
def predict_player_value(wage_euro, overall_rating, potential, international_reputation, reactions, composure, vision):
    model.eval()
    input_data = torch.FloatTensor([[wage_euro, overall_rating, potential, international_reputation, reactions, composure, vision]])
    with torch.no_grad():
        predicted_value = model(input_data)
    return predicted_value.item()

predicted_value = predict_player_value(565000, 94, 94, 5, 95, 96, 94)
print(f"Предсказана стойност на футболиста: {predicted_value:.2f} евро")

Предсказана стойност на футболиста: 1550562.38 евро


<h1>Данните са на Меси които струва 110500000</h1>

<h1 style="color: yellow;">Накрая ще използвам Sequential модел на керас</h1>

<h1>Създаваме модел</h1>

In [19]:
X = data.drop('value_euro', axis=1)
y = data['value_euro']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<h1>Компилираme модела</h1>

In [20]:
model.compile(optimizer='adam', loss='mean_squared_error')

<h1>Обучavame модела</h1>

In [21]:
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 493188645126144.0000
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 496311690330112.0000  
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 428179785252864.0000 
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 387239553007616.0000 
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 459243169775616.0000 
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 379919485894656.0000
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 388159716196352.0000
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 375213611024384.0000
Epoch 9/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/ste

<keras.src.callbacks.history.History at 0x2bfbf052fe0>

<h1>Правим оценка на модела</h1>

In [22]:
y_pred = model.predict(X_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


<h1>Изчисляваме метрикиte</h1>

In [26]:
mse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 6390535.557453341
Mean Absolute Error: 4565171.631329114
R-squared: 0.8350519985089928


<h1>Метриките показват че модела е сравнително добър</h1>

In [24]:
def predict_player_value(wage_euro, overall_rating, potential, international_reputation, reactions, composure, vision):
    input_data = np.array([[wage_euro, overall_rating, potential, international_reputation, reactions, composure, vision]])
    predicted_value = model.predict(input_data)
    return predicted_value[0][0]

predicted_value = predict_player_value(565000, 94, 94, 5, 95, 96, 94)
print(f"Предсказана стойност на футболиста: {predicted_value:.2f} евро")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
Предсказана стойност на футболиста: 139275200.00 евро


<h1>Данните са на Меси които струва 110500000</h1>

<h1 style="color: red;">Заключение:</h1>

<h1>Метриките показват че най–добрият модел е: <h1 style="color: green;">RandomForestRegressor на sklearn</h1></h1>

<h1>На второ място e:<h1 style="color: yellow;">Sequential на keras</h1><h1>

<h1>На трето място и единственият който не работи e:<h1 style="color: red">RegressionModel на pytorch</h1></h1>

<table style="width: 100%; border-collapse: collapse;">
  <tr>
    <th style="border: 1px solid black; padding: 8px;">Model</th>
    <th style="border: 1px solid black; padding: 8px;">Mean Absolute Error</th>
    <th style="border: 1px solid black; padding: 8px;">R-squared</th>
  </tr>
  <tr style="background-color: yellow;">
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">Sequential</td>
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">4566260.08</td>
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">0.8350</td>
  </tr>
  <tr style="background-color: red;">
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">RegressionModel</td>
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">13026226.57</td>
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">-0.6712</td>
  </tr>
  <tr style="background-color: green;">
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">RandomForestRegressor</td>
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">1470750.00</td>
    <td style="border: 1px solid black; padding: 8px; font-weight: bold; color: black;">0.9751</td>
  </tr>
</table>