In [36]:
import pandas as pd
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
import numpy as np

df = pd.read_csv("players_20.csv")

colonnes_exclue=["sofifa_id","pace", "shooting", "passing", "dribbling", "defending", "physic", "gk_diving", "gk_handling", "gk_kicking", "gk_reflexes", "gk_speed", "gk_positioning"]

df = df.drop(columns=colonnes_exclue)


def normalize_dataframe_mixed(df, num_mode="minmax", cat_mode="onehot"):
    """
    Normalise un DataFrame mixte avec colonnes numériques et catégorielles.

    Paramètres :
        df (pd.DataFrame) : DataFrame d'entrée
        num_mode (str) : 'minmax' ou 'zscore' pour colonnes numériques
        cat_mode (str) : 'onehot', 'label' ou 'frequency' pour colonnes catégorielles

    Retour :
        pd.DataFrame : DataFrame transformé
    """
    
    df = df.copy()
    
    # Colonnes numériques
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        if num_mode == "minmax":
            min_val, max_val = df[col].min(), df[col].max()
            df[col] = 0 if max_val == min_val else (df[col] - min_val) / (max_val - min_val)
        elif num_mode == "zscore":
            mean, std = df[col].mean(), df[col].std()
            df[col] = 0 if std == 0 else (df[col] - mean) / std
        else:
            raise ValueError("num_mode doit être 'minmax' ou 'zscore'")
    
    # Colonnes catégorielles
    df["preferred_foot"] = df["preferred_foot"].map({"Left": 0, "Right": 1})
    df = df.dropna(subset=["preferred_foot"])

    df["work_rate"] = df["work_rate"].map({
        "Low/Low": 0,
        "Low/Medium": 1,
        "Medium/Low": 1,
        "Medium/Medium": 2,
        "High/Medium": 3,
        "Medium/High": 3,
        "High/High": 4
    })
    df = df.dropna(subset=["work_rate"])

    df["player_positions"] = df["player_positions"].str.split(r",\s*")

    mlb = MultiLabelBinarizer()
    positions_encoded = mlb.fit_transform(df["player_positions"])

    positions_df = pd.DataFrame(
        positions_encoded,
        columns=mlb.classes_,
        index=df.index
    )

    df = pd.concat([df, positions_df], axis=1)
    
    return df

df = normalize_dataframe_mixed(df)
print(df)
df.to_csv("nouveau_csv.csv")

"""
# Variable cible
y = df["value_eur"]

# Variables explicatives = toutes les colonnes sauf la cible
X = df.drop(columns=["value_eur"])
# Test F univarié
f_scores, p_values = f_regression(X, y)

while (max(p_values) > 0.05):
    colonnes_exclue.append(X.columns[np.argmax(p_values)])
    df = df.select_dtypes(include=["number"]).drop(columns=colonnes_exclue).dropna()
    X = df.drop(columns=["value_eur"])    
    f_scores, p_values = f_regression(X, y)

# Mise en forme des résultats
resultats = pd.DataFrame({
    "Variable": X.columns,
    "F_score": f_scores,
    "p_value": p_values
}).sort_values(by="p_value", ascending=False)
print('###############################')
print(resultats)
"""

        age  height_cm  weight_kg  overall  potential  value_eur  wage_eur  \
1      0.80   0.666667   0.595238    1.000   0.944444   0.554207  0.861407   
2      0.45   0.333333   0.238095    0.975   0.916667   1.000000  0.616205   
3      0.40   0.694444   0.690476    0.950   0.944444   0.734421  0.264392   
4      0.50   0.333333   0.380952    0.950   0.888889   0.852983  1.000000   
5      0.50   0.500000   0.285714    0.950   0.888889   0.852983  0.786780   
...     ...        ...        ...      ...        ...        ...       ...   
15673  0.20   0.500000   0.166667    0.150   0.194444   0.001423  0.004264   
15723  0.35   0.861111   0.714286    0.150   0.138889   0.000759  0.000000   
16177  0.20   0.277778   0.333333    0.125   0.194444   0.001043  0.004264   
16950  0.20   0.666667   0.428571    0.075   0.166667   0.000664  0.004264   
17763  0.00   0.638889   0.523810    0.000   0.000000   0.000000  0.000000   

       international_reputation  weak_foot  skill_moves  ...  \

'\n# Variable cible\ny = df_clean["value_eur"]\n\n# Variables explicatives = toutes les colonnes sauf la cible\nX = df_clean.drop(columns=["value_eur"])\n# Test F univarié\nf_scores, p_values = f_regression(X, y)\n\nwhile (max(p_values) > 0.05):\n    colonnes_exclue.append(X.columns[np.argmax(p_values)])\n    df_clean = df.select_dtypes(include=["number"]).drop(columns=colonnes_exclue).dropna()\n    X = df_clean.drop(columns=["value_eur"])    \n    f_scores, p_values = f_regression(X, y)\n\n# Mise en forme des résultats\nresultats = pd.DataFrame({\n    "Variable": X.columns,\n    "F_score": f_scores,\n    "p_value": p_values\n}).sort_values(by="p_value", ascending=False)\nprint(\'###############################\')\nprint(resultats)\n'

In [28]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("players_20.csv")

threshold = 0.7
colonnes_exclue = []
df_corr = df.select_dtypes(include=["number"]).drop(columns=colonnes_exclue).corr()['value_eur'].sort_values(ascending=False)
while (df_corr[-1] < 0.7):
    colonnes_exclue.append(df_corr.keys()[-1])
    df_corr = df.select_dtypes(include=["number"]).drop(columns=colonnes_exclue).corr()['value_eur'].sort_values(ascending=False)
print(df_corr)

  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_co

value_eur             1.000000
release_clause_eur    0.993735
wage_eur              0.858052
Name: value_eur, dtype: float64


  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
  while (df_corr[-1] < 0.7):
