## Initialization

In [None]:
# Import modules
import pandas as pd
import warnings

# Settings
# Options are addressed by their full registered names. pandas resolves
# abbreviated names (e.g. 'display.max_row') by pattern matching, which can
# break if a future release adds an option with a similar name.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 1200)
pd.set_option('display.width', 1200)
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

def csv_to_df(filename):
    """Load ``../DataSets/EncodedData/<filename>.csv`` into a DataFrame.

    Args:
        filename: Base name of the CSV file, without the ``.csv`` extension.

    Returns:
        The DataFrame produced by ``pd.read_csv`` (comma-separated, UTF-8).
    """
    source_path = "../DataSets/EncodedData/{}.csv".format(filename)
    return pd.read_csv(source_path, sep=",", encoding="UTF-8")

def df_to_csv(df, filename):
    """Write ``df`` to ``<filename>.csv`` without the index and print the file name.

    Args:
        df: DataFrame to save.
        filename: Output path, normally without extension. A name that already
            ends in ``.csv`` is used as-is so the extension is never doubled.

    Returns:
        None.
    """
    out_path = str(filename)
    # Guard against "name.csv.csv" when the caller already supplied the extension.
    if not out_path.endswith(".csv"):
        out_path = f"{out_path}.csv"
    df.to_csv(out_path, index=False)
    print(f"{out_path} er laget")


# Confirmation message printed once the setup code above has run.
print("Packages installed")



## Read cleaned data, drop some columns

In [None]:
# Load the cleaned data set through the csv_to_df helper.
df = csv_to_df("cleaned_data_04_24")

# Column list recorded for this file:
#   ,player_id,country_of_birth,country_of_citizenship,date_of_birth,height_in_cm,sub_position,club_id,year,goals_for,goals_against,goals,assists,red_cards,yellow_cards,minutes_played,market_value_in_eur,date,age_at_evaluation,domestic_competition_id,log_market_value_base10
# NOTE(review): the leading comma suggests the CSV also carries an unnamed
# index column that is not dropped below — confirm.

# Remove the player id, birth and date columns; the result is rebound to df.
df = df.drop(columns=["player_id", "country_of_birth", "date_of_birth", "date"])
# Rows with missing values are kept (a dropna step was left disabled here).
df.head()

## TargetEncode: Country, Position, Club, League

In [None]:
from category_encoders.target_encoder import TargetEncoder

# Columns that are replaced by their target encoding.
columns_to_encode = ['country_of_citizenship', 'sub_position', 'club_id', 'domestic_competition_id']

# club_id is cast to string so it is treated as a categorical label.
df['club_id'] = df['club_id'].astype(str)

# Name the columns explicitly: with cols=None category_encoders selects the
# columns to encode by dtype, so a column parsed as numeric would pass through
# unencoded while still receiving the "_encoded" suffix below.
encoder = TargetEncoder(cols=columns_to_encode)

# NOTE(review): the encoder is fitted on every row of df with
# market_value_in_eur as target; if a train/test split happens later this
# leaks target information — confirm the intended workflow.
df_encoded = encoder.fit_transform(df[columns_to_encode], df['market_value_in_eur'])

# Append the encoded columns with an "_encoded" suffix, then drop the raw ones.
df = pd.concat([df, df_encoded.add_suffix('_encoded')], axis=1)
df = df.drop(columns=columns_to_encode)

# Final expression of the cell, so the notebook renders it as a table.
df.head()



In [None]:
# Save the current df via df_to_csv, which appends ".csv" and writes relative
# to the working directory.
df_to_csv(df, "encoded_data_04_24")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to min-max scale; the encoded columns and the market value
# columns are not in this list and stay untouched.
columns_to_normalize = ["goals", "assists", "minutes_played", "height_in_cm", "year", "goals_for", "goals_against", "red_cards", "yellow_cards", "age_at_evaluation"]

# Fit the scaler on these columns and overwrite them with the scaled values.
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

df_to_csv(df, "encoded_data_04_16")

# The preview has to be the cell's last expression to be displayed; placed
# before the save call its value was silently discarded.
df.head()

In [None]:
# Reload the data set named encoded_data_04_16 through csv_to_df.
# NOTE(review): csv_to_df reads from ../DataSets/EncodedData/ while df_to_csv
# writes relative to the working directory — confirm that the file read here
# is the one saved by the normalization step.
df = csv_to_df("encoded_data_04_16")

# Keep only the rows whose encoded competition value is among the five most
# frequent ones.
# presumably each competition maps to a distinct encoded value — verify.
top_5_competition_ids = df["domestic_competition_id_encoded"].value_counts().nlargest(5).index
df = df[df["domestic_competition_id_encoded"].isin(top_5_competition_ids)]

# df_to_csv appends ".csv" itself, so the base name is passed without the
# extension to avoid a doubled ".csv.csv" suffix.
df_to_csv(df, "encoded_data_04_16_top5")