## Initialization

In [None]:
# Import modules
import pandas as pd
import warnings

#settings
pd.set_option('display.max_row', 100)
pd.set_option('display.max_columns', 1200)
pd.set_option('display.width', 1200)
warnings.filterwarnings("ignore")
%matplotlib inline

def csv_to_df(filename, data_dir="../DataSets/EncodedData"):
    """Load ``<data_dir>/<filename>.csv`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name without the ``.csv`` extension.
    data_dir : str, optional
        Directory to read from. The default is the path this helper always
        used, so existing one-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file (comma separated, read as UTF-8).
    """
    file_path = f"{data_dir}/{filename}.csv"
    return pd.read_csv(file_path, sep=",", encoding="UTF-8")

def df_to_csv(df, filename):
    """Write ``df`` to ``<filename>.csv`` without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    filename : str
        Target name, normally without extension. A name that already ends in
        ``.csv`` is used as given instead of producing ``name.csv.csv``.

    Returns
    -------
    None
        The file is written and a confirmation line is printed
        ("er laget" is Norwegian for "has been created").
    """
    name = str(filename)
    # Only append the extension when the caller has not supplied it already
    output_path = name if name.lower().endswith(".csv") else f"{name}.csv"
    df.to_csv(output_path, index=False)
    print(f"{output_path} er laget")


# Confirmation that the setup cell ran.
# NOTE(review): this cell only imports packages; it does not install anything.
print("Packages installed")



## Read clean data (the column drop is currently commented out)

In [None]:
# Load the working dataset through the csv_to_df helper.
# NOTE(review): a later cell of this notebook writes "encoded_data_beta" itself,
# so on a fresh kernel this cell relies on a file produced by an earlier run —
# confirm the intended execution order.
df = csv_to_df("encoded_data_beta")

# Column list kept for reference (presumably the header of the raw CSV — TODO confirm):
#,player_id,country_of_birth,country_of_citizenship,date_of_birth,height_in_cm,sub_position,club_id,year,goals_for,goals_against,goals,assists,red_cards,yellow_cards,minutes_played,market_value_in_eur,date,age_at_evaluation,domestic_competition_id,log_market_value_base10

# Disabled cleanup steps, kept for reference:
#df.drop(["player_id", "country_of_birth", "date_of_birth", "date"], axis=1, inplace=True)
#df.dropna(inplace=True)
df.head()

## TargetEncode: Country, Position, Club, League

In [None]:
from category_encoders.target_encoder import TargetEncoder

# Only the playing position is target-encoded in this cell
columns_to_encode = ['sub_position']

# Learn the encoding from the 'Market Value' target and apply it in one step
encoder = TargetEncoder()
df_encoded = encoder.fit_transform(df[columns_to_encode], df['Market Value'])

# Append the result next to the source column as '<column>_encoded'
df = pd.concat(
    [df, df_encoded.add_suffix('_encoded')],
    axis="columns",
)

# Preview the extended frame
df.head()

In [None]:
# Same preview as the last line of the previous cell
df.head()

In [None]:
# Give the target-encoded position column its presentation name
df = df.rename({"sub_position_encoded": "Position Value"}, axis="columns")

# Persist the result under the name "encoded_data_gamma"
df_to_csv(df, "encoded_data_gamma")

In [None]:
from category_encoders.target_encoder import TargetEncoder

# Categorical columns to target-encode against the raw market value
columns_to_encode = ['country_of_citizenship', 'club_id','sub_position', 'domestic_competition_id']

# Store club ids as strings before encoding
# (presumably so the encoder treats them as categories, not numbers — TODO confirm)
df['club_id'] = df['club_id'].astype(str)

encoder = TargetEncoder()
df_encoded = encoder.fit_transform(df[columns_to_encode], df['market_value_in_eur'])

# Attach the '<column>_encoded' results, then remove three of the four raw
# columns; 'sub_position' stays in the frame.
df = pd.concat([df, df_encoded.add_suffix('_encoded')], axis="columns").drop(
    columns=["country_of_citizenship", "domestic_competition_id", "club_id"]
)

# Preview the extended frame
df.head()



In [None]:
# Distinct position labels, in order of first appearance
unique_values = pd.unique(df['sub_position'])

# One label per line under a short heading
print("Unique values of the column:")
for value in unique_values:
    print(value)

### One-Hot Encoding of Sub Positions

In [None]:

# Full position name -> short abbreviation, grouped by area of the pitch
position_mapping = {
    # goalkeeper
    'Goalkeeper': 'GK',
    # defenders
    'Centre-Back': 'CB',
    'Left-Back': 'LB',
    'Right-Back': 'RB',
    # midfielders
    'Defensive Midfield': 'DM',
    'Central Midfield': 'CM',
    'Attacking Midfield': 'AM',
    'Left Midfield': 'LM',
    'Right Midfield': 'RM',
    # attackers
    'Left Winger': 'LW',
    'Right Winger': 'RW',
    'Second Striker': 'SS',
    'Centre-Forward': 'CF',
}

# Swap every mapped name for its abbreviation; values that are not keys of the
# mapping are left as they are.
df['sub_position'] = df['sub_position'].replace(to_replace=position_mapping)

In [None]:

# One indicator column per distinct 'sub_position' value, named 'POS_<value>'
one_hot_encoded = pd.get_dummies(df['sub_position'], prefix='POS')

# Append the indicator columns in a new frame; `df` itself is left unchanged
df_encoded = pd.concat([df, one_hot_encoded], axis=1)

# Preview a few rows through the notebook's rich table display instead of
# printing the text repr of the whole frame
df_encoded.head()


In [None]:
# Save the current `df` under the name "encoded_data_04_24_1".
# NOTE(review): the one-hot columns built in the previous cell live in
# `df_encoded`, not `df`, so they are not part of this file — confirm that is intended.
df_to_csv(df, "encoded_data_04_24_1")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric feature columns to rescale (MinMaxScaler's default range is 0 to 1)
columns_to_normalize = ["goals", "assists", "minutes_played", "height_in_cm", "year", "goals_for", "goals_against", "red_cards", "yellow_cards", "age_at_evaluation"]

# Fit the scaler on these columns and overwrite them with the scaled values
scaler = MinMaxScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

df_to_csv(df, "encoded_data_04_16")

# The preview goes last: a notebook cell only renders its final expression,
# so a bare df.head() in the middle of the cell displays nothing.
df.head()

In [None]:
df = csv_to_df("encoded_data_04_16")

# Keep only the rows whose encoded league value is among the five most frequent.
# NOTE(review): this filters on the target-encoded value, presumably one distinct
# value per league — confirm no two leagues share an encoded value.
top_5_competition_ids = df["domestic_competition_id_encoded"].value_counts().nlargest(5).index
df = df[df["domestic_competition_id_encoded"].isin(top_5_competition_ids)]

# df_to_csv appends ".csv" itself, so the name is passed without the extension
# (passing "….csv" used to create "encoded_data_04_16_top5.csv.csv").
df_to_csv(df, "encoded_data_04_16_top5")

In [None]:
# Continue from the in-memory one-hot encoded frame
# (alternative kept for reference: df = csv_to_df("encoded_data_04_24"))
df = df_encoded

# Source column name -> presentation name. Names that are absent from the
# frame are skipped by rename.
display_names = {
    # player attributes
    "name": "Name",
    "height_in_cm": "Height",
    "age_at_evaluation": "Age",
    "year": "Year",
    # club statistics
    "goals_for": "Club Goals",
    "goals_against": "Club Goals Conceded",
    # player statistics
    "goals": "Goals",
    "assists": "Assists",
    "red_cards": "Red Cards",
    "yellow_cards": "Yellow Cards",
    "minutes_played": "Minutes Played",
    "goals/90": "Goals/Game",
    "assists/90": "Assists/Game",
    # targets
    "market_value_in_eur": "Market Value",
    "log_market_value_base10": "Logarithmic Market Value",
    # target-encoded categoricals
    "country_of_citizenship_encoded": "Country Value",
    "sub_position_encoded": "Position Value",
    "club_id_encoded": "Club Value",
    "domestic_competition_id_encoded": "League Value",
}
# Renamed in place, so `df_encoded` (the same object) sees the new names too
df.rename(columns=display_names, inplace=True)

df_to_csv(df, "encoded_data_beta")
df.head()

In [None]:
# Reload the dataset stored under the name "encoded_data_gamma".
# NOTE(review): df_to_csv writes relative to the working directory while
# csv_to_df reads from ../DataSets/EncodedData/ — confirm the file is available
# at the read location.
df = csv_to_df("encoded_data_gamma")
df.head()

In [None]:
# One-hot position indicator columns expected in the frame
cols_to_convert = ["POS_AM", "POS_CB", "POS_CF", "POS_CM", "POS_DM", "POS_GK", "POS_LB", "POS_LM", "POS_LW", "POS_RB","POS_RM", "POS_RW", "POS_SS"]

# Cast the indicators to integers (True/False become 1/0 when they are boolean)
df[cols_to_convert] = df[cols_to_convert].astype(int)

df_to_csv(df, "encoded_data_int")

# The preview goes last: a notebook cell only renders its final expression,
# so a bare df.head() before the save call displays nothing.
df.head()