## Import modules

In [None]:
# Import modules
import pandas as pd
import numpy as np
import os
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and deprecation
# warnings raised by later cells - confirm that is intended.
warnings.filterwarnings("ignore")


# Status message shown once the imports above have completed.
print("Packages installed")

## 

## Read CSV

In [None]:
# Load every file found under the raw-data directory into its own DataFrame.
# Each frame is appended to `dataframes` and a copy is also published as a
# module-level variable named "<file stem>_df" (e.g. "games.csv" -> games_df),
# because later cells refer to those names directly.
dataframes = []
for dirname, _, filenames in os.walk('../DataSets/RawData/'):
    for filename in filenames:
        # Text before the first dot, so "games.csv" becomes "games_df".
        file = filename.split('.')[0] + "_df"
        # Names starting with a dot (hidden files) have an empty stem: skip them.
        if file != "_df":
            filepath = os.path.join(dirname, filename)
            df = pd.read_csv(filepath, sep=",", encoding="UTF-8")
            # Bind the variable through the namespace dict rather than exec():
            # no source code is built from file names, so a name that is not a
            # valid identifier can neither raise a SyntaxError nor run as code.
            globals()[file] = df.copy()
            print(file, df.shape)
            dataframes.append(df)
print('Data imported')  # Reading all files takes roughly 10 seconds.

## Date functions

In [None]:
# Half-seasons are encoded as a fractional year: .25 = spring season, .75 = autumn season.
def date_to_year(df):
    """Add a half-season ``year`` and a calendar ``month`` column to *df*.

    The ``date`` column is parsed with ``pd.to_datetime``. Months 1-6 become
    ``<calendar year> + 0.25``; all other months become
    ``<calendar year> + 0.75``. *df* is modified in place and also returned.
    """
    parsed_dates = pd.to_datetime(df["date"])
    df["year"] = parsed_dates.dt.year
    df["month"] = parsed_dates.dt.month

    # Pick the fractional offset per row, then add it to the calendar year.
    season_offset = np.where(df["month"] <= 6, 0.25, 0.75)
    df["year"] = df["year"].to_numpy() + season_offset
    return df

def valuation_date_to_year(df):
    """Add ``year``, ``month`` and ``semester`` columns for valuation dates.

    The ``date`` column is parsed with ``pd.to_datetime`` and mapped by
    calendar month:

    * March-August       -> ``"Spring"``, ``year + 0.25``
    * September-December -> ``"Fall"``, ``year + 0.75``
    * January-February   -> ``"Fall"`` of the previous year,
      ``(year - 1) + 0.75``

    *df* is modified in place and also returned.
    """
    parsed_dates = pd.to_datetime(df["date"])
    df["year"] = parsed_dates.dt.year
    df["month"] = parsed_dates.dt.month

    conditions = [
        (df['month'] > 2) & (df['month'] < 9),
        (df['month'] >= 9),
        (df['month'] < 3),
    ]
    choices = ['Spring', 'Fall', 'Fall']

    # Rows with a missing month match no condition and fall back to the
    # default. It is passed explicitly as a string: the implicit integer
    # default 0 cannot be combined with string choices on recent NumPy
    # versions (TypeError), and '0' is the value older versions produced.
    df['semester'] = np.select(conditions, choices, default='0')

    # January/February valuations belong to the autumn season that started in
    # the previous calendar year.
    df.loc[(df['month'] < 3), 'year'] -= 1
    df["year"] = np.where(df["semester"] == "Spring", df["year"] + 0.25, df["year"] + 0.75)

    return df

def calculate_age(row):
    """Return the age in completed years at ``row['date']``.

    Args:
        row: Mapping or Series with ``'date_of_birth'`` and ``'date'`` entries
            that ``pd.to_datetime`` can parse.

    Returns:
        The difference in calendar years, minus one when the birthday has not
        yet been reached in the year of ``date``.
    """
    date_of_birth = pd.to_datetime(row['date_of_birth'])
    date = pd.to_datetime(row['date'])
    # Compare against the birth month/day. The previous code compared
    # (date.month, date.day) with itself, which is always False, so the age
    # was one year too high for dates before the birthday.
    birthday_pending = (date.month, date.day) < (date_of_birth.month, date_of_birth.day)
    age = date.year - date_of_birth.year - birthday_pending
    return age

## 1. Games & Appearances

In [None]:
# Keep only the columns used by the later cells.
games_df = games_df[["game_id", "date", "home_club_id", "away_club_id", "home_club_goals", "away_club_goals"]]
appearances_df = appearances_df[["game_id", "player_id", "player_club_id", "yellow_cards", "red_cards", "goals", "assists", "minutes_played"]]

# One row per player appearance, carrying the match date and final score.
games_df = pd.merge(games_df, appearances_df, on="game_id")

# Goals scored and conceded by the player's own club in that match.
# Vectorised with np.where instead of a row-wise apply(axis=1): same values,
# far less per-row overhead, and it also works when the merge yields no rows.
# As before, a club id that is not the home club is treated as the away club.
played_at_home = games_df['home_club_id'] == games_df['player_club_id']
games_df['goals_for'] = np.where(played_at_home, games_df['home_club_goals'], games_df['away_club_goals'])
games_df['goals_against'] = np.where(played_at_home, games_df['away_club_goals'], games_df['home_club_goals'])

# Add the half-season "year" and the "month" columns derived from the match date.
games_df = date_to_year(games_df)
games_df.head()

## 2. Player performance

In [None]:
# Reduce the appearance rows to the statistics of interest, then total them
# per player, club and half-season.
group_keys = ['player_id', 'player_club_id', 'year']
summed_stats = ['goals_for', 'goals_against', 'goals', 'assists', 'red_cards', 'yellow_cards', 'minutes_played']

games_df = games_df[["player_id", "player_club_id", "yellow_cards", "red_cards", "goals", "assists", "minutes_played", "goals_for", "goals_against", "year"]]

# Every statistic is aggregated with a plain sum; the group keys come back as
# ordinary columns after reset_index().
sum_spec = dict.fromkeys(summed_stats, 'sum')
player_performance_df = games_df.groupby(group_keys).agg(sum_spec).reset_index()

player_performance_df.head()

## 

## 3. Player valuation

In [None]:
# Tag each valuation with its half-season and keep only the columns needed.
valuations_with_season = valuation_date_to_year(player_valuations_df)
player_valuations_df = valuations_with_season[["player_id", "market_value_in_eur", "year", "date"]]

# Attach valuations to the performance rows of the same player and
# half-season. Sorting with the newest date first and then dropping
# duplicates on ('player_id', 'year') keeps the row with the most recent
# valuation date for each player and half-season.
players_performance_value_df = (
    player_performance_df
    .merge(player_valuations_df, on=["player_id", "year"])
    .sort_values(by=['player_id', 'year', 'date'], ascending=[True, True, False])
    .drop_duplicates(subset=['player_id', 'year'], keep='first')
)
players_performance_value_df.head()

## 4. Player Characteristics

In [None]:
# Static player attributes. An explicit copy makes the frame independent of
# players_df so the assignments below are well defined.
players_characteristics_df = players_df[['player_id', 'name', 'country_of_birth', 'country_of_citizenship','date_of_birth', 'height_in_cm', 'sub_position']].copy()

# Fall back to the country of citizenship when the country of birth is missing.
# Assigned back explicitly: calling fillna(inplace=True) on a selected column
# is chained assignment, which leaves the frame unchanged under pandas
# Copy-on-Write.
players_characteristics_df['country_of_birth'] = players_characteristics_df['country_of_birth'].fillna(
    players_characteristics_df['country_of_citizenship']
)

# Drop players that still have any missing attribute.
players_characteristics_df = players_characteristics_df.dropna()
players_characteristics_df.head()

## 5. Combine and Calculate Age

In [None]:
# Left-join the performance/valuation rows onto the player attributes, then
# discard every row that has a missing value in any column (this also removes
# players without a matching performance row).
total_df = players_characteristics_df.merge(
    players_performance_value_df, on='player_id', how='left'
).dropna()

# Age calculation is currently disabled:
#total_df['age_at_evaluation'] = total_df.apply(calculate_age, axis=1)
total_df.head()

## 6. Add league as feature

In [None]:
# Alternative input kept for reference:
#df_player = pd.read_csv('/work/cleaned_data_28.csv', sep=",", encoding="UTF-8")

# Player rows, with the club key renamed to match the clubs table.
df_player = total_df.rename(columns={'player_club_id': 'club_id'})

# Only the club id and its domestic competition id are needed from the clubs file.
df_club = pd.read_csv('../DataSets/RawData/clubs.csv', sep=",", encoding="UTF-8")[["club_id", "domestic_competition_id"]]

# Left join keeps every player row even if the club is not in the clubs file.
df = df_player.merge(df_club, on='club_id', how="left")
df.head()


## Log-transform market value

In [None]:
# Store the base-10 logarithm of the market value in its own column.
# NOTE(review): a market value of 0 would give -inf here - confirm the data
# contains no zeros.
market_value = df['market_value_in_eur']
df['log_market_value_base10'] = np.log10(market_value)


In [None]:
# Write the combined frame to "csv3.csv" in the working directory, without the index column.
df.to_csv("csv3.csv", index=False)

In [None]:
# Read "csv3.csv" back from the working directory and preview the first rows.
df = pd.read_csv("csv3.csv")
df.head()

In [None]:
import pandas as pd

# Load the cleaned data set and derive per-90-minute and goal-difference features.
filename = "cleaned_data.csv"
df = pd.read_csv(f"../DataSets/EncodedData/{filename}", sep=",", encoding="UTF-8")

# Scale the counts to a 90-minute rate (count * 90 / minutes played).
# NOTE(review): rows with 0 minutes played would yield NaN/inf - confirm the
# input has none.
minutes = df["minutes_played"]
df["goals/90"] = df["goals"].mul(90).div(minutes)
df["assists/90"] = df["assists"].mul(90).div(minutes)

# Goals scored minus goals conceded by the player's club.
df["Goal Difference"] = df["goals_for"].sub(df["goals_against"])

df.head()

## Write to CSV

In [None]:
# Save the frame with the derived features to the encoded-data folder, without the index column.
df.to_csv("../DataSets/EncodedData/cleaned_data_04_24.csv", index=False)
