Importations

In [2]:
from pathlib import Path
import pandas as pd

Cleaning Data 

In [5]:

data_path = Path('../data')
csv_files = sorted(data_path.glob('season-*.csv'))

# Loading data 
dfs_raw = []
for file in csv_files:
    df= pd.read_csv(file)
    dfs_raw.append(df)

df = pd.concat(dfs_raw) #in case of problem we have to use ignore_index=True

# Not necesary 
df.drop(columns=['HTHG', 'HTAG', 'HTR'], inplace=True)

# Converting into appropriate data types
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['FTHG'] = df['FTHG'].astype(int)
df['FTAG'] = df['FTAG'].astype(int)
df['FTR'] = df['FTR'].astype('category')
df['Season'] = df['Season'].astype('category')

# Extracting date features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.dayofweek  # 0 = Lunes
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

# Creating new variables further analysis 
df['TotalGoals'] = df['FTHG'] + df['FTAG']
df['GoalDifference'] = df['FTHG'] - df['FTAG']
df['IsHomeWin'] = (df['FTR'] == 'H').astype(int)
df['IsDraw'] = (df['FTR'] == 'D').astype(int)
df['IsAwayWin'] = (df['FTR'] == 'A').astype(int)

#  Coding teams as categorical variables
df['HomeTeam'] = df['HomeTeam'].astype('category')
df['AwayTeam'] = df['AwayTeam'].astype('category')

# Verifing duplicates 
duplicates = df.duplicated(subset=['Date', 'HomeTeam', 'AwayTeam']).sum()
print(f"Duplicados encontrados: {duplicates}")

df.sort_values('Date', inplace=True)

# Necesary afther sorting 
df.reset_index(drop=True, inplace=True)

# 10. Ver resumen
print(df.info())
print(df.head())

# 11. Guardar (opcional)
df.to_csv('cleaned_football_data.csv', index=False)

Duplicados encontrados: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12324 entries, 0 to 12323
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            12324 non-null  datetime64[ns]
 1   HomeTeam        12324 non-null  category      
 2   AwayTeam        12324 non-null  category      
 3   FTHG            12324 non-null  int64         
 4   FTAG            12324 non-null  int64         
 5   FTR             12324 non-null  category      
 6   Season          12322 non-null  category      
 7   Year            12324 non-null  int32         
 8   Month           12324 non-null  int32         
 9   Day             12324 non-null  int32         
 10  DayOfWeek       12324 non-null  int32         
 11  IsWeekend       12324 non-null  int64         
 12  TotalGoals      12324 non-null  int64         
 13  GoalDifference  12324 non-null  int64         
 14  IsHomeWin       12324 non-nu

  df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
