In [9]:
import pandas as pd

# Read the 2020-2021 season file into a DataFrame.
df = pd.read_csv('EPL-season-2020-2021.csv')

# Lift pandas' display limits for the rest of the session:
#   max_columns = None  -> no columns are elided when a frame is printed
#   max_colwidth = None -> cell contents are not truncated
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Show every column label as a plain Python list.
column_names = df.columns.to_list()
print(column_names)


In [10]:
import pandas as pd

# Load one DataFrame per season. The per-season names stay at module level so
# that other cells can still refer to an individual season.
df_2020_2021 = pd.read_csv('EPL-season-2020-2021.csv')
df_2021_2022 = pd.read_csv('EPL-season-2021-2022.csv')
df_2022_2023 = pd.read_csv('EPL-season-2022-2023.csv')
df_2023_2024 = pd.read_csv('EPL-season-2023-2024.csv')
df_2024_2025 = pd.read_csv('EPL-season-2024-2025.csv')

# Season label -> frame, in chronological order (dicts preserve insertion order).
season_frames = {
    '2020-2021': df_2020_2021,
    '2021-2022': df_2021_2022,
    '2022-2023': df_2022_2023,
    '2023-2024': df_2023_2024,
    '2024-2025': df_2024_2025,
}

# pd.concat aligns frames on column labels with an outer join by default, so a
# column that is absent from one season is silently filled with NaN for that
# season's rows. Report any schema difference instead of merely assuming the
# files agree; the first season is used as the reference.
reference_season, reference_frame = next(iter(season_frames.items()))
reference_columns = set(reference_frame.columns)
for season, frame in season_frames.items():
    season_columns = set(frame.columns)
    missing = sorted(reference_columns - season_columns)
    extra = sorted(season_columns - reference_columns)
    if missing or extra:
        print(
            f"Warning: columns in season {season} differ from {reference_season}: "
            f"missing={missing}, extra={extra}"
        )

# Stack all seasons into one frame with a fresh 0..n-1 index.
df_all_seasons = pd.concat(list(season_frames.values()), ignore_index=True)

# Check the first few rows of the combined data
print(df_all_seasons.head())

# Save the combined dataframe to a new CSV file (without the index column)
df_all_seasons.to_csv('EPL_all_seasons_combined.csv', index=False)


In [11]:
import pandas as pd

# Build a hard-coded five-row sample of match records (nothing is read from
# disk here) and run it through the cleaning steps below.
df = pd.DataFrame({
    "Div": ["E0", "E0", "E0", "E0", "E0"],
    "Date": ["12/9/2020", "12/9/2020", "12/9/2020", "12/9/2020", "13/09/2020"],
    "Time": ["12:30", "15:00", "17:30", "20:00", "14:00"],
    "HomeTeam": ["Fulham", "Crystal Palace", "Liverpool", "West Ham", "West Brom"],
    "AwayTeam": ["Arsenal", "Southampton", "Leeds", "Newcastle", "Leicester"],
    "FTHG": [0, 1, 4, 0, 0],
    "FTAG": [3, 0, 3, 2, 3],
    "FTR": ["A", "H", "H", "A", "A"],
    "HTHG": [0, 1, 3, 0, 0],
    "HTAG": [1, 0, 2, 0, 0],
    "HTR": ["A", "H", "H", "D", "D"],
    "Referee": ["C Kavanagh", "J Moss", "M Oliver", "S Attwell", "A Taylor"],
    "HS": [5, 5, 22, 15, 7],
    "AS": [13, 9, 6, 15, 13],
    "HST": [2, 3, 6, 3, 1],
    "AST": [6, 5, 3, 2, 7],
    "HF": [12, 14, 9, 13, 12],
    "AF": [12, 11, 6, 7, 9],
    "HC": [2, 7, 9, 8, 2],
    "AC": [3, 3, 0, 7, 5],
    "HY": [2, 2, 1, 2, 1],
    "AY": [2, 1, 0, 2, 1],
    "HR": [0, 0, 0, 0, 0],
    "AR": [0, 0, 0, 0, 0],
    "B365H": [6.00, 3.10, 1.28, 2.15, 3.80],
    "B365D": [4.33, 3.25, 6.00, 3.40, 3.60],
    "B365A": [1.53, 2.37, 9.50, 3.40, 1.95],
    "BWH": [5.50, 3.00, 1.26, 2.15, 3.70],
    "BWD": [4.25, 3.20, 6.25, 3.40, 3.60],
    "BWA": [1.57, 2.45, 10.50, 3.40, 2.00],
    "IWH": [6.00, 3.15, 1.35, 2.15, 3.85],
    "IWD": [3.90, 2.95, 5.00, 3.15, 3.20],
    "IWA": [1.57, 2.40, 8.50, 3.40, 2.00]
})

# Columns that are not necessary for this analysis
columns_to_drop = ['HR', 'AR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY']

# Cleaning pipeline. Each step returns a new frame, so re-running the cell
# always starts from the freshly built sample above.
df = (
    df
    # Dates are day-first ("13/09/2020"); the explicit format avoids any
    # month/day ambiguity. Non-zero-padded months ("12/9/2020") are accepted.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%d/%m/%Y'))
    .drop(columns=columns_to_drop)
    # Forward-fill gaps from the previous row. `.ffill()` replaces the
    # deprecated `fillna(method='ffill')`. The sample has no missing values,
    # so this step changes nothing here.
    # NOTE(review): copying the previous row's values into a gap may not be
    # appropriate for real match data - confirm before reusing on full files.
    .ffill()
    # Normalize column names to lowercase
    .rename(columns=str.lower)
    # Remove fully duplicated rows
    .drop_duplicates()
)

# Save the cleaned dataframe to a new CSV file
df.to_csv('cleaned_data.csv', index=False)

# Notify the user
print("The cleaned data has been saved to 'cleaned_data.csv'.")


# Display cleaned data
print(df.head())


