In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

def load_team_data():
    # We are going to load the data set of the team and clean this one
    
    # 1. Uploading the data
    df = pd.read_csv('/files/ps3-Athithsko/Projet Ml/ProjetBarca.csv', sep=';', encoding='latin1')

    # 2. Clean the dataset
    
    
    print("=" * 70)
    
    print(f"Dataset Shape: {df.shape[0]} matches, {df.shape[1]} columns")
    print(f"Columns: {df.columns.tolist()}")
    
    """" I wrote this for being sure that the dataset isn't wrong"""
    print("\nFirst 5 matches preview:")
    print(df.head())

    # Date removing
    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y')
    
    # Convert commas to dots for numeric values to avoid issue with float and int
    numeric_columns = ['xG', 'xGA', 'Poss']
    for col in numeric_columns:
        df[col] = df[col].astype(str).str.replace(',', '.').astype(float)

    # Create some target variables
    df['Victory'] = (df['Result'] == 'W').astype(int)
    df['Defeat'] = (df['Result'] == 'L').astype(int)
    df['Draw'] = (df['Result'] == 'D').astype(int)

    print("\n" + "-" * 40)
    print("Results distribution")
    print("-" * 40)
    print(df['Result'].value_counts())
    print(f"Win rate: {(df['Victory'].mean()*100):.1f}%")
    
    
    
    
    return df

load_team_data()



Dataset Shape: 38 matches, 13 columns
Columns: ['Date', 'Time', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Equipe_type']

First 5 matches preview:
         Date   Time        Round  Day  Venue Result  GF  GA        Opponent  \
0  17.08.2024  21:30  Matchweek 1  Sat      1      W   2   1        Valencia   
1  24.08.2024  19:00  Matchweek 2  Sat      0      W   2   1   Athletic Club   
2  27.08.2024  21:30  Matchweek 3  Tue      1      W   2   1  Rayo Vallecano   
3  31.08.2024  17:00  Matchweek 4  Sat      0      W   7   0      Valladolid   
4  15.09.2024  16:15  Matchweek 5  Sun      1      W   4   1          Girona   

    xG  xGA  Poss  Equipe_type  
0  3,2    1    63            0  
1  1,8    1    64            1  
2  1,4  0,4    64            1  
3  4,7  0,5    70            1  
4  1,9  1,3    55            1  

----------------------------------------
Results distribution
----------------------------------------
W    28
L     6
D     4
Name: Re

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Equipe_type,Victory,Defeat,Draw
0,2024-08-17,21:30,Matchweek 1,Sat,1,W,2,1,Valencia,3.2,1.0,63.0,0,1,0,0
1,2024-08-24,19:00,Matchweek 2,Sat,0,W,2,1,Athletic Club,1.8,1.0,64.0,1,1,0,0
2,2024-08-27,21:30,Matchweek 3,Tue,1,W,2,1,Rayo Vallecano,1.4,0.4,64.0,1,1,0,0
3,2024-08-31,17:00,Matchweek 4,Sat,0,W,7,0,Valladolid,4.7,0.5,70.0,1,1,0,0
4,2024-09-15,16:15,Matchweek 5,Sun,1,W,4,1,Girona,1.9,1.3,55.0,1,1,0,0
5,2024-09-22,18:30,Matchweek 6,Sun,1,W,5,1,Villarreal,3.9,2.2,63.0,1,1,0,0
6,2024-09-25,21:00,Matchweek 7,Wed,0,W,1,0,Getafe,1.9,0.7,76.0,0,1,0,0
7,2024-09-28,21:00,Matchweek 8,Sat,1,L,2,4,Osasuna,0.8,2.2,73.0,0,0,1,0
8,2024-10-06,16:15,Matchweek 9,Sun,1,W,3,0,Alavés,2.7,0.7,71.0,1,1,0,0
9,2024-10-20,21:00,Matchweek 10,Sun,0,W,5,1,Sevilla,3.8,1.0,67.0,1,1,0,0
