In [1]:
#Import all relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
 
## This statement allows the visuals to render within your Jupyter Notebook.
%matplotlib inline

In [2]:
# Show up to 22 columns when a dataframe is displayed (the dataset has 22 columns).
pd.options.display.max_columns = 22

## Loading the data
We can now load the dataset into pandas using the read_csv() function. This converts the CSV file into a Pandas dataframe.

In [3]:
#Read in the csv file and convert to a Pandas dataframe

In [4]:
# Read the CSV file (relative path, resolved against the working directory) into a pandas dataframe.
df_football = pd.read_csv('final_data.csv')

### Viewing the dataframe
We can get a quick sense of the size of our dataset by using the shape attribute. This returns a tuple with the number of rows and columns in the dataset.

In [5]:
# The dataset has 10754 rows and 22 columns (see the tuple displayed below).
df_football.shape

(10754, 22)

## 1. Data Profiling:
Data profiling is a comprehensive process of examining the data available in an existing dataset and collecting statistics and information about that data. 

In [6]:
# Preview the first five rows to get a feel for the columns and their values.
df_football.head()

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,/david-de-gea/profil/spieler/59377,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,/jack-butland/profil/spieler/128899,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0
2,/tom-heaton/profil/spieler/34130,Manchester United,Tom Heaton,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,6000000,1,0
3,/lisandro-martinez/profil/spieler/480762,Manchester United,Lisandro Martínez,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,0.0,6408,175,22,9,50000000,50000000,2,0
4,/raphael-varane/profil/spieler/164770,Manchester United,Raphaël Varane,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,0.0,5031,238,51,21,40000000,80000000,2,0


In [7]:
# player: The player's unique identifier or profile link.
# team: The name of the team the player is associated with.
# name: The player's full name.
# position: The playing position of the player (for ex: Goalkeeper, Defender Centre-Back).
# height: The height of the player in centimeters.
# age: The age of the player in years.
# appearance: The number of appearances the player has made.
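# NOTE: the seven match statistics below (goals through clean sheets) hold fractional values in the data
# (e.g. 0.009585 yellow cards for a player with 9390 minutes played, which is about 1 card when scaled by minutes / 90),
# so they appear to be per-90-minutes rates rather than raw counts - confirm against the data source.
# goals: The number of goals scored by the player.
# assists: The number of assists provided by the player.
# yellow cards: The number of yellow cards received by the player.
# second yellow cards: The number of second yellow cards (leading to a red card) received by the player.
# red cards: The number of red cards received by the player.
# goals conceded: The number of goals conceded by the player's team while the player is on the field.
# clean sheets: The number of games in which the player's team did not concede any goals while the player was on the field.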
# minutes played: The total number of minutes the player has played.
# days_injured: The number of days the player has been injured.
# games_injured: The number of games the player missed due to injury.
# award: The number of awards or honors the player has received.
# current_value: The current market value of the player.
# highest_value: The highest recorded market value of the player.
# position_encoded: A numerical encoding of the player's position. This column was probably added earlier by someone else for building an ML model.
# winger: A binary indicator (0 or 1) indicating whether the player is a winger (جناح).

In [8]:
# List the column names of the dataframe.
df_football.columns

Index(['player', 'team', 'name', 'position', 'height', 'age', 'appearance',
       'goals', 'assists', 'yellow cards', 'second yellow cards', 'red cards',
       'goals conceded', 'clean sheets', 'minutes played', 'days_injured',
       'games_injured', 'award', 'current_value', 'highest_value',
       'position_encoded', 'winger'],
      dtype='object')

The process of profiling differs slightly for categorical and numerical variables due to their inherent differences.

**The two main types of data are:**
- Quantitative (numerical) data
- Qualitative (categorical) data

In [87]:
# Inspect the data type of each column to separate numerical from categorical variables.
df_football.dtypes

player                  object
team                    object
name                    object
position                object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
position_encoded         int64
winger                   int64
dtype: object

### Data Quality Checks
Data quality checks involve the process of ensuring that the data is accurate, complete, consistent, relevant, and reliable. 


**Here are typical steps involved in checking data quality:**

#### 1. Reliability:
Evaluate the data's source and collection process to determine its trustworthiness.

In [88]:
# Yes data is reliable since it was scraped from the data source:
# https://www.transfermarkt.com/

#### 2. Timeliness: 
Ensure the data is up-to-date and reflective of the current situation or the period of interest for the analysis.

In [89]:
# The data is considered timely: our objectives do not specify a time period,
# and it was scraped on June 10, 2023

#### 3. Consistency: 

Confirm that the data is consistent within the dataset and across multiple data sources. For example, the same data point should not have different values in different places.


In [90]:
# Yes data is consistent regarding the dataset and source 

#### 4. Relevance: 
Assess whether the data is appropriate and applicable for the intended analysis. Data that is not relevant can skew results and lead to incorrect conclusions.

**Key considerations for relevance include:**

> 1. Sample Appropriateness: Confirm that your data sample aligns with your analysis objectives. For instance, utilizing data from the Northern region will not yield accurate insights for the Western region of the Kingdom.
>
> 2. Variable Selection: Any column that is not relevant to our analysis can be removed using the drop() method. We set the “axis” argument to 1 since we’re dealing with columns; the change can be made permanent either by setting the “inplace” argument to True or by assigning the result back to the dataframe.


In [91]:
# Sample Appropriateness: Yes, the data sample is relevant to our objectives
# Variable Selection: some variables are not needed for our analysis and won't give us any insight:
# 1. player column: the profile links are not useful for the analysis
# 2. position_encoded: an old encoded column; I will recreate it later if I need it.
# 3. winger: this information can be expressed as a value in the position column


In [97]:
# Remove the columns judged irrelevant for the analysis: the profile link,
# the legacy position encoding and the winger flag.
# The result is assigned back instead of using inplace=True, and
# errors='ignore' skips labels that are already absent, so re-running this
# cell after the columns are gone no longer raises a KeyError.
df_football = df_football.drop(
    columns=['player', 'position_encoded', 'winger'],
    errors='ignore',
)

In [98]:
# Confirm the result of the drop: 19 columns remain.
df_football.columns

Index(['team', 'name', 'position', 'height', 'age', 'appearance', 'goals',
       'assists', 'yellow cards', 'second yellow cards', 'red cards',
       'goals conceded', 'clean sheets', 'minutes played', 'days_injured',
       'games_injured', 'award', 'current_value', 'highest_value'],
      dtype='object')

#### 5. Uniqueness: 
Check for and remove duplicate records to prevent skewed analysis results.


In [99]:
# Count fully duplicated rows; the result is 0, so there is nothing to remove.
df_football.duplicated().sum()

0

#### 6. Completeness: 
Ensure that no critical data is missing. This might mean checking for null values or required fields that are empty.

We will start by checking the dataset for missing or null values. For this, we can use the isna() method which returns a dataframe of boolean values indicating if a field is null or not. To group all missing values by column, we can include the sum() method.

In [94]:
#Display number missing values per column

In [100]:
# Count missing values per column; every column reports 0.
df_football.isnull().sum()

team                   0
name                   0
position               0
height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
dtype: int64

In [103]:
# No null values were found, so nothing needs to be imputed. Still, check whether any cell consists only of a placeholder character (' ', '-' or '/') that might stand in for a missing value.
def count_characters(df, characters):
    """Count, per column, the cells whose whole value equals each given character.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    characters : iterable
        Values to look for. Each cell is compared for equality with the
        value, so this is an exact match, not a substring search.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` and one column per entry of
        ``characters``, holding the number of matching cells.
    """
    matches_per_char = {symbol: (df == symbol).sum() for symbol in characters}
    return pd.DataFrame(matches_per_char)

# Characters to look for; presumably placeholders that may stand in for missing values - confirm intent.
special_characters = [' ', '-', '/']

# One row per dataframe column, one column per character.
special_char_counts = count_characters(df_football, special_characters)

# All counts are 0 in the output below; the first header looks blank because it is the space character.
print(special_char_counts)

                        -  /
team                 0  0  0
name                 0  0  0
position             0  0  0
height               0  0  0
age                  0  0  0
appearance           0  0  0
goals                0  0  0
assists              0  0  0
yellow cards         0  0  0
second yellow cards  0  0  0
red cards            0  0  0
goals conceded       0  0  0
clean sheets         0  0  0
minutes played       0  0  0
days_injured         0  0  0
games_injured        0  0  0
award                0  0  0
current_value        0  0  0
highest_value        0  0  0


#### 7. Check Accuracy:

Verify that the data is correct and precise. This could involve comparing data samples with known sources or using validation rules.

**The process includes:**
1. Validating the appropriateness of data types for the dataset.
2. Identifying outliers using established validation rules.

In [104]:
# check column types
# some columns are float although they should be int, but since we are going to build an ML model for
# our objective question, we keep them as float so they can be scaled later (although they already look scaled)
df_football.dtypes

team                    object
name                    object
position                object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
dtype: object

In [106]:
# Look at the first five rows again, now with the 19 remaining columns.
df_football.head()

Unnamed: 0,team,name,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value
0,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,70000000
1,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000
2,Manchester United,Tom Heaton,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,6000000
3,Manchester United,Lisandro Martínez,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,0.0,6408,175,22,9,50000000,50000000
4,Manchester United,Raphaël Varane,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,0.0,5031,238,51,21,40000000,80000000


In [None]:
# Goalkeepers (1229) and defenders (3495) together make up about 44% of the 10754 players,
# so we might split our analysis and models into two groups

In [11]:
# Number of goalkeepers: sum the boolean mask so that every matching row is
# counted. DataFrame.value_counts() counts unique rows and, by default,
# leaves out rows containing a missing value, so it is not a reliable row count.
(df_football['position'] == 'Goalkeeper').sum()

1229

In [12]:
# Number of defenders across the three defender positions. isin() replaces the
# chain of == / | comparisons, and summing the mask counts every matching row
# (DataFrame.value_counts() would skip rows containing a missing value).
defender_positions = ['Defender Centre-Back', 'Defender Left-Back', 'Defender Right-Back']
df_football['position'].isin(defender_positions).sum()

3495

## Outliers Check

In [None]:
# go to univariate graphical analysis
# go to lesson : data visualisation 1 - chart type section
# then go to univariate graphical analysis
# detect outliers using graphs varbaly

In [None]:
# go to lesson: statistics 1 then statistics 3
# then go to univariate Non graphical analysis
# detect outliers using numerical statistics 