                                    ANALYZING FOOTBALL PLAYERS DATASET

First steps: 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Location of the FIFA players CSV export.
# NOTE: this is a machine-specific absolute path; point DATA_PATH at your own
# copy of the file before running the notebook on another machine.
DATA_PATH = 'C:/Users/alain/Downloads/football_data/fifa_players.csv'
data = pd.read_csv(DATA_PATH)

SEPARATOR = '--------' * 8  # visual divider between the printed summaries

print(data.shape)
print(data.index)
print(data.columns)
print(SEPARATOR)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
print(SEPARATOR)
print(data.isna().sum())  # number of NaN values per column
print(SEPARATOR)
print(data.describe())
data.head()

Data Cleaning

In [None]:
# Columns that are not used in the analysis below.
unused_columns = [
    'full_name', 'birth_date', 'international_reputation(1-5)', 'weak_foot(1-5)',
    'skill_moves(1-5)', 'body_type', 'aggression',
]

# Build the working frame in one chain, leaving `data` untouched:
#   1. work on a copy of the raw data,
#   2. delete every column that contains at least one NaN,
#   3. remove the unused columns,
#   4. use the 'name' column as the index.
players = (
    data.copy()
    .dropna(axis=1)
    .drop(unused_columns, axis=1)
    .set_index('name')
)

print(players.shape)
players.info()

In [None]:
#players.columns

In [None]:
# Two composite columns (attacking_ability and defensive_ability) are added to
# `players`. Each one is a weighted sum of individual skills; the weights of
# each composite add up to 1.
attack_weighting = {
    'finishing': 0.2,
    'heading_accuracy': 0.15,
    'volleys': 0.15,
    'dribbling': 0.2,
    'shot_power': 0.2,
    'long_shots': 0.1,
}
defense_weighting = {
    'interceptions': 0.2,
    'marking': 0.2,
    'standing_tackle': 0.25,
    'sliding_tackle': 0.25,
    'heading_accuracy': 0.1,
}

# Skill columns and their weights, kept in matching order.
skills_attack = players[list(attack_weighting)]
weights_attack = list(attack_weighting.values())
skills_defense = players[list(defense_weighting)]
weights_defense = list(defense_weighting.values())

# Scale every skill column by its weight, then add the columns up row by row.
players['attacking_ability'] = skills_attack.mul(weights_attack, axis=1).sum(axis=1)
players['defensive_ability'] = skills_defense.mul(weights_defense, axis=1).sum(axis=1)
players.head()

CORRELATION BETWEEN AGE AND DIFFERENT ABILITIES

In [None]:
# Correlate age with a selection of abilities.
main_column = 'age'
other_columns = [
    'volleys', 'sprint_speed', 'stamina', 'acceleration', 'dribbling', 'agility',
    'reactions', 'balance', 'jumping', 'interceptions', 'composure', 'vision',
]
selected_columns = [main_column, *other_columns]

correlation_matrix = players[selected_columns].corr()
# Boolean mask covering the upper triangle (diagonal included); masked cells
# are left blank in the heatmap.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    cbar=True,
    square=True,
    cbar_kws={'shrink': .8},
    ax=ax,
)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Age vs Abilities')
fig.tight_layout()
fig.savefig('age_vs_abilities')
plt.show()

CORRELATION BETWEEN DRIBBLING AND BALL CONTROL

In [None]:
# Correlation between dribbling and ball control, shown two ways side by side.
skills_data = players[['dribbling', 'ball_control']]
correlation_matrix = skills_data.corr()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
heatmap_ax, scatter_ax = axes

# Left panel: annotated correlation heatmap.
sns.heatmap(correlation_matrix, annot=True, cmap='vlag', cbar=True, ax=heatmap_ax)
heatmap_ax.set_title('Dribbling vs. Ball Control')
# Re-apply the y tick labels with a 25 degree rotation.
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), rotation=25)

# Right panel: one point per player.
sns.scatterplot(data=players, x='dribbling', y='ball_control', ax=scatter_ax)
scatter_ax.set_title('Dribbling vs. Ball Control')

fig.tight_layout()  # keep the two panels from overlapping
fig.savefig('dribling_ball_control')
plt.show()

CORRELATION BETWEEN HEIGHT, STRENGTH AND HEADING ACCURACY

In [None]:
# Figure 1: correlation matrix of height, strength and heading accuracy.
physical_columns = ['height_cm', 'strength', 'heading_accuracy']
correlation_matrix = players[physical_columns].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='Greens', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.setp(ax.get_yticklabels(), rotation=45)
fig.savefig('height_strength_heading_accuracy')
plt.show()

# Figure 2: heading accuracy plotted against height (left) and strength (right).
fig, (height_ax, strength_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

sns.scatterplot(data=players, x='height_cm', y='heading_accuracy', ax=height_ax)
height_ax.set_title('Height vs. Heading Accuracy')

sns.scatterplot(data=players, x='strength', y='heading_accuracy', ax=strength_ax)
strength_ax.set_title('Strength vs. Heading Accuracy')

fig.tight_layout()
fig.savefig('strength_height_head_accuracy')
plt.show()

In [None]:
#players.columns

TOP 10 ATTACKING PLAYERS

In [None]:
# The ten players with the highest attacking_ability, best first.
ranked_by_attack = players.sort_values(by='attacking_ability', ascending=False)
top_attackers = ranked_by_attack.head(10)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(top_attackers.index, top_attackers['attacking_ability'], color='Orange')
ax.set(title='Top 10 Attackers', xlabel='Player', ylabel='Attacking ability')
# Tilt the player names so they do not run into each other.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('top_attackers')
plt.show()

In [None]:
# Stacked bar chart for the top attackers: attacking ability at the bottom,
# defensive ability stacked on top of it.
attacker_names = top_attackers.index
attack_scores = top_attackers['attacking_ability']
defense_scores = top_attackers['defensive_ability']

fig, ax = plt.subplots(figsize=(12, 6))

# Bottom segment.
bar1 = ax.bar(attacker_names, attack_scores, color='Orange', label='Attacking Ability')
ax.bar_label(bar1, padding=-40)

# Top segment, starting where the attacking segment ends.
bar2 = ax.bar(attacker_names, defense_scores, bottom=attack_scores, color='Blue', label='Defensive Ability')
# NOTE(review): with bar_label's default 'edge' label type the text shows the
# end-point of the segment, i.e. attack + defense rather than defense alone -
# confirm this is the intended label.
ax.bar_label(bar2, padding=-20)

ax.set(
    title='Top 10 Attackers: Combined Abilities',
    xlabel='Player',
    ylabel='Total Ability Score',
)

# One tick per bar, labelled with the player name.
ax.set_xticks(range(len(attacker_names)))
ax.set_xticklabels(attacker_names, rotation=45, ha='right')

# The legend identifies the two colors.
ax.legend()

fig.tight_layout()
fig.savefig('top_attack_combined')
plt.show()

TOP 10 DEFENDERS

In [None]:
# The ten players with the highest defensive_ability, best first.
ranked_by_defense = players.sort_values(by='defensive_ability', ascending=False)
top_defenders = ranked_by_defense.head(10)

fig, ax = plt.subplots(figsize=(8, 4))
bar = ax.bar(top_defenders.index, top_defenders['defensive_ability'], color='Green')
ax.bar_label(bar, padding=-40)  # negative padding moves the value label down from the bar end
ax.set(title='Top 10 Defenders', xlabel='Player', ylabel='Defensive ability')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('top_defenders')
plt.show()

AGE COMPARISON BETWEEN ATTACKERS AND DEFENDERS

In [None]:
# Mean age of the top 10 attackers (printed) and of the top 10 defenders
# (displayed as the cell's result).
attackers_mean_age = top_attackers['age'].mean()
defenders_mean_age = top_defenders['age'].mean()
print(attackers_mean_age)
defenders_mean_age

TOP 10 GOALKEEPERS

In [None]:
# Players whose 'positions' value is exactly 'GK', ranked by overall rating.
is_goalkeeper = players['positions'].eq('GK')
goalkeeper = players[is_goalkeeper]
top_gk = goalkeeper.sort_values(by='overall_rating', ascending=False).head(10)

fig, ax = plt.subplots(figsize=(8, 4))
bar = ax.bar(top_gk.index, top_gk['overall_rating'], color='Purple')
ax.bar_label(bar, padding=-40)
ax.set(title='Top 10 Goalkeepers', xlabel='Player', ylabel='Overall rating')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('top_gk')
plt.show()

TOP SPRINT SPEED AND ACCELERATION

In [None]:
# The 15 highest-rated players for sprint speed and for acceleration.
top_speed = players.sort_values(by='sprint_speed', ascending=False).head(15)
top_accel = players.sort_values(by='acceleration', ascending=False).head(15)

# One bar chart per ranking: (frame, column, bar color, title, y label, output file).
speed_charts = [
    (top_speed, 'sprint_speed', 'Green', 'Top 15 Fastest Sprinters', 'Sprint speed', 'top_speed'),
    (top_accel, 'acceleration', 'Purple', 'Top 15 Fastest Accelerations', 'Acceleration', 'top_accel'),
]

for ranking, column, color, title, ylabel, filename in speed_charts:
    fig, ax = plt.subplots(figsize=(10, 4))
    bars = ax.bar(ranking.index, ranking[column], color=color)
    ax.bar_label(bars, padding=-40)
    ax.set(title=title, xlabel='Player', ylabel=ylabel)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    fig.savefig(filename)

# Both figures are shown together once they have been saved.
plt.show()

OVERALL RATING OF LEFT/RIGHT-FOOTED PLAYERS BY NATIONALITY

In [None]:
# Nationalities to compare. The order here is the order in which the
# per-nation rows are concatenated below.
nations = ['Argentina', 'Spain', 'France', 'Portugal', 'Brazil', 'England']


def _nation_split(frame, nation):
    """Return (all rows, left-footed rows, right-footed rows) of `frame` for one nationality.

    `frame` must have 'nationality' and 'preferred_foot' columns; rows keep
    their original order.
    """
    squad = frame[frame['nationality'] == nation]
    left = squad[squad['preferred_foot'] == 'Left']
    right = squad[squad['preferred_foot'] == 'Right']
    return squad, left, right


# Split every nation once, then regroup the results into three parallel
# tuples: whole squads, left-footed players, right-footed players.
squads, left_parts, right_parts = zip(*(_nation_split(players, nation) for nation in nations))

# Keep the individual per-nation frames available under their usual names.
argentina, spain, france, portugal, brazil, england = squads
arg_left, spain_left, france_left, portugal_left, brazil_left, england_left = left_parts
arg_right, spain_right, france_right, portugal_right, brazil_right, england_right = right_parts

# Stack the per-nation frames row-wise, one combined frame per preferred foot.
left_footed_players = pd.concat(list(left_parts), axis=0)
right_footed_players = pd.concat(list(right_parts), axis=0)

In [None]:
# One box plot of overall rating per nationality for each preferred foot:
# (frame, title, output file).
foot_charts = [
    (left_footed_players, 'Overall Rating of Left-Footed Players by Nationality', 'overall_rating_left_foot'),
    (right_footed_players, 'Overall Rating of Right-Footed Players by Nationality', 'overall_rating_right_foot'),
]

for footed_players, title, filename in foot_charts:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='nationality', y='overall_rating', data=footed_players, ax=ax)
    ax.set(title=title, xlabel='Nationality', ylabel='Overall Rating')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()