# <p style="font-family: Arial; font-size:2.2em;color:blue;"> Lionel Messi - Career Goals </p>


![](https://static01.nyt.com/images/2019/04/16/sports/16onsoccerweb-2/merlin_153612873_5bb119b9-8972-4087-b4fd-371cab8c5ba2-superJumbo.jpg?quality=90&auto=webp)

# Data Sourcing - Let's read!
1. Importing necessary libraries
2. Reading the files
3. Understanding the basic information

In [None]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# read the Messi goals CSV into a DataFrame and preview its first rows
messi=pd.read_csv('../input/messi-dataset/lionel_messi_goals.csv')
messi.head()

In [None]:
# checking total no. of rows and columns present in the original dataset
messi.shape

In [None]:
# getting the non-null count and datatype of every column
messi.info()

In [None]:
# percentage of null values in each column, rounded to 2 decimal places
round(messi.isnull().sum()*100/len(messi),2)

**Note:**
* Around 48% of data is missing from column *goal_type*. It would be better to drop this column.
* Same goes with the column *competition_name* as it also has 19% missing values.

In [None]:
# number of rows for each goal_type value (value_counts skips NaN by default)
messi.goal_type.value_counts()

In [None]:
# number of rows for each competition_name value (NaN entries are not counted)
messi.competition_name.value_counts()

**Note:**
* As we know club games are popular in European football, Messi has scored 409 goals in LaLiga.

In [None]:
# show the rows whose goal_type is missing
messi[messi.goal_type.isnull()]

In [None]:
# show the rows whose competition_name is missing
messi[messi.competition_name.isnull()]

# Data Cleaning - Let's fix it!
1. Imputing the missing values
2. Removing unimportant columns

In [None]:
# competition_name and goal_type are dropped because of their missing values;
# each column is removed with its own in-place drop
for sparse_column in ('competition_name', 'goal_type'):
    messi.drop(columns=sparse_column, inplace=True)

In [None]:
# percentage of null values per column, rounded to 2 decimal places
round(messi.isnull().sum()*100/len(messi),2)

In [None]:
# number of rows for each home_team value
messi.home_team.value_counts()

In [None]:
# number of rows for each away_team value
messi.away_team.value_counts()

In [None]:
# number of rows for each distinct minute value
messi.minute.value_counts()

In [None]:
# creating a new column named goal_halves
def get_half(x):
    """Classify a minute string as 'extra_time', 'first_half' or 'second_half'.

    Any value containing '+' (e.g. '45+2') is labelled 'extra_time'.
    Otherwise the value is parsed as an int: below 46 is 'first_half',
    everything else is 'second_half'.
    """
    if '+' in x:
        return 'extra_time'
    return 'first_half' if int(x) < 46 else 'second_half'

# label every row with the half derived from its minute string
messi['goal_halves'] = messi['minute'].apply(get_half)
messi.head()

In [None]:
# transforming items in minute column from containing "+" into its real sum
def transform(y):
    """Convert a minute string to an int, summing the two numbers around '+'.

    '90+3' becomes 93; a string without '+' such as '57' becomes 57.
    """
    if '+' not in y:
        return int(y)
    # only the first two '+'-separated parts are added together
    base, added = y.split('+')[:2]
    return int(base) + int(added)
    
# Replace each minute string with its integer total.
# Series.astype returns a new Series, so its result is assigned back; calling
# it as a bare statement (as before) left the column's dtype untouched.
messi['minute'] = messi['minute'].apply(transform).astype(int)
messi.head()

In [None]:
# Adding a new column named win_not_win containing only two categories win and draw/loss
def win_not_win(z):
    """Return 'won' when the result string contains 'Won', else 'draw/loss'."""
    return "won" if 'Won' in z else "draw/loss"
# collapse final_game_result into the two categories produced by win_not_win()
messi['win_not_win'] = messi['final_game_result'].apply(win_not_win)
messi.head()

In [None]:
# number of rows in each win_not_win category ("won" vs "draw/loss")
messi.win_not_win.value_counts()

# Data Analysis - Let's visualize!

#### We will plot charts showing how different factors of Messi's performance in the above dataset relate to the final game result

In [None]:
import warnings
# NOTE(review): blanket suppression is kept so later cells behave as before,
# but it also hides genuine deprecation warnings.
warnings.filterwarnings('ignore')

import seaborn as sns


def _annotate_counts(ax, fontsize=50, offset=0.75):
    """Write the count above every visible bar of a count plot.

    Patches whose height is NaN or zero are skipped: they are not visible
    bars (some seaborn versions may add such placeholder patches), so a
    label there would only be noise.
    """
    for rect in ax.patches:
        height = rect.get_height()
        if pd.isna(height) or height == 0:
            continue
        ax.text(
            rect.get_x() + rect.get_width() / 2,
            height + offset,
            f"{int(height)}",
            horizontalalignment='center',
            fontsize=fontsize,
        )


# (x column, hue column or None, x-axis label) for each panel, in grid order
panels = [
    ("win_not_win", None, "Win and Draw/Loss"),
    ("final_game_result", None, "Final Result"),
    ("goal_method", "final_game_result", "Goal Methods"),
    ("goal_halves", "final_game_result", "Goal Halves"),
]

plt.figure(figsize=(100,100))

sns.set(font_scale = 5)

for position, (column, hue, label) in enumerate(panels, start=1):
    ax = plt.subplot(3, 2, position)
    # data/x/hue are passed by keyword: newer seaborn releases no longer
    # accept the plotted vector as a positional argument
    sns.countplot(data=messi, x=column, hue=hue, ax=ax)
    ax.set(xlabel = label)
    _annotate_counts(ax)
    if hue is not None:
        ax.legend(loc='upper right')

plt.show()

#### Observations:

* In the matches of his career in which Messi scored, he has **won 586 matches**, **drawn 47 matches** and **lost 18 matches** out of **651 matches** while playing for his club and country.
* The winning percentage is higher when he scores in the second half than when he scores in the first half.
* In most of the matches Messi scores with his left foot, and the winning percentage is **90 percent**.
* Whenever he scores with his right foot or with a header, Barcelona always wins.



## Let's try to evaluate the pattern of scoring in different halves against different opponents

In [None]:
national_teams  = ['Argentina']
club_teams  = ['FC Barcelona']

# rows in which the club / the national side is either the home or the away team
messi_club = messi[messi["home_team"].isin(club_teams) | messi["away_team"].isin(club_teams)]
messi_national = messi[messi["home_team"].isin(national_teams)| messi["away_team"].isin(national_teams)]

# ten most frequent opponents over the whole dataset
df_top_performances = messi.opponent.value_counts().head(10).to_frame().reset_index()
df_top_performances.columns = ['opponents', 'scored_in_matches']
listoftopteams = df_top_performances['opponents'].tolist()

# ten most frequent opponents within the national-team rows
dfp = messi_national.opponent.value_counts().head(10).to_frame().reset_index()
dfp.columns = ['opponent', 'scored_in_matches']
listoftopnationalteams = dfp['opponent'].tolist()


def _plot_halves_by_opponent(position, goals, opponents, title):
    """Draw one panel: rows per opponent, split by goal_halves.

    position  -- index of the panel in the 2x1 subplot grid
    goals     -- DataFrame with 'opponent' and 'goal_halves' columns
    opponents -- opponent names to keep
    title     -- panel title
    """
    ax = plt.subplot(2, 1, position)
    sns.set(font_scale = 4)
    # filter once and hand seaborn a single frame, so y and hue always refer
    # to the same rows instead of relying on index alignment of two vectors
    subset = goals[goals.opponent.isin(opponents)]
    sns.countplot(data=subset, y="opponent", hue="goal_halves", ax=ax)
    # label every hue group that exists; fixed indices 0..2 would raise
    # IndexError whenever a half is absent from the plotted rows
    for container in ax.containers:
        ax.bar_label(container, fontsize = 30)
    plt.yticks(fontsize=35)
    plt.xticks(fontsize=20)
    plt.xlabel("")
    plt.ylabel("")
    plt.legend(loc = "lower right")
    plt.title(title, fontsize = 40)
    return ax


plt.figure(figsize=(50,35))

_plot_halves_by_opponent(
    1, messi_club, listoftopteams,
    "Goals Scored in Different Halves Against Top 10 Club Opponents",
)
_plot_halves_by_opponent(
    2, messi_national, listoftopnationalteams,
    "Goals Scored in Different halves Against Top 10 National Teams",
)

plt.subplots_adjust(hspace = 0.2)
plt.show()

#### Performances Against Top Spanish Clubs and National Teams:
*  In LaLiga against top clubs, Messi scored more goals in the first half against **Sevilla**.
*  His performance against **Real Madrid** is better in the second half than in the first half.
*  Against **Atletico Madrid** his performance is equally good in both halves.
* The majority of the goals he scored against **national teams** came in the **first half**.


## How has Messi performed scoring goals with different methods in different halves? 
#### let's evaluate

In [None]:
plt.figure(figsize = (15,11))

# text styling shared by the percentage labels of every pie
textprops = {"fontsize":15}

# one pie per goal method, in value_counts order, on a 2x3 grid
for a, method in enumerate(messi.goal_method.value_counts().index, start=1):
    plt.subplot(2,3,a)

    # number of rows per goal_halves label for this method
    data = (
        messi[messi.goal_method == method]
        .groupby(['goal_halves'])['goal_halves']
        .count()
        .reset_index(name = 'Count')
    )
    plt.pie(data.Count, autopct='%1.1f%%', textprops=textprops)
    plt.title(method, fontsize = 15)
    # the wedges carry no labels, so the legend names them in the same order
    plt.legend(title = 'Halves', labels=data.goal_halves, fontsize=7, title_fontsize=7)

plt.suptitle("Goal Method In Different Halves",fontsize = 20)
plt.show()

#### Observations:
* Of all the goals scored with his left foot and with his right foot, he **scored more in the second half than in the first half** in both cases.
* Comparing the two pie charts of right-footed and left-footed goals, the share of his **right-footed goals** scored in the **second half** and **extra time** is larger than the share of his **left-footed goals** scored in the **second half** and **extra time**.
* In his career he has scored equally in both halves of the game using his head.
* He has scored with his chest only in the second half.
* He has scored with his hand and hip only in the first half.


## Who are the top ten clubs against whom Messi has scored most times?
#### Let's find out

In [None]:
plt.figure(figsize = (15,7))

# horizontal bar per opponent, bar length = its scored_in_matches value
opponent_names = df_top_performances['opponents']
no_of_matches = df_top_performances['scored_in_matches'].tolist()
plt.barh(y=opponent_names, width=df_top_performances['scored_in_matches'])

# write each value at the end of its bar; the row position is the y coordinate
for bar_position, count in enumerate(no_of_matches):
    plt.text(count, bar_position, str(count), fontsize = 20)

plt.yticks(fontsize=20)
plt.xticks(fontsize=10)
plt.title("Total Goals Scored Against Top 10 Club Opponents",fontsize = 25)
plt.show()

#### Observation:

* Top ten teams against whom Messi has scored most times are **Sevilla, Atlético Madrid, Valencia, Real Madrid, Athletic de Bilbao, Osasuna, Espanyol, Deportivo La Coruña, Real Betis, Levante**.

## Who is Messi's favorite opponent, and how does his scoring affect the final game result?

### Let's find

In [None]:
def _plot_results_by_opponent(position, goals, opponents, title, legend_loc):
    """Draw one panel: rows per opponent, split by win_not_win.

    position   -- index of the panel in the 2x1 subplot grid
    goals      -- DataFrame with 'opponent' and 'win_not_win' columns
    opponents  -- opponent names to keep
    title      -- panel title
    legend_loc -- matplotlib legend location string
    """
    ax = plt.subplot(2, 1, position)
    sns.set(font_scale = 4)
    # filter once and hand seaborn a single frame, so y and hue always refer
    # to the same rows instead of relying on index alignment of two vectors
    subset = goals[goals.opponent.isin(opponents)]
    sns.countplot(data=subset, y="opponent", hue="win_not_win", ax=ax)
    # label every hue group that exists rather than indexing containers by
    # position, which raises IndexError when an outcome is absent
    for container in ax.containers:
        ax.bar_label(container, fontsize = 30)
    plt.yticks(fontsize=35)
    plt.xticks(fontsize=20)
    plt.xlabel("")
    plt.ylabel("")
    plt.legend(loc = legend_loc)
    plt.title(title, fontsize = 40)
    return ax


plt.figure(figsize=(45,25))

_plot_results_by_opponent(
    1, messi_club, listoftopteams,
    "Goals scored And It's Effect On Result Against Top 10 Club Opponents",
    "upper right",
)
_plot_results_by_opponent(
    2, messi_national, listoftopnationalteams,
    "Goals scored And It's Effect On Result Against Top 10 National Teams",
    "center right",
)

plt.subplots_adjust(hspace = 0.2)
plt.show()

#### Observations:

* Messi is most successful against the top LaLiga club **"Sevilla"**, with **34 wins out of 36 matches** in which he scored.
* Of the **26 matches** in which he scored against top Spanish rival club **Real Madrid**, he has **won 15 matches**.
* Against **Atletico Madrid** his record is amazing, **winning 27 out of 28 matches** in which he scored.
* His record against top European clubs like **Man City, Man United, AC Milan, Arsenal, Ajax, Tottenham, Bayern Munchen** is phenomenal, winning most of the matches in which he scored.
* For the national side, whenever he has scored against top national teams like **Brazil, Ecuador, Uruguay, and Switzerland**, his team has always won.
* The above observations imply that whenever he scores, the chances of winning are much higher.

## Continuing the analysis of the distribution across club opponents, we now look at his top performances at home and away from home against different clubs

In [None]:
venues = ['Home Team','Away Team']
# the renamed columns double as the venue labels used in the panel titles
messi_modified = messi.rename(columns={"home_team": "Home Team", "away_team": "Away Team"})

plt.figure(figsize = (10,10))

# one panel per venue: top 10 opponents among rows where the club is on that side
for a, venue in enumerate(venues, start=1):
    plt.subplot(2,1,a)
    df_opponent_clubs = messi_modified[messi_modified[venue].isin(club_teams)]
    df_top = df_opponent_clubs.opponent.value_counts().head(10).to_frame().reset_index()
    df_top.columns = ['opponents', 'scored_in_matches']
    no_of_matches_club = df_top['scored_in_matches'].tolist()
    plt.barh(y=df_top.opponents, width=df_top.scored_in_matches)
    # write each value at the end of its bar
    for index, value in enumerate(no_of_matches_club):
        plt.text(value, index, str(value), fontsize = 20)
    plt.yticks(fontsize=20)
    plt.xticks(fontsize=10)
    plt.gca().set_title("When FC Barcelona is a "+ venue,fontsize = 20)

# figure-level settings do not depend on the venue, so they are applied once
plt.suptitle("Goals Scored Against Top 10 Club Opponents",fontsize = 25)
plt.subplots_adjust(hspace = 0.2)
# explicit show, as in the other plotting cells, so the figure is also
# rendered when the code runs outside an inline notebook backend
plt.show()

#### Observation:
* At home **Messi** is most successful against **Sevilla**, and in away games he is most successful against **Real Madrid**.

## How does Messi perform for Argentina when he is playing at home and away from home?
#### Let's find out top performances against countries

In [None]:
plt.figure(figsize = (28,7))

# one panel per venue: top 5 opponents among rows where Argentina is on that side
for a, venue in enumerate(venues, start=1):
    plt.subplot(1,2,a)
    df_opponent_countries = messi_modified[messi_modified[venue].isin(national_teams)]
    # kept under its own name: reusing df_top_performances here would overwrite
    # the overall top-10 frame that the earlier bar-chart cell reads
    df_top_countries = df_opponent_countries.opponent.value_counts().head(5).to_frame().reset_index()
    df_top_countries.columns = ['opponents', 'scored_in_matches']
    no_of_matches_ = df_top_countries['scored_in_matches'].tolist()
    plt.barh(y=df_top_countries.opponents, width=df_top_countries.scored_in_matches)
    # write each value at the end of its bar
    for index, value in enumerate(no_of_matches_):
        plt.text(value, index, str(value), fontsize = 20)
    plt.xlabel("Number of times scored",fontsize = 20)
    plt.yticks(fontsize=20)
    plt.xticks(fontsize=20)
    plt.gca().set_title("When Argentina is a "+ venue,fontsize = 20)

# the figure title does not depend on the venue, so it is set once
plt.suptitle("Goals Scored Against Top 5 National Teams",fontsize = 25)
# explicit show, as in the other plotting cells, so the figure is also
# rendered when the code runs outside an inline notebook backend
plt.show()

#### Observations
* Top 5 teams are **Brazil, Uruguay, Haiti, Bolivia and Venezuela** against whom **Messi** has scored most times at home.
* Top 5 teams are **Switzerland, Nigeria, Ecuador, Guatemala and Paraguay** against whom **Messi** has scored most times away from home.

## How do FC Barcelona and Argentina perform in home and away games whenever Messi has scored?
#### Let's find out

In [None]:
plt.figure(figsize = (18,18))
teams = ["FC Barcelona","Argentina"]
textprops = {"fontsize":15}

# (column holding the team, title suffix) per venue; both home pies are drawn
# before the away pies, filling the 1x4 grid left to right
venue_panels = [("home_team", " at Home"), ("away_team", " Away from Home")]

a = 1
for venue_column, title_suffix in venue_panels:
    for team in teams:
        plt.subplot(1,4,a)

        # number of rows per win_not_win category for this team at this venue
        data = (
            messi[messi[venue_column] == team]
            .groupby(['win_not_win'])['win_not_win']
            .count()
            .reset_index(name = 'Count')
        )
        plt.pie(data.Count, autopct='%1.1f%%', textprops=textprops)
        plt.title(team + title_suffix, fontsize = 15)
        plt.legend(title = "result", labels=data.win_not_win, fontsize=7, title_fontsize=7, loc = "upper right")

        a += 1

plt.show()

#### Observations:
* The winning percentage of FC Barcelona is lower than that of Argentina in both home and away games.
* Overall, the winning percentage at home is quite high for both Barcelona and Argentina, at **92.4%** and **94.7%** respectively.
* Overall, the winning percentage in away games is decent for both Barcelona and Argentina, at **86%** and **88.9%** respectively, but lower than their performance at home.