### Connecting to the European Soccer Database

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import datetime

try:
    sqliteConnection = sqlite3.connect('database.sqlite')
    #cursor = sqliteConnection.cursor()
    print("Database created and Successfully Connected to SQLite")
except sqlite3.Error as error:
    print("Error while connecting to sqlite", error)

Database created and Successfully Connected to SQLite


### Populating the 7 tables in the database to 7 Pandas Dataframes

In [2]:
country_query = "SELECT * from Country"
country_df = pd.read_sql_query(country_query, sqliteConnection)

league_query = "SELECT * from League"
league_df = pd.read_sql_query(league_query, sqliteConnection)

match_query = "SELECT * from Match"
match_df = pd.read_sql_query(match_query, sqliteConnection)

player_query = "SELECT * from Player"
player_df = pd.read_sql_query(player_query, sqliteConnection)

playerAttributes_query = "SELECT * from Player_Attributes"
playerAttributes_df = pd.read_sql_query(playerAttributes_query, sqliteConnection)

team_query = "SELECT * from Team"
team_df = pd.read_sql_query(team_query, sqliteConnection)

teamAttributes_query = "SELECT * from Team_Attributes"
teamAttributes_df = pd.read_sql_query(teamAttributes_query, sqliteConnection)

### Cleaning the Country Dataframe

In [3]:
del country_df['id']
country_df.insert(0, 'Country_id', range(1, len(country_df)+1))
country_df = country_df.rename(columns={"name": "Country_Name"})
#country_df

### Cleaning the League Dataframe

In [4]:
del league_df['id']
del league_df['country_id']
league_df.insert(0, 'League_id', range(1, len(league_df)+1))
league_df = league_df.rename(columns={"name": "League_Name"})
#league_df

### Cleaning the Match Dataframe

In [5]:
match_df = match_df.rename(columns={"id": "Match_id"})

#Replacing the old country_id values with the Country_Name values for better readability
match_df['country_id'] = match_df['country_id'].replace(1,'Belgium')
match_df['country_id'] = match_df['country_id'].replace(1729,'England')
match_df['country_id'] = match_df['country_id'].replace(4769,'France')
match_df['country_id'] = match_df['country_id'].replace(7809,'Germany')
match_df['country_id'] = match_df['country_id'].replace(10257,'Italy')
match_df['country_id'] = match_df['country_id'].replace(13274,'Netherlands')
match_df['country_id'] = match_df['country_id'].replace(15722,'Poland')
match_df['country_id'] = match_df['country_id'].replace(17642,'Portugal')
match_df['country_id'] = match_df['country_id'].replace(19694,'Scotland')
match_df['country_id'] = match_df['country_id'].replace(21518,'Spain')
match_df['country_id'] = match_df['country_id'].replace(24558,'Switzerland')
match_df = match_df.rename(columns={"country_id": "Country_Name"})

#Replacing the old league_id values with the League_Name values for better readability
match_df['league_id'] = match_df['league_id'].replace(1,'Belgium Jupiler League')
match_df['league_id'] = match_df['league_id'].replace(1729,'England Premier League')
match_df['league_id'] = match_df['league_id'].replace(4769,'France Ligue 1')
match_df['league_id'] = match_df['league_id'].replace(7809,'Germany 1. Bundesliga')
match_df['league_id'] = match_df['league_id'].replace(10257,'Italy Serie A')
match_df['league_id'] = match_df['league_id'].replace(13274,'Netherlands Eredivisie')
match_df['league_id'] = match_df['league_id'].replace(15722,'Poland Ekstraklasa')
match_df['league_id'] = match_df['league_id'].replace(17642,'Portugal Liga ZON Sagres')
match_df['league_id'] = match_df['league_id'].replace(19694,'Scotland Premier League')
match_df['league_id'] = match_df['league_id'].replace(21518,'Spain LIGA BBVA')
match_df['league_id'] = match_df['league_id'].replace(24558,'Switzerland Super League')
match_df = match_df.rename(columns={"league_id": "League_Name"})
match_df['date'] = pd.to_datetime(match_df['date'], format='%Y/%m/%d %H:%M:%S')
#match_df.head()
#Lesa na2es feeh shwayet hagat

### Cleaning the Player Attributes Dataframe

In [None]:
#playerAttributes_df.head(10)

In [None]:
print('The number of rows before removing duplicates: ' + str(playerAttributes_df.shape[0]))
duplicateRowsPlayerAttributesDF = playerAttributes_df[playerAttributes_df.duplicated(['player_fifa_api_id', 'player_api_id'])]
duplicatePercentage = (duplicateRowsPlayerAttributesDF.shape[0]/playerAttributes_df.shape[0]) * 100
print('The percentage of duplicated rows: ' + str(duplicatePercentage))

In [None]:
playerAttributes_df_1 = playerAttributes_df.sort_values('date').drop_duplicates(['player_fifa_api_id', 'player_api_id'],keep='last')
playerAttributes_df_1 = playerAttributes_df_1.sort_values('id')
print('The number of rows after removing duplicates: ' + str(playerAttributes_df_1.shape[0]))
percentageLeft = (playerAttributes_df_1.shape[0]/playerAttributes_df.shape[0]) * 100
print('The percentage of rows left: ' + str(percentageLeft))
#playerAttributes_df_1.head(10)

In [None]:
# get the number of missing values per column
missing_values_count_playerAttributes = playerAttributes_df_1.isnull().sum()

#print(missing_values_count_playerAttributes)

total_cells_playerAttributes = np.product(playerAttributes_df_1.shape) 
total_missing_playerAttributes = missing_values_count_playerAttributes.sum()

# percentage of data that is missing
percentage_missign_values_playerAttributes = (total_missing_playerAttributes/total_cells_playerAttributes) * 100
#print('Percentage of missing values: '+ str(percentage_missign_values_playerAttributes))

In [None]:
rows_with_missing_values = playerAttributes_df_1[playerAttributes_df_1.isna().any(axis=1)]
print('Percentage of rows with missing values: '+ str((rows_with_missing_values.shape[0]/playerAttributes_df_1.shape[0])*100))

In [None]:
# merge with player_df to search on the internet for missing values with player names
df2 = pd.merge(player_df,playerAttributes_df_1,on=['player_api_id','player_fifa_api_id'])
# sort and get the percentage of old players having NaNs
rows_with_null_sorted = df2[df2.isna().any(axis=1)].sort_values('birthday')
end_date = pd.Timestamp(datetime.date(1980,1,1))
rows_with_null_sorted['birthday'] = pd.to_datetime(rows_with_null_sorted['birthday'])
mask = rows_with_null_sorted['birthday'] <= end_date
old_players = rows_with_null_sorted.loc[mask]
#old_players

In [None]:
rows_with_missing_values_old_players = old_players[old_players.isna().any(axis=1)]
print('Percentage of rows of old players with missing values: '+ str((rows_with_missing_values_old_players.shape[0]/playerAttributes_df_1.shape[0])*100))

In [None]:
playerAttributes_df_1 = playerAttributes_df_1.dropna()
#playerAttributes_df_1

In [None]:
missing_values_count_playerAttributes = playerAttributes_df_1.isnull().sum()
#print(missing_values_count_playerAttributes)

In [None]:
playerAttributes_df_1 = playerAttributes_df_1.rename(columns={"date": "date_modified"})
#playerAttributes_df_1

In [14]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 116)

### Cleaning the Home Players X1 to X11 Coordinates (1/2)
Getting the missing X1 to X11 values from the complete records of the home teams in the table

In [6]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_X'+str(m)])
        home_player_X_rows = match_df[match_df['home_player_X'+str(m)].isnull()]
        if n < 10:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_X_rows = home_player_X_rows.reset_index()

        for i in range(0, home_player_X_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_X_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['home_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['home_team_api_id'] == home_player_X_rows['home_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['home_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Home Players X1 to X11 Coordinates (2/2)
Getting the missing X1 to X11 values from the complete records of the away teams in the table

In [19]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_X'+str(m)])
        home_player_X_rows = match_df[match_df['home_player_X'+str(m)].isnull()]
        if n < 10:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_X_rows = home_player_X_rows.reset_index()

        for i in range(0, home_player_X_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_X_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['away_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['away_team_api_id'] == home_player_X_rows['home_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['home_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Home Players Y1 to Y11 Coordinates (1/2)
Getting the missing Y1 to Y11 values from the complete records of the home teams in the table

In [7]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_Y'+str(m)])
        home_player_Y_rows = match_df[match_df['home_player_Y'+str(m)].isnull()]
        if n < 10:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_Y_rows = home_player_Y_rows.reset_index()

        for i in range(0, home_player_Y_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_Y_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['home_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['home_team_api_id'] == home_player_Y_rows['home_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['home_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Home Players Y1 to Y11 Coordinates (2/2)
Getting the missing Y1 to Y11 values from the complete records of the away teams in the table

In [21]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_Y'+str(m)])
        home_player_Y_rows = match_df[match_df['home_player_Y'+str(m)].isnull()]
        if n < 10:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_Y_rows = home_player_Y_rows.reset_index()

        for i in range(0, home_player_Y_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_Y_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['away_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['away_team_api_id'] == home_player_Y_rows['home_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['home_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Away Players X1 to X11 Coordinates (1/2)
Getting the missing X1 to X11 values from the complete records of the away teams in the table

In [8]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_X'+str(m)])

        away_player_X_rows = match_df[match_df['away_player_X'+str(m)].isnull()]
        if n < 10:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_X_rows = away_player_X_rows.reset_index()

        for i in range(0, away_player_X_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_X_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['away_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['away_team_api_id'] == away_player_X_rows['away_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['away_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Away Players X1 to X11 Coordinates (2/2)
Getting the missing X1 to X11 values from the complete records of the home teams in the table

In [22]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_X'+str(m)])
        away_player_X_rows = match_df[match_df['away_player_X'+str(m)].isnull()]
        if n < 10:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_X_rows = away_player_X_rows.reset_index()

        for i in range(0, away_player_X_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_X_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['home_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['home_team_api_id'] == away_player_X_rows['away_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['away_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Away Players Y1 to Y11 Coordinates (1/2)
Getting the missing Y1 to Y11 values from the complete records of the away teams in the table

In [9]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_Y'+str(m)])

        away_player_Y_rows = match_df[match_df['away_player_Y'+str(m)].isnull()]
        if n < 10:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_Y_rows = away_player_Y_rows.reset_index()

        for i in range(0, away_player_Y_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_Y_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['away_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['away_team_api_id'] == away_player_Y_rows['away_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['away_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Away Players Y1 to Y11 Coordinates (2/2)
Getting the missing Y1 to Y11 values from the complete records of the home teams in the table

In [23]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_Y'+str(m)])
        away_player_Y_rows = match_df[match_df['away_player_Y'+str(m)].isnull()]
        if n < 10:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_Y_rows = away_player_Y_rows.reset_index()

        for i in range(0, away_player_Y_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_Y_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['home_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['home_team_api_id'] == away_player_Y_rows['away_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['away_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Home Players Columns (1/2)
Getting the missing home player 1 to 11 values from the complete records of the home teams in the table

In [10]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_'+str(m)])

        home_player_rows = match_df[match_df['home_player_'+str(m)].isnull()]
        if n < 10:
            home_player_rows = home_player_rows[home_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_rows = home_player_rows[home_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_rows = home_player_rows.reset_index()

        for i in range(0, home_player_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['home_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['home_team_api_id'] == home_player_rows['home_team_api_id'][i])]
                null_rows = null_rows[null_rows['home_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Home Players Columns (2/2)
Getting the missing home player 1 to 11 values from the complete records of the away teams in the table

In [None]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_'+str(m)])

        home_player_rows = match_df[match_df['home_player_'+str(m)].isnull()]
        if n < 10:
            home_player_rows = home_player_rows[home_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_rows = home_player_rows[home_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_rows = home_player_rows.reset_index()

        for i in range(0, home_player_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['away_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['away_team_api_id'] == home_player_rows['home_team_api_id'][i])]
                null_rows = null_rows[null_rows['home_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Away Players Columns (1/2)
Getting the missing away player 1 to 11 values from the complete records of the away teams in the table

In [11]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_'+str(m)])

        away_player_rows = match_df[match_df['away_player_'+str(m)].isnull()]
        if n < 10:
            away_player_rows = away_player_rows[away_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_rows = away_player_rows[away_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_rows = away_player_rows.reset_index()

        for i in range(0, away_player_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['away_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['away_team_api_id'] == away_player_rows['away_team_api_id'][i])]
                null_rows = null_rows[null_rows['away_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Away Players Columns (2/2)
Getting the missing away player 1 to 11 values from the complete records of the home teams in the table

In [25]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_'+str(m)])

        away_player_rows = match_df[match_df['away_player_'+str(m)].isnull()]
        if n < 10:
            away_player_rows = away_player_rows[away_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_rows = away_player_rows[away_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_rows = away_player_rows.reset_index()

        for i in range(0, away_player_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['home_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['home_team_api_id'] == away_player_rows['away_team_api_id'][i])]
                null_rows = null_rows[null_rows['away_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_'+str(m)] = int(Player[Player.index[0]])

In [27]:
match_df.isnull().sum()

Match_id                0
Country_Name            0
League_Name             0
season                  0
stage                   0
date                    0
match_api_id            0
home_team_api_id        0
away_team_api_id        0
home_team_goal          0
away_team_goal          0
home_player_X1       1200
home_player_X2       1200
home_player_X3       1200
home_player_X4       1200
home_player_X5       1200
home_player_X6       1200
home_player_X7       1200
home_player_X8       1200
home_player_X9       1200
home_player_X10      1200
home_player_X11      1200
away_player_X1       1200
away_player_X2       1200
away_player_X3       1200
away_player_X4       1200
away_player_X5       1200
away_player_X6       1200
away_player_X7       1200
away_player_X8       1200
away_player_X9       1200
away_player_X10      1200
away_player_X11      1200
home_player_Y1       1200
home_player_Y2       1200
home_player_Y3       1200
home_player_Y4       1200
home_player_Y5       1200
home_player_