### Connecting to the European Soccer Database

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import datetime

try:
    sqliteConnection = sqlite3.connect('database.sqlite')
    #cursor = sqliteConnection.cursor()
    print("Database created and Successfully Connected to SQLite")
except sqlite3.Error as error:
    print("Error while connecting to sqlite", error)

Database created and Successfully Connected to SQLite


### Populating the 7 tables in the database to 7 Pandas Dataframes

In [80]:
country_query = "SELECT * from Country"
country_df = pd.read_sql_query(country_query, sqliteConnection)

league_query = "SELECT * from League"
league_df = pd.read_sql_query(league_query, sqliteConnection)

match_query = "SELECT * from Match"
match_df = pd.read_sql_query(match_query, sqliteConnection)

player_query = "SELECT * from Player"
player_df = pd.read_sql_query(player_query, sqliteConnection)

playerAttributes_query = "SELECT * from Player_Attributes"
playerAttributes_df = pd.read_sql_query(playerAttributes_query, sqliteConnection)

team_query = "SELECT * from Team"
team_df = pd.read_sql_query(team_query, sqliteConnection)

teamAttributes_query = "SELECT * from Team_Attributes"
teamAttributes_df = pd.read_sql_query(teamAttributes_query, sqliteConnection)

### Cleaning the Country Dataframe

In [3]:
del country_df['id']
country_df.insert(0, 'Country_id', range(1, len(country_df)+1))
country_df = country_df.rename(columns={"name": "Country_Name"})
#country_df

### Cleaning the League Dataframe

In [4]:
del league_df['id']
del league_df['country_id']
league_df.insert(0, 'League_id', range(1, len(league_df)+1))
league_df = league_df.rename(columns={"name": "League_Name"})
#league_df

### Cleaning the Match Dataframe

In [5]:
match_df = match_df.rename(columns={"id": "Match_id"})

#Replacing the old country_id values with the Country_Name values for better readability
match_df['country_id'] = match_df['country_id'].replace(1,'Belgium')
match_df['country_id'] = match_df['country_id'].replace(1729,'England')
match_df['country_id'] = match_df['country_id'].replace(4769,'France')
match_df['country_id'] = match_df['country_id'].replace(7809,'Germany')
match_df['country_id'] = match_df['country_id'].replace(10257,'Italy')
match_df['country_id'] = match_df['country_id'].replace(13274,'Netherlands')
match_df['country_id'] = match_df['country_id'].replace(15722,'Poland')
match_df['country_id'] = match_df['country_id'].replace(17642,'Portugal')
match_df['country_id'] = match_df['country_id'].replace(19694,'Scotland')
match_df['country_id'] = match_df['country_id'].replace(21518,'Spain')
match_df['country_id'] = match_df['country_id'].replace(24558,'Switzerland')
match_df = match_df.rename(columns={"country_id": "Country_Name"})

#Replacing the old league_id values with the League_Name values for better readability
match_df['league_id'] = match_df['league_id'].replace(1,'Belgium Jupiler League')
match_df['league_id'] = match_df['league_id'].replace(1729,'England Premier League')
match_df['league_id'] = match_df['league_id'].replace(4769,'France Ligue 1')
match_df['league_id'] = match_df['league_id'].replace(7809,'Germany 1. Bundesliga')
match_df['league_id'] = match_df['league_id'].replace(10257,'Italy Serie A')
match_df['league_id'] = match_df['league_id'].replace(13274,'Netherlands Eredivisie')
match_df['league_id'] = match_df['league_id'].replace(15722,'Poland Ekstraklasa')
match_df['league_id'] = match_df['league_id'].replace(17642,'Portugal Liga ZON Sagres')
match_df['league_id'] = match_df['league_id'].replace(19694,'Scotland Premier League')
match_df['league_id'] = match_df['league_id'].replace(21518,'Spain LIGA BBVA')
match_df['league_id'] = match_df['league_id'].replace(24558,'Switzerland Super League')
match_df = match_df.rename(columns={"league_id": "League_Name"})
match_df['date'] = pd.to_datetime(match_df['date'], format='%Y/%m/%d %H:%M:%S')
#match_df.head()
#Lesa na2es feeh shwayet hagat

### Cleaning the Player Attributes Dataframe

In [None]:
#playerAttributes_df.head(10)

In [None]:
print('The number of rows before removing duplicates: ' + str(playerAttributes_df.shape[0]))
duplicateRowsPlayerAttributesDF = playerAttributes_df[playerAttributes_df.duplicated(['player_fifa_api_id', 'player_api_id'])]
duplicatePercentage = (duplicateRowsPlayerAttributesDF.shape[0]/playerAttributes_df.shape[0]) * 100
print('The percentage of duplicated rows: ' + str(duplicatePercentage))

In [None]:
playerAttributes_df_1 = playerAttributes_df.sort_values('date').drop_duplicates(['player_fifa_api_id', 'player_api_id'],keep='last')
playerAttributes_df_1 = playerAttributes_df_1.sort_values('id')
print('The number of rows after removing duplicates: ' + str(playerAttributes_df_1.shape[0]))
percentageLeft = (playerAttributes_df_1.shape[0]/playerAttributes_df.shape[0]) * 100
print('The percentage of rows left: ' + str(percentageLeft))
#playerAttributes_df_1.head(10)

In [None]:
# get the number of missing values per column
missing_values_count_playerAttributes = playerAttributes_df_1.isnull().sum()

#print(missing_values_count_playerAttributes)

total_cells_playerAttributes = np.product(playerAttributes_df_1.shape) 
total_missing_playerAttributes = missing_values_count_playerAttributes.sum()

# percentage of data that is missing
percentage_missign_values_playerAttributes = (total_missing_playerAttributes/total_cells_playerAttributes) * 100
#print('Percentage of missing values: '+ str(percentage_missign_values_playerAttributes))

In [None]:
rows_with_missing_values = playerAttributes_df_1[playerAttributes_df_1.isna().any(axis=1)]
print('Percentage of rows with missing values: '+ str((rows_with_missing_values.shape[0]/playerAttributes_df_1.shape[0])*100))

In [None]:
# merge with player_df to search on the internet for missing values with player names
df2 = pd.merge(player_df,playerAttributes_df_1,on=['player_api_id','player_fifa_api_id'])
# sort and get the percentage of old players having NaNs
rows_with_null_sorted = df2[df2.isna().any(axis=1)].sort_values('birthday')
end_date = pd.Timestamp(datetime.date(1980,1,1))
rows_with_null_sorted['birthday'] = pd.to_datetime(rows_with_null_sorted['birthday'])
mask = rows_with_null_sorted['birthday'] <= end_date
old_players = rows_with_null_sorted.loc[mask]
#old_players

In [None]:
rows_with_missing_values_old_players = old_players[old_players.isna().any(axis=1)]
print('Percentage of rows of old players with missing values: '+ str((rows_with_missing_values_old_players.shape[0]/playerAttributes_df_1.shape[0])*100))

In [None]:
playerAttributes_df_1 = playerAttributes_df_1.dropna()
#playerAttributes_df_1

In [None]:
missing_values_count_playerAttributes = playerAttributes_df_1.isnull().sum()
#print(missing_values_count_playerAttributes)

In [None]:
playerAttributes_df_1 = playerAttributes_df_1.rename(columns={"date": "date_modified"})
#playerAttributes_df_1

In [14]:
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 116)

### Cleaning the Home Players X1 to X11 Coordinates (1/2)
Getting the missing X1 to X11 values from the complete records of the home teams in the table

In [6]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_X'+str(m)])
        home_player_X_rows = match_df[match_df['home_player_X'+str(m)].isnull()]
        if n < 10:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_X_rows = home_player_X_rows.reset_index()

        for i in range(0, home_player_X_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_X_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['home_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['home_team_api_id'] == home_player_X_rows['home_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['home_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Home Players X1 to X11 Coordinates (2/2)
Getting the missing X1 to X11 values from the complete records of the away teams in the table

In [19]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_X'+str(m)])
        home_player_X_rows = match_df[match_df['home_player_X'+str(m)].isnull()]
        if n < 10:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_X_rows = home_player_X_rows[home_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_X_rows = home_player_X_rows.reset_index()

        for i in range(0, home_player_X_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_X_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['away_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['away_team_api_id'] == home_player_X_rows['home_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['home_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Home Players Y1 to Y11 Coordinates (1/2)
Getting the missing Y1 to Y11 values from the complete records of the home teams in the table

In [7]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_Y'+str(m)])
        home_player_Y_rows = match_df[match_df['home_player_Y'+str(m)].isnull()]
        if n < 10:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_Y_rows = home_player_Y_rows.reset_index()

        for i in range(0, home_player_Y_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_Y_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['home_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['home_team_api_id'] == home_player_Y_rows['home_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['home_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Home Players Y1 to Y11 Coordinates (2/2)
Getting the missing Y1 to Y11 values from the complete records of the away teams in the table

In [21]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_Y'+str(m)])
        home_player_Y_rows = match_df[match_df['home_player_Y'+str(m)].isnull()]
        if n < 10:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_Y_rows = home_player_Y_rows[home_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_Y_rows = home_player_Y_rows.reset_index()

        for i in range(0, home_player_Y_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_Y_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['away_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['away_team_api_id'] == home_player_Y_rows['home_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['home_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Away Players X1 to X11 Coordinates (1/2)
Getting the missing X1 to X11 values from the complete records of the away teams in the table

In [8]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_X'+str(m)])

        away_player_X_rows = match_df[match_df['away_player_X'+str(m)].isnull()]
        if n < 10:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_X_rows = away_player_X_rows.reset_index()

        for i in range(0, away_player_X_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_X_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['away_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['away_team_api_id'] == away_player_X_rows['away_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['away_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Away Players X1 to X11 Coordinates (2/2)
Getting the missing X1 to X11 values from the complete records of the home teams in the table

In [22]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_X'+str(m)])
        away_player_X_rows = match_df[match_df['away_player_X'+str(m)].isnull()]
        if n < 10:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_X_rows = away_player_X_rows[away_player_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_X_rows = away_player_X_rows.reset_index()

        for i in range(0, away_player_X_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_X_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                X_Player = return_rows['home_player_X'+str(m)].head(1)
                null_X_rows = match_df[(match_df['home_team_api_id'] == away_player_X_rows['away_team_api_id'][i])]
                null_X_rows = null_X_rows[null_X_rows['away_player_X'+str(m)].isnull()]
                
                if n < 10:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_X_rows = null_X_rows[null_X_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_X_rows.shape[0]):
                    indices.append(null_X_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_X'+str(m)] = int(X_Player[X_Player.index[0]])

### Cleaning the Away Players Y1 to Y11 Coordinates (1/2)
Getting the missing Y1 to Y11 values from the complete records of the away teams in the table

In [9]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_Y'+str(m)])

        away_player_Y_rows = match_df[match_df['away_player_Y'+str(m)].isnull()]
        if n < 10:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_Y_rows = away_player_Y_rows.reset_index()

        for i in range(0, away_player_Y_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_Y_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['away_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['away_team_api_id'] == away_player_Y_rows['away_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['away_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Away Players Y1 to Y11 Coordinates (2/2)
Getting the missing Y1 to Y11 values from the complete records of the home teams in the table

In [23]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_Y'+str(m)])
        away_player_Y_rows = match_df[match_df['away_player_Y'+str(m)].isnull()]
        if n < 10:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_Y_rows = away_player_Y_rows[away_player_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_Y_rows = away_player_Y_rows.reset_index()

        for i in range(0, away_player_Y_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_Y_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Y_Player = return_rows['home_player_Y'+str(m)].head(1)
                null_Y_rows = match_df[(match_df['home_team_api_id'] == away_player_Y_rows['away_team_api_id'][i])]
                null_Y_rows = null_Y_rows[null_Y_rows['away_player_Y'+str(m)].isnull()]
                
                if n < 10:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_Y_rows = null_Y_rows[null_Y_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_Y_rows.shape[0]):
                    indices.append(null_Y_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_Y'+str(m)] = int(Y_Player[Y_Player.index[0]])

### Cleaning the Home Players Columns (1/2)
Getting the missing home player 1 to 11 values from the complete records of the home teams in the table

In [10]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['home_player_'+str(m)])

        home_player_rows = match_df[match_df['home_player_'+str(m)].isnull()]
        if n < 10:
            home_player_rows = home_player_rows[home_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_rows = home_player_rows[home_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_rows = home_player_rows.reset_index()

        for i in range(0, home_player_rows.shape[0]):
            return_rows = home_players_df[home_players_df['home_team_api_id'] == home_player_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['home_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['home_team_api_id'] == home_player_rows['home_team_api_id'][i])]
                null_rows = null_rows[null_rows['home_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Home Players Columns (2/2)
Getting the missing home player 1 to 11 values from the complete records of the away teams in the table

In [None]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            home_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            home_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        home_players_df = home_players_df.dropna(subset = ['away_player_'+str(m)])

        home_player_rows = match_df[match_df['home_player_'+str(m)].isnull()]
        if n < 10:
            home_player_rows = home_player_rows[home_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            home_player_rows = home_player_rows[home_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        home_player_rows = home_player_rows.reset_index()

        for i in range(0, home_player_rows.shape[0]):
            return_rows = home_players_df[home_players_df['away_team_api_id'] == home_player_rows['home_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['away_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['away_team_api_id'] == home_player_rows['home_team_api_id'][i])]
                null_rows = null_rows[null_rows['home_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'home_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Away Players Columns (1/2)
Getting the missing away player 1 to 11 values from the complete records of the away teams in the table

In [11]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['away_player_'+str(m)])

        away_player_rows = match_df[match_df['away_player_'+str(m)].isnull()]
        if n < 10:
            away_player_rows = away_player_rows[away_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_rows = away_player_rows[away_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_rows = away_player_rows.reset_index()

        for i in range(0, away_player_rows.shape[0]):
            return_rows = away_players_df[away_players_df['away_team_api_id'] == away_player_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['away_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['away_team_api_id'] == away_player_rows['away_team_api_id'][i])]
                null_rows = null_rows[null_rows['away_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_'+str(m)] = int(Player[Player.index[0]])

### Cleaning the Away Players Columns (2/2)
Getting the missing away player 1 to 11 values from the complete records of the home teams in the table

In [25]:
for n in range(8,16):
    for m in range(1,12):
        if n < 10:
            away_players_df = match_df[(match_df['season'] == '200'+str(n)+'/'+'200'+str(n+1))]
        else:
            away_players_df = match_df[(match_df['season'] == '20'+str(n)+'/'+'20'+str(n+1))]
        
        away_players_df = away_players_df.dropna(subset = ['home_player_'+str(m)])

        away_player_rows = match_df[match_df['away_player_'+str(m)].isnull()]
        if n < 10:
            away_player_rows = away_player_rows[away_player_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
        else:
            away_player_rows = away_player_rows[away_player_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
        
        away_player_rows = away_player_rows.reset_index()

        for i in range(0, away_player_rows.shape[0]):
            return_rows = away_players_df[away_players_df['home_team_api_id'] == away_player_rows['away_team_api_id'][i]]
            
            if return_rows.size != 0:
                Player = return_rows['home_player_'+str(m)].head(1)
                null_rows = match_df[(match_df['home_team_api_id'] == away_player_rows['away_team_api_id'][i])]
                null_rows = null_rows[null_rows['away_player_'+str(m)].isnull()]
                
                if n < 10:
                    null_rows = null_rows[null_rows['season'] == '200'+str(n)+'/'+'200'+str(n+1)]
                else:
                    null_rows = null_rows[null_rows['season'] == '20'+str(n)+'/'+'20'+str(n+1)]
                    
                indices = []
                for j in range(0, null_rows.shape[0]):
                    indices.append(null_rows.index[j])
                for k in range(0, len(indices)):
                    index = match_df.index[indices[k]]
                    match_df.at[index, 'away_player_'+str(m)] = int(Player[Player.index[0]])

In [27]:
match_df.isnull().sum()

Match_id                0
Country_Name            0
League_Name             0
season                  0
stage                   0
date                    0
match_api_id            0
home_team_api_id        0
away_team_api_id        0
home_team_goal          0
away_team_goal          0
home_player_X1       1200
home_player_X2       1200
home_player_X3       1200
home_player_X4       1200
home_player_X5       1200
home_player_X6       1200
home_player_X7       1200
home_player_X8       1200
home_player_X9       1200
home_player_X10      1200
home_player_X11      1200
away_player_X1       1200
away_player_X2       1200
away_player_X3       1200
away_player_X4       1200
away_player_X5       1200
away_player_X6       1200
away_player_X7       1200
away_player_X8       1200
away_player_X9       1200
away_player_X10      1200
away_player_X11      1200
home_player_Y1       1200
home_player_Y2       1200
home_player_Y3       1200
home_player_Y4       1200
home_player_Y5       1200
home_player_

### Cleaning the Team Dataframe

In [60]:
missing_values_count_team = team_df.isnull().sum()
print(missing_values_count_team)

id                   0
team_api_id          0
team_fifa_api_id    11
team_long_name       0
team_short_name      0
dtype: int64


In [61]:
team_api_id_1=team_df[['team_api_id','team_fifa_api_id']]
team_api_id_2=teamAttributes_df[['team_api_id','team_fifa_api_id']]
team_api_id_2.drop_duplicates(subset ="team_fifa_api_id", keep = "last", inplace = True)
missing_api_1=team_api_id_1[team_api_id_1["team_fifa_api_id"].isnull()]
found_api_2= team_api_id_2.loc[team_api_id_2['team_api_id'].isin(missing_api_1)]
found_api_2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,team_api_id,team_fifa_api_id


In [62]:
team_df.dropna(inplace=True)
missing_values_count_team = team_df.isnull().sum()
print(missing_values_count_team)


id                  0
team_api_id         0
team_fifa_api_id    0
team_long_name      0
team_short_name     0
dtype: int64


In [64]:
duplicateRowsTeamDF = team_df[team_df.duplicated(['team_long_name'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_long_name: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

The number of duplicated rows in team_long_name: 3
The number of duplicated rows in team_fifa_api_id: 3
The number of duplicated rows in team_api_id: 0


In [65]:
duplicate_rows_df= team_df[team_df.duplicated(['team_fifa_api_id', 'team_long_name'], keep=False)]
display(duplicate_rows_df)
target_team_api_id=duplicate_rows_df['team_fifa_api_id'].tolist()
teamAttributes_subset=teamAttributes_df.loc[teamAttributes_df['team_fifa_api_id'].isin(target_team_api_id)]
display(teamAttributes_subset.sort_values(['team_fifa_api_id','team_api_id']))

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
15,16,9996,111560.0,Royal Excel Mouscron,MOU
24,2510,274581,111560.0,Royal Excel Mouscron,MOP
182,31444,8031,111429.0,Polonia Bytom,POB
183,31445,8020,111429.0,Polonia Bytom,GOR
189,31451,8244,301.0,Widzew Łódź,LOD
199,32409,8024,301.0,Widzew Łódź,WID


Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
1394,1395,301,8024,2011-02-22 00:00:00,63,Balanced,,Little,48,Mixed,...,53,Normal,Organised,37,Medium,58,Press,63,Normal,Cover
1395,1396,301,8024,2012-02-22 00:00:00,66,Balanced,,Little,72,Long,...,54,Normal,Organised,32,Deep,32,Contain,62,Normal,Cover
1396,1397,301,8024,2013-09-20 00:00:00,66,Balanced,,Little,59,Mixed,...,54,Normal,Organised,32,Deep,43,Press,46,Normal,Cover
1397,1398,301,8024,2014-09-19 00:00:00,66,Balanced,52.0,Normal,72,Long,...,51,Normal,Organised,36,Medium,38,Press,65,Normal,Cover
754,755,301,8244,2011-02-22 00:00:00,63,Balanced,,Little,48,Mixed,...,53,Normal,Organised,37,Medium,58,Press,63,Normal,Cover
755,756,301,8244,2012-02-22 00:00:00,66,Balanced,,Little,72,Long,...,54,Normal,Organised,32,Deep,32,Contain,62,Normal,Cover
756,757,301,8244,2013-09-20 00:00:00,66,Balanced,,Little,59,Mixed,...,54,Normal,Organised,32,Deep,43,Press,46,Normal,Cover
757,758,301,8244,2014-09-19 00:00:00,66,Balanced,52.0,Normal,72,Long,...,51,Normal,Organised,36,Medium,38,Press,65,Normal,Cover
522,523,111429,8020,2010-02-22 00:00:00,30,Slow,,Little,50,Mixed,...,70,Lots,Organised,60,Medium,70,Double,55,Normal,Cover
523,524,111429,8020,2011-02-22 00:00:00,46,Balanced,,Little,38,Mixed,...,53,Normal,Organised,53,Medium,54,Press,55,Normal,Cover


In [66]:
team_df=team_df.sort_values(['team_fifa_api_id','team_api_id'])
team_df.drop_duplicates(subset =['team_fifa_api_id', 'team_long_name'], keep = 'last', inplace = True)

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_long_name'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_long_name: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

The number of duplicated rows in team_long_name: 0
The number of duplicated rows in team_fifa_api_id: 0


In [67]:
duplicateRowsTeamDF = team_df[team_df.duplicated(['team_long_name'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_long_name: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = team_df[team_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

print('The number of missing values in each column:')
team_df.dropna(inplace=True)
missing_values_count_team= team_df.isnull().sum()
print(missing_values_count_team)

The number of duplicated rows in team_long_name: 0
The number of duplicated rows in team_fifa_api_id: 0
The number of duplicated rows in team_api_id: 0
The number of missing values in each column:
id                  0
team_api_id         0
team_fifa_api_id    0
team_long_name      0
team_short_name     0
dtype: int64


In [68]:
del team_df['id']
team_df.insert(0, 'team_id', range(1, len(team_df)+1))
team_df

Unnamed: 0,team_id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
27,1,9825,1.0,Arsenal,ARS
33,2,10252,2.0,Aston Villa,AVL
36,3,8655,3.0,Blackburn Rovers,BLB
39,4,8559,4.0,Bolton Wanderers,BOL
43,5,8455,5.0,Chelsea,CHE
...,...,...,...,...,...
82,281,108893,111989.0,AC Arles-Avignon,ARL
149,282,6269,112225.0,Novara,NOV
155,283,208931,112409.0,Carpi,CAP
202,284,8027,112512.0,Zawisza Bydgoszcz,ZAW


### Cleaning The Team Attributes Dataframe

In [82]:
duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

The number of duplicated rows in team_api_id: 1170
The number of duplicated rows in team_fifa_api_id: 1173


In [70]:
duplicated_api= teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'], keep=False)]
duplicated_fifa_api= teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'], keep=False)]
differ= duplicated_fifa_api[~duplicated_fifa_api['team_api_id'].isin(duplicated_api['team_api_id'])]
differ

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
858,859,111560,274581,2015-09-10 00:00:00,50,Balanced,50.0,Normal,50,Mixed,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover


In [83]:
teamAttributes_df=teamAttributes_df.sort_values(['team_fifa_api_id','team_api_id','date'])
teamAttributes_df.drop_duplicates(subset =['team_fifa_api_id', 'team_api_id'], keep = 'last', inplace = True)

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

The number of duplicated rows in team_api_id: 0
The number of duplicated rows in team_fifa_api_id: 3


Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
75,76,1,9825,2015-09-10 00:00:00,59,Balanced,51.0,Normal,30,Short,...,46,Normal,Free Form,51,Medium,44,Press,52,Normal,Cover
81,82,2,10252,2015-09-10 00:00:00,63,Balanced,37.0,Normal,54,Mixed,...,38,Normal,Organised,35,Medium,44,Press,54,Normal,Cover
177,178,3,8655,2015-09-10 00:00:00,60,Balanced,60.0,Normal,65,Mixed,...,42,Normal,Organised,50,Medium,50,Press,57,Normal,Cover
202,203,4,8559,2015-09-10 00:00:00,57,Balanced,34.0,Normal,61,Mixed,...,38,Normal,Organised,39,Medium,48,Press,58,Normal,Cover
311,312,5,8455,2015-09-10 00:00:00,67,Fast,41.0,Normal,36,Mixed,...,44,Normal,Organised,39,Medium,41,Press,46,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,65,111989,108893,2015-09-10 00:00:00,56,Balanced,46.0,Normal,54,Mixed,...,43,Normal,Organised,48,Medium,53,Press,57,Normal,Cover
913,914,112225,6269,2014-09-19 00:00:00,66,Balanced,57.0,Normal,56,Mixed,...,57,Normal,Organised,36,Medium,49,Press,53,Normal,Cover
275,276,112409,208931,2015-09-10 00:00:00,80,Fast,45.0,Normal,65,Mixed,...,50,Normal,Organised,25,Deep,55,Press,35,Normal,Cover
1451,1452,112512,8027,2015-09-10 00:00:00,54,Balanced,51.0,Normal,40,Mixed,...,52,Normal,Organised,44,Medium,47,Press,52,Normal,Cover


In [84]:
duplicated_fifa_api= teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'], keep=False)]
duplicated_fifa_api

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
1397,1398,301,8024,2014-09-19 00:00:00,66,Balanced,52.0,Normal,72,Long,...,51,Normal,Organised,36,Medium,38,Press,65,Normal,Cover
757,758,301,8244,2014-09-19 00:00:00,66,Balanced,52.0,Normal,72,Long,...,51,Normal,Organised,36,Medium,38,Press,65,Normal,Cover
523,524,111429,8020,2011-02-22 00:00:00,46,Balanced,,Little,38,Mixed,...,53,Normal,Organised,53,Medium,54,Press,55,Normal,Cover
996,997,111429,8031,2011-02-22 00:00:00,46,Balanced,,Little,38,Mixed,...,53,Normal,Organised,53,Medium,54,Press,55,Normal,Cover
860,861,111560,9996,2015-09-10 00:00:00,50,Balanced,50.0,Normal,50,Mixed,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover
858,859,111560,274581,2015-09-10 00:00:00,50,Balanced,50.0,Normal,50,Mixed,...,50,Normal,Organised,45,Medium,45,Press,50,Normal,Cover


In [85]:
teamAttributes_df=teamAttributes_df.sort_values(['team_fifa_api_id','team_api_id','date'])
teamAttributes_df.drop_duplicates(subset =['team_fifa_api_id'], keep = 'last', inplace = True)

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

The number of duplicated rows in team_fifa_api_id: 0


Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
75,76,1,9825,2015-09-10 00:00:00,59,Balanced,51.0,Normal,30,Short,...,46,Normal,Free Form,51,Medium,44,Press,52,Normal,Cover
81,82,2,10252,2015-09-10 00:00:00,63,Balanced,37.0,Normal,54,Mixed,...,38,Normal,Organised,35,Medium,44,Press,54,Normal,Cover
177,178,3,8655,2015-09-10 00:00:00,60,Balanced,60.0,Normal,65,Mixed,...,42,Normal,Organised,50,Medium,50,Press,57,Normal,Cover
202,203,4,8559,2015-09-10 00:00:00,57,Balanced,34.0,Normal,61,Mixed,...,38,Normal,Organised,39,Medium,48,Press,58,Normal,Cover
311,312,5,8455,2015-09-10 00:00:00,67,Fast,41.0,Normal,36,Mixed,...,44,Normal,Organised,39,Medium,41,Press,46,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,65,111989,108893,2015-09-10 00:00:00,56,Balanced,46.0,Normal,54,Mixed,...,43,Normal,Organised,48,Medium,53,Press,57,Normal,Cover
913,914,112225,6269,2014-09-19 00:00:00,66,Balanced,57.0,Normal,56,Mixed,...,57,Normal,Organised,36,Medium,49,Press,53,Normal,Cover
275,276,112409,208931,2015-09-10 00:00:00,80,Fast,45.0,Normal,65,Mixed,...,50,Normal,Organised,25,Deep,55,Press,35,Normal,Cover
1451,1452,112512,8027,2015-09-10 00:00:00,54,Balanced,51.0,Normal,40,Mixed,...,52,Normal,Organised,44,Medium,47,Press,52,Normal,Cover


In [110]:
print('The number of missing values in each column:')
missing_values_count_team_attributes = teamAttributes_df.isnull().sum()
print(missing_values_count_team_attributes)

The number of missing values in each column:
id                                 0
team_fifa_api_id                   0
team_api_id                        0
date                               0
buildUpPlaySpeed                   0
buildUpPlaySpeedClass              0
buildUpPlayDribbling              25
buildUpPlayDribblingClass          0
buildUpPlayPassing                 0
buildUpPlayPassingClass            0
buildUpPlayPositioningClass        0
chanceCreationPassing              0
chanceCreationPassingClass         0
chanceCreationCrossing             0
chanceCreationCrossingClass        0
chanceCreationShooting             0
chanceCreationShootingClass        0
chanceCreationPositioningClass     0
defencePressure                    0
defencePressureClass               0
defenceAggression                  0
defenceAggressionClass             0
defenceTeamWidth                   0
defenceTeamWidthClass              0
defenceDefenderLineClass           0
dtype: int64


In [115]:
subset=teamAttributes_df.loc[:,['buildUpPlayDribblingClass','buildUpPlayDribbling']]
display(subset[subset.isnull().any(axis=1)])
display(subset[subset.isnull().any(axis=1)].count())
display(subset.loc[subset['buildUpPlayDribblingClass'] == 'Little'])
subset_little= subset.loc[subset['buildUpPlayDribblingClass'] == 'Little']
subset_little.dropna(inplace=True)
little_mean=round(subset_little["buildUpPlayDribbling"].mean())
print('mean value that is supposed to be filled:', little_mean)
teamAttributes_df["buildUpPlayDribbling"].fillna(little_mean, inplace = True)

Unnamed: 0,buildUpPlayDribblingClass,buildUpPlayDribbling
418,Little,
364,Little,
1155,Little,
693,Little,
1428,Little,
1102,Little,
343,Little,
151,Little,
1235,Little,
926,Little,


buildUpPlayDribblingClass    25
buildUpPlayDribbling          0
dtype: int64

Unnamed: 0,buildUpPlayDribblingClass,buildUpPlayDribbling
834,Little,32.0
1185,Little,31.0
147,Little,24.0
141,Little,29.0
1108,Little,27.0
418,Little,
171,Little,32.0
1383,Little,32.0
364,Little,
983,Little,31.0


mean value that is supposed to be filled: 31.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [118]:
duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_api_id: ' + str(duplicateNumber))

duplicateRowsTeamDF = teamAttributes_df[teamAttributes_df.duplicated(['team_fifa_api_id'])]
duplicateNumber = duplicateRowsTeamDF.shape[0]
print('The number of duplicated rows in team_fifa_api_id: ' + str(duplicateNumber))

print('The number of missing values in each column:')
missing_values_count_team_attributes = teamAttributes_df.isnull().sum()
print(missing_values_count_team_attributes)

teamAttributes_df

The number of duplicated rows in team_api_id: 0
The number of duplicated rows in team_fifa_api_id: 0
The number of missing values in each column:
id                                0
team_fifa_api_id                  0
team_api_id                       0
date                              0
buildUpPlaySpeed                  0
buildUpPlaySpeedClass             0
buildUpPlayDribbling              0
buildUpPlayDribblingClass         0
buildUpPlayPassing                0
buildUpPlayPassingClass           0
buildUpPlayPositioningClass       0
chanceCreationPassing             0
chanceCreationPassingClass        0
chanceCreationCrossing            0
chanceCreationCrossingClass       0
chanceCreationShooting            0
chanceCreationShootingClass       0
chanceCreationPositioningClass    0
defencePressure                   0
defencePressureClass              0
defenceAggression                 0
defenceAggressionClass            0
defenceTeamWidth                  0
defenceTeamWidthClass     

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
75,76,1,9825,2015-09-10 00:00:00,59,Balanced,51.0,Normal,30,Short,...,46,Normal,Free Form,51,Medium,44,Press,52,Normal,Cover
81,82,2,10252,2015-09-10 00:00:00,63,Balanced,37.0,Normal,54,Mixed,...,38,Normal,Organised,35,Medium,44,Press,54,Normal,Cover
177,178,3,8655,2015-09-10 00:00:00,60,Balanced,60.0,Normal,65,Mixed,...,42,Normal,Organised,50,Medium,50,Press,57,Normal,Cover
202,203,4,8559,2015-09-10 00:00:00,57,Balanced,34.0,Normal,61,Mixed,...,38,Normal,Organised,39,Medium,48,Press,58,Normal,Cover
311,312,5,8455,2015-09-10 00:00:00,67,Fast,41.0,Normal,36,Mixed,...,44,Normal,Organised,39,Medium,41,Press,46,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,65,111989,108893,2015-09-10 00:00:00,56,Balanced,46.0,Normal,54,Mixed,...,43,Normal,Organised,48,Medium,53,Press,57,Normal,Cover
913,914,112225,6269,2014-09-19 00:00:00,66,Balanced,57.0,Normal,56,Mixed,...,57,Normal,Organised,36,Medium,49,Press,53,Normal,Cover
275,276,112409,208931,2015-09-10 00:00:00,80,Fast,45.0,Normal,65,Mixed,...,50,Normal,Organised,25,Deep,55,Press,35,Normal,Cover
1451,1452,112512,8027,2015-09-10 00:00:00,54,Balanced,51.0,Normal,40,Mixed,...,52,Normal,Organised,44,Medium,47,Press,52,Normal,Cover


In [121]:
del teamAttributes_df['id']
teamAttributes_df= teamAttributes_df.sort_values(['team_fifa_api_id'])
teamAttributes_df.insert(0, 'team_attributes_id', range(1, len(teamAttributes_df)+1))
teamAttributes_df

Unnamed: 0,team_attributes_id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
75,1,1,9825,2015-09-10 00:00:00,59,Balanced,51.0,Normal,30,Short,...,46,Normal,Free Form,51,Medium,44,Press,52,Normal,Cover
81,2,2,10252,2015-09-10 00:00:00,63,Balanced,37.0,Normal,54,Mixed,...,38,Normal,Organised,35,Medium,44,Press,54,Normal,Cover
177,3,3,8655,2015-09-10 00:00:00,60,Balanced,60.0,Normal,65,Mixed,...,42,Normal,Organised,50,Medium,50,Press,57,Normal,Cover
202,4,4,8559,2015-09-10 00:00:00,57,Balanced,34.0,Normal,61,Mixed,...,38,Normal,Organised,39,Medium,48,Press,58,Normal,Cover
311,5,5,8455,2015-09-10 00:00:00,67,Fast,41.0,Normal,36,Mixed,...,44,Normal,Organised,39,Medium,41,Press,46,Normal,Cover
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,281,111989,108893,2015-09-10 00:00:00,56,Balanced,46.0,Normal,54,Mixed,...,43,Normal,Organised,48,Medium,53,Press,57,Normal,Cover
913,282,112225,6269,2014-09-19 00:00:00,66,Balanced,57.0,Normal,56,Mixed,...,57,Normal,Organised,36,Medium,49,Press,53,Normal,Cover
275,283,112409,208931,2015-09-10 00:00:00,80,Fast,45.0,Normal,65,Mixed,...,50,Normal,Organised,25,Deep,55,Press,35,Normal,Cover
1451,284,112512,8027,2015-09-10 00:00:00,54,Balanced,51.0,Normal,40,Mixed,...,52,Normal,Organised,44,Medium,47,Press,52,Normal,Cover


In [134]:
teams1=team_df.loc[:,'team_fifa_api_id']
teams2=teamAttributes_df.loc[:,'team_fifa_api_id']
differ1= teams1[~teams1.isin(teams2)]
print('number of teams present in team_df and not present in teamAttributes_df= ', differ1.count())
differ2= teams2[~teams2.isin(teams1)]
print('number of teams present in teamAttributes_df and not present in team_df= ', differ2.count())

number of teams present in team_df and not present in teamAttributes_df=  0
number of teams present in teamAttributes_df and not present in team_df=  0
