# **Loading the dataset**

***


### Importing all modules and packages that will be used

In [2]:
import pandas as pd
import numpy as np
# % matplotlib inline

import sqlite3
import pprint
from sqlalchemy import create_engine

### Connecting to the SQLite database

In [3]:
# Single source of truth for the database file, so that both connections
# below are guaranteed to open the same SQLite file
DB_PATH = 'database.sqlite'

# sqlite3 connection and cursor: used to query the database with traditional SQL code
load = sqlite3.connect(DB_PATH)
cursor = load.cursor()

# SQLAlchemy engine: lets pandas read the database tables into dataframes
# so they can be analyzed with python code and packages
engine = create_engine('sqlite:///' + DB_PATH)

### Listing all tables in the database

In [4]:
# Listing every table registered in the database's schema table.
# The literal 'table' is single-quoted: in SQL, double quotes denote identifiers,
# and SQLite only accepts "table" as a string through a legacy fallback.
cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
pprint.pprint(cursor.fetchall())

[('sqlite_sequence',),
 ('Player_Attributes',),
 ('Player',),
 ('Match',),
 ('League',),
 ('Country',),
 ('Team',),
 ('Team_Attributes',)]


Although the list above contains 8 entries, the database holds only 7 data tables. The first entry, `sqlite_sequence`, is an internal table that [SQLite](https://www.sqlite.org/autoinc.html#:~:text=The%20sqlite_sequence%20table%20is%20created,%2C%20INSERT%2C%20and%20DELETE%20statements.) creates and maintains automatically to keep track of the AUTOINCREMENT counters of the other tables.

### Creating dataframes for each of the 7 tables

In [49]:
# Reading each of the 7 data tables into its own dataframe.
# The table names are listed in the same order as the target variables.
(
    df_league,
    df_player,
    df_match,
    df_country,
    df_team,
    df_team_attr,
    df_player_attr,
) = (
    pd.read_sql_table(table_name, engine)
    for table_name in (
        "League",
        "Player",
        "Match",
        "Country",
        "Team",
        "Team_Attributes",
        "Player_Attributes",
    )
)

# The sqlite3 cursor is deliberately left open here
# cursor.close()

*From this point on, I try to run my analysis in both Python and SQL code.*

# **Investigating The Dataset**

---


## **1. `Match` Table**

In [53]:
# Previewing the first 5 rows of the Match table
df_match.head(n=5)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


### Removing Columns

The `Match` table is very large and contains many columns that I do not need for my current analysis (especially the betting odds columns), so I drop all irrelevant columns.

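For starters, I will be removing:
1. all columns that have **api** in their name
2. all columns that have **365** in their name (the Bet365 betting odds columns, e.g. `B365H`)
3. all columns with exactly 3 characters (these are the remaining betting odds columns)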

In [50]:
# Storing the table columns in a variable
match_columns = df_match.columns

# Creating a list of all columns to be deleted:
#   * columns with 'api' in their name
#   * columns with '365' in their name (5 characters long, so the length rule below misses them)
#   * columns whose name is exactly 3 characters long
# `in` is used rather than `str.find(...) > 0`, because `find` returns 0 for a
# name that *starts* with the substring, which would silently skip that column.
match_columns_to_delete = [
    column
    for column in match_columns
    if 'api' in column or '365' in column or len(column) == 3
]

# Displaying the list to check the selection before anything is dropped
match_columns_to_delete


['match_api_id',
 'home_team_api_id',
 'away_team_api_id',
 'B365H',
 'B365D',
 'B365A',
 'BWH',
 'BWD',
 'BWA',
 'IWH',
 'IWD',
 'IWA',
 'LBH',
 'LBD',
 'LBA',
 'PSH',
 'PSD',
 'PSA',
 'WHH',
 'WHD',
 'WHA',
 'SJH',
 'SJD',
 'SJA',
 'VCH',
 'VCD',
 'VCA',
 'GBH',
 'GBD',
 'GBA',
 'BSH',
 'BSD',
 'BSA']

In [55]:
# Dropping unwanted columns
df_match_ed = df_match.drop(columns=match_columns_to_delete)

# Confirming changes
df_match_ed.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,home_team_goal,away_team_goal,home_player_X1,home_player_X2,...,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,1,1,,,...,,,,,,,,,,
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,0,0,,,...,,,,,,,,,,
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,0,3,,,...,,,,,,,,,,
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,5,0,,,...,,,,,,,,,,
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,1,3,,,...,,,,,,,,,,


In [56]:
# Counting the missing values in every column of the trimmed Match table
df_match_ed.isnull().sum()

id                0
country_id        0
league_id         0
season            0
stage             0
              ...  
foulcommit    11762
card          11762
cross         11762
corner        11762
possession    11762
Length: 82, dtype: int64

In [41]:
# Listing every column name of the original (untrimmed) Match table
df_match.columns.tolist()

['id',
 'country_id',
 'league_id',
 'season',
 'stage',
 'date',
 'match_api_id',
 'home_team_api_id',
 'away_team_api_id',
 'home_team_goal',
 'away_team_goal',
 'home_player_X1',
 'home_player_X2',
 'home_player_X3',
 'home_player_X4',
 'home_player_X5',
 'home_player_X6',
 'home_player_X7',
 'home_player_X8',
 'home_player_X9',
 'home_player_X10',
 'home_player_X11',
 'away_player_X1',
 'away_player_X2',
 'away_player_X3',
 'away_player_X4',
 'away_player_X5',
 'away_player_X6',
 'away_player_X7',
 'away_player_X8',
 'away_player_X9',
 'away_player_X10',
 'away_player_X11',
 'home_player_Y1',
 'home_player_Y2',
 'home_player_Y3',
 'home_player_Y4',
 'home_player_Y5',
 'home_player_Y6',
 'home_player_Y7',
 'home_player_Y8',
 'home_player_Y9',
 'home_player_Y10',
 'home_player_Y11',
 'away_player_Y1',
 'away_player_Y2',
 'away_player_Y3',
 'away_player_Y4',
 'away_player_Y5',
 'away_player_Y6',
 'away_player_Y7',
 'away_player_Y8',
 'away_player_Y9',
 'away_player_Y10',
 'away_player

In [101]:
# Selecting the teams whose team_fifa_api_id is missing
df_team[df_team['team_fifa_api_id'].isnull()]

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
8,9,7947,,FCV Dender EH,DEN
14,15,4049,,Tubize,TUB
170,26561,6601,,FC Volendam,VOL
204,34816,177361,,Termalica Bruk-Bet Nieciecza,TBN
208,35286,7992,,Trofense,TRO
213,35291,10213,,Amadora,AMA
223,36248,9765,,Portimonense,POR
225,36723,4064,,Feirense,FEI
232,38789,6367,,Uniao da Madeira,MAD
233,38791,188163,,Tondela,TON


## SQL query to find teams with a missing `team_fifa_api_id`

In [100]:
# Same check in SQL: teams whose team_fifa_api_id is NULL.
# The query is written as one ordinary string literal; the previous form was a
# malformed triple-quoted string that only parsed because Python concatenates
# adjacent string literals.
cursor.execute("SELECT * FROM Team WHERE team_fifa_api_id is Null")
pprint.pprint(cursor.fetchall())

[(9, 7947, None, 'FCV Dender EH', 'DEN'),
 (15, 4049, None, 'Tubize', 'TUB'),
 (26561, 6601, None, 'FC Volendam', 'VOL'),
 (34816, 177361, None, 'Termalica Bruk-Bet Nieciecza', 'TBN'),
 (35286, 7992, None, 'Trofense', 'TRO'),
 (35291, 10213, None, 'Amadora', 'AMA'),
 (36248, 9765, None, 'Portimonense', 'POR'),
 (36723, 4064, None, 'Feirense', 'FEI'),
 (38789, 6367, None, 'Uniao da Madeira', 'MAD'),
 (38791, 188163, None, 'Tondela', 'TON'),
 (51606, 7896, None, 'Lugano', 'LUG')]


In [None]:
# Alternative checks, currently disabled:
# df_player.nunique()
# df_player.duplicated().sum()

# Null count per column (evaluated, but only the cell's last expression is rendered)
df_player.isnull().sum()

# Flagging every row whose player_name occurs more than once;
# keep=False marks all occurrences, not just the repeats
shares_name = df_player.duplicated(subset=['player_name'], keep=False)
df_player.loc[shares_name]