# DATA CLEANING

In [1]:
import pandas as pd
import numpy as np

## Data Gathering

In [2]:
# Match results are read straight from the project's GitHub repository.
results_csv_link = "https://raw.githubusercontent.com/bechosen-spec/Women-Football-Result-Prediction/main/results.csv"
football_df = pd.read_csv(filepath_or_buffer=results_csv_link)

# Preview the first five rows.
football_df.head(5)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1969-11-01,Italy,France,1,0,Euro,Novara,Italy,False
1,1969-11-01,Denmark,England,4,3,Euro,Aosta,Italy,True
2,1969-11-02,England,France,2,0,Euro,Turin,Italy,True
3,1969-11-02,Italy,Denmark,3,1,Euro,Turin,Italy,False
4,1975-08-25,Thailand,Australia,3,2,AFC Championship,Hong Kong,Hong Kong,True


In [3]:
# Inspect the dtype of every column as pandas inferred it when reading the CSV.
football_df.dtypes

date          object
home_team     object
away_team     object
home_score     int64
away_score     int64
tournament    object
city          object
country       object
neutral         bool
dtype: object

## Data Cleaning

In [4]:
# Count missing entries per column (isna is the canonical spelling of isnull).
football_df.isna().sum()

date          0
home_team     0
away_team     0
home_score    0
away_score    0
tournament    0
city          0
country       0
neutral       0
dtype: int64

In [5]:
# Number of rows that exactly repeat an earlier row (first occurrence is not counted).
football_df.duplicated(keep='first').sum()

0

In [6]:
# Explicit target dtypes for the results table.
# - 'datetime64[ns]' spells out the unit: pandas >= 2.0 raises TypeError when
#   astype is given the unit-less 'datetime64' alias.
# - 'int64' pins the width that read_csv already produced; plain 'int' follows
#   the platform's default integer, which is not 64-bit everywhere.
# - 'neutral' is not listed, so it keeps its current bool dtype.
new_dtype = {
    'date': 'datetime64[ns]',
    'home_team': 'string',
    'away_team': 'string',
    'home_score': 'int64',
    'away_score': 'int64',
    'tournament': 'string',
    'city': 'string',
    'country': 'string',
}

# astype returns a new frame; re-running this cell is safe because casting an
# already-converted column to the same dtype leaves it unchanged.
football_df = football_df.astype(new_dtype)

# Check the result: every listed column should now report its new dtype.
football_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4884 entries, 0 to 4883
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        4884 non-null   datetime64[ns]
 1   home_team   4884 non-null   string        
 2   away_team   4884 non-null   string        
 3   home_score  4884 non-null   int64         
 4   away_score  4884 non-null   int64         
 5   tournament  4884 non-null   string        
 6   city        4884 non-null   string        
 7   country     4884 non-null   string        
 8   neutral     4884 non-null   bool          
dtypes: bool(1), datetime64[ns](1), int64(2), string(5)
memory usage: 310.1 KB


In [7]:
# Shootouts dataset, read from the same GitHub repository as the match results.
shootouts_csv_link = "https://raw.githubusercontent.com/bechosen-spec/Women-Football-Result-Prediction/main/shootouts.csv"
shootouts_df = pd.read_csv(filepath_or_buffer=shootouts_csv_link)

# Preview the first five rows.
shootouts_df.head(5)

Unnamed: 0,date,home_team,away_team,winner
0,1995-06-13,Sweden,China PR,China PR
1,1999-07-10,Brazil,Norway,Brazil
2,1999-07-10,United States,China PR,United States
3,2009-03-12,France,New Zealand,France
4,2011-07-09,England,France,France


In [8]:
# Summarise the shootouts frame: row count, per-column non-null counts and dtypes.
shootouts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       26 non-null     object
 1   home_team  26 non-null     object
 2   away_team  26 non-null     object
 3   winner     26 non-null     object
dtypes: object(4)
memory usage: 960.0+ bytes


In [9]:
# Target dtypes for the shootouts table. The date column uses the explicit
# 'datetime64[ns]' spelling because pandas >= 2.0 raises TypeError when astype
# is given the unit-less 'datetime64' alias.
newdtype = {
    'date': 'datetime64[ns]',
    'home_team': 'string',
    'away_team': 'string',
    'winner': 'string',
}

# astype returns a new frame; the name is rebound to the converted copy.
shootouts_df = shootouts_df.astype(newdtype)

# Confirm the conversion took effect.
shootouts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       26 non-null     datetime64[ns]
 1   home_team  26 non-null     string        
 2   away_team  26 non-null     string        
 3   winner     26 non-null     string        
dtypes: datetime64[ns](1), string(3)
memory usage: 960.0 bytes


In [10]:
# Export the cleaned frames to CSV files in the working directory.
# index=False keeps the RangeIndex out of the files. CSV is plain text, so the
# dtypes are not recorded; whoever loads these files has to parse them again.
for frame, out_path in (
    (football_df, 'cleaned_results.csv'),
    (shootouts_df, 'cleaned_shootouts.csv'),
):
    frame.to_csv(out_path, index=False)