In [48]:
import pandas as pd

# Reading the Sports_Analytics Dataset (Football)
df = pd.read_csv('Sports_Dataset.csv')

In [49]:
# Analyze the Data Structure

# Display basic information and summary statistics
print(df.info())
print(df.describe(include='all'))  # include='all' gives statistics for categorical and numerical columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   MatchID                100000 non-null  int64 
 1   Team1                  100000 non-null  object
 2   Team2                  100000 non-null  object
 3   Team1_Score            100000 non-null  int64 
 4   Team2_Score            100000 non-null  int64 
 5   Date                   100000 non-null  object
 6   Location               100000 non-null  object
 7   Weather                100000 non-null  object
 8   Attendance             100000 non-null  int64 
 9   Team1_Possession       100000 non-null  int64 
 10  Team2_Possession       100000 non-null  int64 
 11  Shots_On_Target_Team1  100000 non-null  int64 
 12  Shots_On_Target_Team2  100000 non-null  int64 
 13  Fouls_Team1            100000 non-null  int64 
 14  Fouls_Team2            100000 non-null  int64 
 15  Y

In [57]:
import numpy as np
import pandas as pd
from math import ceil

num_records = 1_000_000

# Let's say you want to repeat each date roughly 100 times to cover the million entries
date_repeats = ceil(num_records / 1000)  # Total period of unique dates

synthetic_data = {
    'MatchID': np.arange(10001, 10001 + num_records),
    'Team1': np.random.choice(df['Team1'].unique(), size=num_records),
    'Team2': np.random.choice(df['Team2'].unique(), size=num_records),
    'Team1_Score': np.random.randint(0, df['Team1_Score'].max() + 1, size=num_records),
    'Team2_Score': np.random.randint(0, df['Team2_Score'].max() + 1, size=num_records),
    'Date': pd.date_range(start='2020-01-01', periods=1000).repeat(date_repeats)[:num_records].strftime('%Y-%m-%d'),
    'Location': np.random.choice(df['Location'].unique(), size=num_records),
    'Weather': np.random.choice(df['Weather'].unique(), size=num_records),
    'Attendance': np.random.randint(df['Attendance'].min(), df['Attendance'].max() + 1, size=num_records),
    'Team1_Possession': np.random.randint(40, 60, size=num_records),
    'Team2_Possession': np.random.randint(40, 60, size=num_records),
    'Shots_On_Target_Team1': np.random.randint(0, 20, size=num_records),
    'Shots_On_Target_Team2': np.random.randint(0, 20, size=num_records),
    'Fouls_Team1': np.random.randint(0, 15, size=num_records),
    'Fouls_Team2': np.random.randint(0, 15, size=num_records),
    'Yellow_Cards_Team1': np.random.randint(0, 5, size=num_records),
    'Yellow_Cards_Team2': np.random.randint(0, 5, size=num_records),
    'Red_Cards_Team1': np.random.randint(0, 3, size=num_records),
    'Red_Cards_Team2': np.random.randint(0, 3, size=num_records),
    'PlayerID': np.random.randint(1, 500, size=num_records),
    'PlayerName': np.random.choice(['Player' + str(i) for i in range(1, 501)], size=num_records),
    'Position': np.random.choice(['Goalkeeper', 'Defender', 'Midfielder', 'Forward'], size=num_records),
    'Goals': np.random.randint(0, 4, size=num_records),
    'Assists': np.random.randint(0, 3, size=num_records),
    'Minutes_Played': np.random.randint(10, 90, size=num_records)
}

# Convert to DataFrame
synthetic_df = pd.DataFrame(synthetic_data)


In [58]:
# Save to CSV
synthetic_df.to_csv('Synthetic_Sports_Dataset.csv', index=False)

In [53]:
print(synthetic_df)

        MatchID                Team1                Team2  Team1_Score  \
0         10001            Liverpool             AC Milan            4   
1         10002          Real Madrid  Paris Saint-Germain            3   
2         10003              Chelsea              Chelsea            3   
3         10004              Chelsea             Juventus            3   
4         10005              Chelsea        Bayern Munich            4   
...         ...                  ...                  ...          ...   
999995  1009996            Liverpool  Paris Saint-Germain            1   
999996  1009997  Paris Saint-Germain  Paris Saint-Germain            3   
999997  1009998            Liverpool            Liverpool            0   
999998  1009999    Manchester United              Arsenal            4   
999999  1010000          Real Madrid        Bayern Munich            3   

        Team2_Score        Date  Location   Weather  Attendance  \
0                 3  2016-01-01  Stadium1   

In [59]:
# Identify and Handle Missing Values

# Identify columns with missing values
print(synthetic_df.isnull().sum())


MatchID                  0
Team1                    0
Team2                    0
Team1_Score              0
Team2_Score              0
Date                     0
Location                 0
Weather                  0
Attendance               0
Team1_Possession         0
Team2_Possession         0
Shots_On_Target_Team1    0
Shots_On_Target_Team2    0
Fouls_Team1              0
Fouls_Team2              0
Yellow_Cards_Team1       0
Yellow_Cards_Team2       0
Red_Cards_Team1          0
Red_Cards_Team2          0
PlayerID                 0
PlayerName               0
Position                 0
Goals                    0
Assists                  0
Minutes_Played           0
dtype: int64


In [60]:

# Compare imputation methods for a sample column, e.g., 'Attendance'
from sklearn.impute import SimpleImputer, KNNImputer

# Mean imputation
mean_imputer = SimpleImputer(strategy='mean')
synthetic_df['Attendance'] = mean_imputer.fit_transform(synthetic_df['Attendance'].values.reshape(-1, 1))

# KNN imputation for more complex scenarios
knn_imputer = KNNImputer(n_neighbors=5)
synthetic_df[['Attendance', 'Team1_Score']] = knn_imputer.fit_transform(synthetic_df[['Attendance', 'Team1_Score']])



In [61]:
# Remove rows where any of the cells in that row is NA
df_cleaned = synthetic_df.dropna()

# Display new count of rows to show how many were removed
print("Number of rows after removing nulls:", df_cleaned.shape[0])

# Save the cleaned data if necessary
df_cleaned.to_csv('Cleaned_Sports_Dataset.csv', index=False)

Number of rows after removing nulls: 1000000


In [63]:
#Problem Statement 3: Detect and Correct Anomalies


from scipy import stats
import numpy as np

# Z-score method
z_scores = np.abs(stats.zscore(df['Attendance']))
df = df[z_scores < 3]  # Filter out outliers

# IQR method
Q1 = df['Attendance'].quantile(0.25)
Q3 = df['Attendance'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Attendance'] < (Q1 - 1.5 * IQR)) | (df['Attendance'] > (Q3 + 1.5 * IQR)))]

# Document the results of anomaly corrections
print("Data after anomaly correction:", df.describe())


Data after anomaly correction:              MatchID    Team1_Score   Team2_Score    Attendance  \
count  100000.000000  100000.000000  100000.00000  100000.00000   
mean    50000.500000       2.006160       2.00120   27491.09053   
std     28867.657797       1.414886       1.41653   13009.87868   
min         1.000000       0.000000       0.00000    5000.00000   
25%     25000.750000       1.000000       1.00000   16279.75000   
50%     50000.500000       2.000000       2.00000   27429.00000   
75%     75000.250000       3.000000       3.00000   38740.00000   
max    100000.000000       4.000000       4.00000   49999.00000   

       Team1_Possession  Team2_Possession  Shots_On_Target_Team1  \
count     100000.000000     100000.000000           100000.00000   
mean          49.517530         49.513320                9.51954   
std            5.761111          5.761345                5.77318   
min           40.000000         40.000000                0.00000   
25%           45.000000  

In [None]:
#Problem Statement 4: Perform Comprehensive EDA

import seaborn as sns
import matplotlib.pyplot as plt

# Advanced visualizations
sns.pairplot(df.select_dtypes(include=['float64', 'int64']))  # Pair plots for numerical data
plt.show()

# Heatmap for correlations
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.show()

# Box plots for distributions
sns.boxplot(x='Team1_Score', y='Attendance', data=df)
plt.show()

# Generate detailed EDA report
# Documentation should include figures, insights from plots, and potential impacts on the project
