In [None]:
import pandas as pd
df=pd.read_csv('/content/sports_dataset_uncleaned.csv')

Identify and remove duplicate rows in the dataset.

In [None]:
t=df[df.duplicated()]
df = df.drop_duplicates()
t



Unnamed: 0,Player_ID,Player_Name,Gender,Age,Country,Team,Position,Sport_Type,Match_ID,Tournament,...,Bonus,Sponsorship_Amount,Ticket_Sales,Fan_ID,Ticket_Type,Payment_Mode,Feedback_Score,Coach_Name,Win_Status,Weather


Identify and remove duplicate Player_ID values.



In [None]:
df[df['Player_ID'].duplicated()]
df = df.drop_duplicates(subset='Player_ID')

Identify and remove duplicate Match_ID entries.

In [None]:
df[df['Match_ID'].duplicated()]
df = df.drop_duplicates(subset='Match_ID')

Identify duplicate player names with different countries (possible mismatched records).



In [None]:
t = df.groupby('Player_Name')['Country'].nunique()
u = t[t > 1]
u


Unnamed: 0_level_0,Country
Player_Name,Unnamed: 1_level_1
Amit,30
Carlos,33
David,32
John,33
Liu,38
Maria,28
Mike,36
Priya,28
Sara,33


Fill missing Coach_Name values with "Unknown Coach".



In [None]:
df['Coach_Name'] = df['Coach_Name'].fillna('Unknown Coach')


Fill missing Country values using the team’s most common country.

In [None]:
t = df.groupby('Team')['Country'].agg(pd.Series.mode)
df['Country'] = df['Country'].fillna(df['Team'].map(t))



Fill missing Feedback_Score values with the median feedback per sport type.



In [None]:
t = df.groupby('Sport_Type')['Feedback_Score'].median()
df['Feedback_Score'] = df['Feedback_Score'].fillna(df['Sport_Type'].map(t))

Fill missing Bonus values with the average bonus of the player’s team.



In [None]:
a = df.groupby('Team')['Bonus'].mean()
df['Bonus'] = df['Bonus'].fillna(df['Team'].map(a))



Fill missing Sport_Type with "Other".


In [None]:
c = 'Other'
df['Sport_Type'] = df['Sport_Type'].fillna(c)


Fill missing Venue values with "Unspecified".

In [None]:
h = 'Unspecified'
df['Venue'] = df['Venue'].fillna(h)


Remove rows where Match_Fee is less than or equal to 0.



In [None]:
df = df[df['Match_Fee'] > 0].reset_index(drop=True)


Remove rows where Bonus is negative.

In [None]:
n = df['Bonus'] >= 0
df = df[n]


Remove rows where Age < 10 or Age > 60.

In [None]:
t = (df['Age'] >= 10) & (df['Age'] <= 60)
df = df[t]


Remove rows where Feedback_Score is outside the range 1–5.



In [None]:
u = (df['Feedback_Score'] >= 1) & (df['Feedback_Score'] <= 5)
df = df[u]


Remove rows where Ticket_Sales > 50,000 (possible entry error).

In [None]:
a = df['Ticket_Sales'] <= 50000
df = df[a].reset_index(drop=True)


Fix invalid Match_Date formats (mixed date formats).




In [None]:
d = pd.to_datetime(df['Match_Date'], errors='coerce')
df['Match_Date'] = d.dt.strftime('%Y-%m-%d')


Correct typos in Gender column (Mle, femal, mal, etc.).

In [None]:
h = {'male': 'Male', 'MALE': 'Male', 'Xale': 'Male', 'Yale': 'Male',
     'Zale': 'Male', 'XALE': 'Male',
    'female': 'Female', 'FEMALE': 'Female', 'Xemale': 'Female',
    'Yemale': 'Female', 'Zemale': 'Female', 'XEMALE': 'Female',
    'ZEMALE': 'Female',
    'other': 'Other', 'OTHER': 'Other', 'Yther': 'Other',
    'Zther': 'Other', 'Xther': 'Other', 'YTHER': 'Other'}

df['Gender'] = df['Gender'].replace(h)


Correct typos in Payment_Mode column (creadit card, upiid, etc.).



In [None]:
n = {'Credit Card': 'Credit Card', 'credit card': 'Credit Card',
    'CREDIT CARD': 'Credit Card', 'Xredit Xard': 'Credit Card',
    'Yredit Yard': 'Credit Card', 'Zredit Zard': 'Credit Card',
    'XREDIT XARD': 'Credit Card', 'YREDIT YARD': 'Credit Card',
    'ZREDIT ZARD': 'Credit Card',

    'Cash': 'Cash', 'cash': 'Cash', 'CASH': 'Cash',
    'Xash': 'Cash', 'Yash': 'Cash', 'Zash': 'Cash',
    'YASH': 'Cash', 'ZASH': 'Cash',

    'UPI': 'UPI', 'upi': 'UPI',
    'XPI': 'UPI', 'YPI': 'UPI', 'ZPI': 'UPI',
    'Xpi': 'UPI', 'Ypi': 'UPI',

    'Online': 'Online', 'online': 'Online',
    'ONLINE': 'Online', 'Xnline': 'Online',
    'Ynline': 'Online', 'Znline': 'Online',
    'YNLINE': 'Online'}
df['Payment_Mode'] = df['Payment_Mode'].replace(n)


Standardize Win_Status values to ["Win", "Lose", "Draw"].

In [None]:
h = {'Win': 'Win', 'WIN': 'Win', 'win': 'Win',
    'Zin': 'Win', 'Yin': 'Win', 'Xin': 'Win',
    'ZIN': 'Win', 'YIN': 'Win',

    'Lose': 'Lose', 'lose': 'Lose', 'LOSE': 'Lose',
    'Xose': 'Lose', 'Yose': 'Lose', 'Zose': 'Lose',
    'XOSE': 'Lose', 'YOSE': 'Lose', 'ZOSE': 'Lose',

    'Draw': 'Draw', 'draw': 'Draw', 'DRAW': 'Draw',
    'Xraw': 'Draw', 'Yraw': 'Draw', 'Zraw': 'Draw',
    'XRAW': 'Draw', 'YRAW': 'Draw'}
df['Win_Status'] = df['Win_Status'].replace(h)


Replace unrealistic Sponsorship_Amount values (< 0 or > 1,000,000) with median

In [None]:
t = df['Sponsorship_Amount'].median()
df['Sponsorship_Amount'] = df['Sponsorship_Amount'].where((df['Sponsorship_Amount'] >= 0) & (df['Sponsorship_Amount'] <= 1_000_000),t)


Fetch all matches where Match_Fee > 4000.



In [None]:
t = df['Match_Fee'] > 4000
df[t]


Unnamed: 0,Player_ID,Player_Name,Gender,Age,Country,Team,Position,Sport_Type,Match_ID,Tournament,...,Bonus,Sponsorship_Amount,Ticket_Sales,Fan_ID,Ticket_Type,Payment_Mode,Feedback_Score,Coach_Name,Win_Status,Weather
4,P4657,Liu,Other,25,Germany,Team C,Forward,Basketball,M55549,World Cup,...,28.75,9181.16,2383.16,F24847,VVIP,Credit Card,3,Brown,Win,Windy
9,P7912,Mike,Other,40,South Africa,Team B,Bowler,Cricket,M60157,Friendly,...,220.50,8815.61,9130.40,F29794,VIP,Online,2,Lee,Lose,Sunny
12,P2535,John,Female,25,South Africa,Team A,Goalkeeper,basketball,M65085,League,...,677.79,8726.81,6165.10,F66389,VIP,Cash,1,Patel,Draw,Windy
15,P9279,Liu,Other,35,South Africa,Team D,Defender,Tennis,M77300,Friendly,...,347.86,1833.51,8499.91,F13053,Regular,Credit Card,4,Lee,Draw,Windy
20,P4611,David,Other,36,Australia,Team B,Bowler,Cricket,M49747,Championship,...,1283.58,4019.68,225.84,F70378,VVIP,Cash,2,Patel,Lose,Rainy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792,P6789,David,Female,31,south africa,Team A,Defender,Tennis,M42006,League,...,505.63,2846.65,242.54,F27622,VVIP,Credit Card,4,Anderson,Lose,Cloudy
5804,P7360,Liu,Other,33,Brazil,Team B,Bowler,Basketball,M43247,League,...,1437.69,8562.36,8988.04,F53018,VVIP,Online,3,Rahul,Draw,Cloudy
5808,P8447,Amit,Other,33,UK,Team B,Bowler,Football,M22695,World Cup,...,1236.86,6860.10,7034.32,F24884,VVIP,Online,2,Lee,Lose,Sunny
5814,P2134,Mike,Female,18,Australia,Team E,Batsman,Cricket,M89954,League,...,1325.53,8832.33,6070.75,F86187,Regular,Online,3,Rahul,Draw,Sunny


Fetch all players with Feedback_Score < 3.



In [None]:
f=df[df['Feedback_Score']<3]
print(f)


     Player_ID Player_Name  Gender  Age       Country    Team    Position  \
1        P1409       Maria  Female   34       Germany  Team D  Goalkeeper   
6        P2679      Carlos    Male   43        Brazil  Team E     Batsman   
7        P9935        John    Male   35       Germany  Team B     Batsman   
8        P2424       Maria   Other   21           USA  Team E  Goalkeeper   
9        P7912        Mike   Other   40  South Africa  Team B      Bowler   
...        ...         ...     ...  ...           ...     ...         ...   
5819     P1880       Priya    Male   18            UK  Team A    Defender   
5820     P8021       Maria   Other   26        brazil  Team A      Bowler   
5824     P3632        Mike  Female   30        Brazil  Team C  Goalkeeper   
5825     P8156       Maria     NaN   38         India  Team D     Forward   
5826     P6811       David   Other   28            UK  Team B     Forward   

      Sport_Type Match_ID    Tournament  ...    Bonus Sponsorship_Amount  \

Fetch all players from India who played more than 10 matches.



In [None]:
p=df[(df['Score']>100) & (df['Country']=='india')]
print(p)

     Player_ID Player_Name  Gender  Age Country    Team    Position  \
968      P7464        Mike  Female   19   india  Team A      Bowler   
1557     P3102       David  Female   36   india  Team C    Defender   
2427     P5693        Sara  Female   30   india  Team E    Defender   
4980     P3401        John  Female   26   india  Team D     Batsman   
4987     P5320        Amit  Female   31   india  Team C     Batsman   
5243     P8559        Mike  Female   39   india  Team D    Defender   
5540     P5620       Maria     NaN   23   india  Team A  Goalkeeper   

      Sport_Type Match_ID    Tournament  ...    Bonus Sponsorship_Amount  \
968       Tennis   M74239  Championship  ...  1994.86            6969.03   
1557  Basketball   M71460        League  ...   555.92            6111.83   
2427  Basketball   M39193     World Cup  ...  1791.03            7714.44   
4980      Tennis   M33190     World Cup  ...  1771.98            9516.57   
4987    Football   M14764  Championship  ...  1964.

Fetch all VIP ticket buyers where Ticket_Sales > 5000.



In [None]:
t=df[(df['Ticket_Type']=='VIP')&(df['Ticket_Sales']>5000)]
print(t)

     Player_ID Player_Name  Gender  Age       Country    Team    Position  \
8        P2424       Maria   Other   21           USA  Team E  Goalkeeper   
9        P7912        Mike   Other   40  South Africa  Team B      Bowler   
12       P2535        John  Female   25  South Africa  Team A  Goalkeeper   
18       P9928         Liu    Male   38           USA  Team E  Goalkeeper   
21       P8359        Amit   Other   35     Australia  Team E    Defender   
...        ...         ...     ...  ...           ...     ...         ...   
5777     P3638        John   Other   21     AUSTRALIA  Team E      Bowler   
5779     P9459       Priya   Other   16     Australia  Team C    Defender   
5796     P5696       David  Female   40           USA  Team D      Bowler   
5802     P7157       Priya    Male   44       Germany  Team C     Forward   
5811     P1513         Liu    Male   35           USA  Team C     Batsman   

      Sport_Type Match_ID    Tournament  ...    Bonus Sponsorship_Amount  \


Fetch all matches where Win_Status = "Win" and Score > 100.



In [None]:
i=df[(df['Win_Status']=='Win')&(df['Score']>100)]
print(i)

     Player_ID Player_Name  Gender  Age       Country    Team    Position  \
4        P4657         Liu   Other   25       Germany  Team C     Forward   
14       P4811        Amit  Female   38           USA  Team B      Bowler   
16       P1434      Carlos  Female   18           USA  Team B     Forward   
68       P8573       Priya  Female   23  South Africa  Team D    Defender   
71       P4598        Amit    Male   16       Germany  Team E  Goalkeeper   
...        ...         ...     ...  ...           ...     ...         ...   
5781     P5765        Amit  Female   29           USA  Team E  Goalkeeper   
5786     P2428        Amit   Other   27  South Africa  Team C     Forward   
5788     P2541       David   Other   41     Australia  Team D  Goalkeeper   
5812     P1375        Mike    Male   44           USA  Team E  Goalkeeper   
5825     P8156       Maria     NaN   38         India  Team D     Forward   

      Sport_Type Match_ID    Tournament  ...    Bonus Sponsorship_Amount  \

Fetch all cricket players with Wickets > 3.




In [None]:
t=df[(df['Player_Name'].notnull())&(df['Wickets']>3)]
print(t)

     Player_ID Player_Name  Gender  Age       Country    Team    Position  \
0        P2824        Mike    Male   21            UK  Team E     Forward   
2        P5506       David    Male   43  South Africa  Team D  Goalkeeper   
10       P1520       Maria    Male   33     Australia  Team B      Bowler   
14       P4811        Amit  Female   38           USA  Team B      Bowler   
32       P2519       Maria   Other   40           USA  Team E     Batsman   
...        ...         ...     ...  ...           ...     ...         ...   
5814     P2134        Mike  Female   18     Australia  Team E     Batsman   
5818     P9597        Amit    Male   44         India  Team E  Goalkeeper   
5823     P8890        John   Other   41           USA  Team A      Bowler   
5824     P3632        Mike  Female   30        Brazil  Team C  Goalkeeper   
5825     P8156       Maria     NaN   38         India  Team D     Forward   

      Sport_Type Match_ID    Tournament  ...    Bonus Sponsorship_Amount  \

Fetch all football players with Goals >= 2.

In [None]:
y=df[(df['Player_Name'].notnull())&(df['Goals']>=2)]
y[['Goals','Player_Name']]

Unnamed: 0,Goals,Player_Name
0,4,Mike
1,4,Maria
2,4,David
3,4,Sara
6,3,Carlos
...,...,...
5819,4,Priya
5820,4,Maria
5822,2,Maria
5823,3,John


Fetch all matches with total Bonus + Match_Fee > 6000.

In [None]:
t=df[df['Bonus']+df['Match_Fee']>6000]
print(t)

     Player_ID Player_Name  Gender  Age    Country    Team    Position  \
38       P1711         Liu   Other   41    Germany  Team D      Bowler   
44       P5803         Liu    Male   29     Brazil  Team B      Bowler   
64       P3803        Sara    Male   42    GERMANY  Team E      Bowler   
69       P7216         Liu   Other   40      India  Team E     Batsman   
86       P3340      Carlos     NaN   22    Germany  Team A     Forward   
...        ...         ...     ...  ...        ...     ...         ...   
5743     P7823       Maria  Female   30      India  Team B    Defender   
5754     P7940       David   Other   28    Germany  Team A  Goalkeeper   
5755     P6014      Carlos   Other   18      INDIA  Team B      Bowler   
5761     P3666       Maria   Other   23        usa  Team C  Goalkeeper   
5766     P1748         Liu  Female   27  Australia  Team A  Goalkeeper   

      Sport_Type Match_ID    Tournament  ...    Bonus Sponsorship_Amount  \
38       Cricket   M55598      Frie

Fetch all players whose average Rating < 4.



In [None]:
t=df.loc[df['Rating']<4,'Rating'].mean()
print(t)

2.485864166244298


Fetch all matches in Rainy weather conditions

In [None]:
t=df[(df['Match_ID'].notnull()) & (df['Weather']=='Rainy')]
print(t[['Match_ID','Weather']])

     Match_ID Weather
7      M10217   Rainy
14     M27004   Rainy
16     M64419   Rainy
17     M82458   Rainy
20     M49747   Rainy
...       ...     ...
5790   M72554   Rainy
5796   M74126   Rainy
5810   M29175   Rainy
5823   M45447   Rainy
5824   M61557   Rainy

[1362 rows x 2 columns]


Concatenate Player id and Gender

In [None]:
t=df['Player_ID']+"-" + df['Gender']
print(t)

0         P2824-Male
1       P1409-Female
2         P5506-Male
3         P5012-Male
4        P4657-Other
            ...     
5822      P6722-Male
5823     P8890-Other
5824    P3632-Female
5825             NaN
5826     P6811-Other
Length: 5827, dtype: object


Concatenate MatchID and FeedbackScore

In [None]:
f=df['Match_ID'].astype(str)+"-" + df['Feedback_Score'].astype(str)
print(f)

0       M65327-5
1       M93279-1
2       M93088-4
3       M84016-5
4       M55549-3
          ...   
5822    M17649-5
5823    M45447-5
5824    M61557-2
5825    M89652-1
5826    M10609-2
Length: 5827, dtype: object


Compain playerID and fanID as unified record

In [None]:
df['unfied_Record']=df['Player_ID']+"-" + df['Fan_ID']
print(df['unfied_Record'])

0       P2824-F83977
1       P1409-F51266
2       P5506-F74976
3       P5012-F36267
4       P4657-F24847
            ...     
5822    P6722-F59608
5823    P8890-F88607
5824    P3632-F62760
5825    P8156-F34685
5826    P6811-F93735
Name: unfied_Record, Length: 5827, dtype: object


Calculate total earning per player (Match_Fee + Bonus + Sponsorship_Amount).



In [None]:
df['g']=df['Match_Fee']+df['Bonus']+df['Sponsorship_Amount']
y=(df[['Player_ID','g']])
print(y)

     Player_ID         g
0        P2824   9674.93
1        P1409   8751.73
2        P5506   6559.52
3        P5012  11808.05
4        P4657  13369.51
...        ...       ...
5822     P6722  10112.50
5823     P8890   9845.48
5824     P3632  12751.89
5825     P8156   5916.25
5826     P6811   7757.14

[5827 rows x 2 columns]


Calculate average rating per sport type.




In [None]:
y=df.groupby('Sport_Type')['Rating'].mean()
print(y)

Sport_Type
BASKETBALL    4.849118
Basketball    5.302352
CRICKET       5.488571
Cricket       5.573162
FOOTBALL      5.645862
Football      5.555507
TENNIS        5.800333
Tennis        5.472500
Xasketball    4.968462
Xennis        4.571667
Xootball      7.031765
Xricket       5.260556
YENNIS        6.320000
Yasketball    4.906000
Yennis        5.643000
Yootball      5.228667
Yricket       7.004375
ZOOTBALL      2.560000
Zasketball    4.361333
Zennis        6.272000
Zootball      5.312353
Zricket       3.973750
basketball    5.305806
cricket       5.173333
football      6.470000
tennis        5.692703
Name: Rating, dtype: float64


Identify players who only played one type of sport.

In [None]:
t=df.groupby('Rating')['Sport_Type'].nunique().eq(1)
print(t,'Rating','Sport_type')

Rating
1.00     False
1.01     False
1.02     False
1.03     False
1.04     False
         ...  
9.96     False
9.97     False
9.98     False
9.99     False
10.00    False
Name: Sport_Type, Length: 900, dtype: bool Rating Sport_type


Identify top 10 players by total sponsorship earnings.




In [None]:
top10=df.groupby('Player_Name')['Sponsorship_Amount'].sum().nlargest(10).reset_index()
print(top10)

  Player_Name  Sponsorship_Amount
0       David          3799915.60
1        Amit          3697906.35
2        Sara          3662246.04
3        Mike          3645019.89
4         Liu          3621170.83
5       Maria          3493804.97
6       Priya          3484091.80
7        John          3423621.47
8      Carlos          3258390.88


Fench all playername who have no coach

In [None]:
t=df['Coach_Name'].isnull()
y=df.loc[t,'Player_Name']
y

Unnamed: 0,Player_Name


Identify venues used by more than one sport type.




In [None]:
t=df.groupby('Venue')['Sport_Type'].nunique()
print(t)

Venue
Arena 2        24
Court 4        22
Ground 3       25
Stadium 1      24
Unspecified    19
Name: Sport_Type, dtype: int64


Add a column Total_Earning = Match_Fee + Bonus + Sponsorship_Amount.




In [None]:
df['Total_Earning']=(df['Match_Fee']+ df['Bonus'] +df['Sponsorship_Amount'])
df[['Total_Earning','Match_ID']]

Unnamed: 0,Total_Earning,Match_ID
0,9674.93,M65327
1,8751.73,M93279
2,6559.52,M93088
3,11808.05,M84016
4,13369.51,M55549
...,...,...
5822,10112.50,M17649
5823,9845.48,M45447
5824,12751.89,M61557
5825,5916.25,M89652


Add a column Performance_Score = Score + (Assists * 2) + (Goals * 3) + (Wickets * 4).



In [None]:
df['Performence_Score']=(df['Score'] +(df['Assists']*2) +(df['Goals']*3)+(df['Wickets']*4))
df['Performence_Score']

Unnamed: 0,Performence_Score
0,45.0
1,84.0
2,128.0
3,46.0
4,137.0
...,...
5822,66.0
5823,31.0
5824,128.0
5825,130.0


Add a column Revenue = Ticket_Sales + Merch_Sales (create Merch_Sales if missing).



In [None]:
if 'Merch_Sales' not in df.columns:
    df['Merch_Sales']=0


In [None]:
df['Revenue']=(df['Ticket_Sales']+df['Merch_Sales'])
df['Revenue']

Unnamed: 0,Revenue
0,3413.96
1,9388.07
2,4737.38
3,9372.68
4,2383.16
...,...
5822,4317.91
5823,3646.70
5824,3381.72
5825,8115.38


Create an Age_Group column:
<20: Junior
20–30: Adult
31–45: Senior

In [None]:
def Age_group(Age):
  if Age <20:
    return 'Junior'
  elif 20<=Age<=30:
    return 'Adult'
  elif 31<=Age<=45:
    return 'Senior'
df['Age_group']=df['Age'].apply(Age_group)

Create a Performance_Level column:
Rating < 4 → “Low”
4–7 → “Medium”
7 → “High”

In [None]:
def Performance_level(Rating):
  if Rating<4:
    return 'low'
  elif 4<=Rating<=7:
    return 'medium'
  else:
    return 'high'
df['Performance_level']=df['Rating'].apply(Performance_level)


Encode Gender numerically (Male = 1, Female = 2, Other = 3).




In [None]:
df['gender_numeric']=df['Gender'].map({'Male':1,'Female':2,'Others':3})

Encode Win_Status numerically (Win = 1, Draw = 0.5, Lose = 0).



In [None]:
df['Win_status_numeric']=df['Win_Status'].map({'Win':1,'Lose':2,'Draw':3})

Standardize Country names (capitalize first letters).



In [None]:
df['Country']=df['Country'].str.title()

Remove extra spaces from all text columns.

In [None]:
col=['Country','Player_Name','Gender','Team','Position','Sport_Type','Tournament','Opponent_Team','Venue','City','Assists','Ticket_Type','Payment_Mode','Coach_Name','Win_Status','Weather']
df[col]=df[col].astype(str)
for c in col:
  df[c]=df[c].str.strip()

Convert all string data to lowercase for text comparison.


In [None]:
for c in df.columns:
  if df[c].dtype=="object":
    df[c]=df[c].astype(str).str.lower()

Find top 5 players by total earnings.



In [None]:
top5 = (df.groupby("Player_ID")["Match_Fee"].sum().sort_values(ascending=False).head(5))
print(top5)

Player_ID
p4245    4999.73
p5092    4998.96
p1701    4998.02
p8216    4998.02
p1060    4997.55
Name: Match_Fee, dtype: float64



Find top 5 teams by total runs or goals scored.



In [None]:
top5_teams = (df.groupby("Team")["Score"].sum().sort_values(ascending=False).head(5))
print(top5_teams)


Team
team e    90524
team a    90420
team b    84721
team d    84287
team c    79017
Name: Score, dtype: int64


Find average feedback score per sport type.



In [None]:
y = (df.groupby("Sport_Type")["Feedback_Score"].mean().reset_index().sort_values("Feedback_Score", ascending=False))
print(y)


    Sport_Type  Feedback_Score
11     yricket        3.625000
10    yootball        3.266667
15     zricket        3.250000
6     xootball        3.235294
14    zootball        3.166667
7      xricket        3.166667
3       tennis        3.037063
2     football        3.032066
5       xennis        3.000000
0   basketball        2.982517
1      cricket        2.962264
4   xasketball        2.615385
12  zasketball        2.600000
9       yennis        2.583333
8   yasketball        2.533333
13      zennis        2.200000


Find average bonus per team.



In [None]:
avg_bonus = (df.groupby("Team")["Bonus"].mean().reset_index().sort_values("Bonus", ascending=False))
print(avg_bonus)


     Team        Bonus
2  team c  1025.523864
1  team b  1005.087877
0  team a   999.295274
3  team d   992.946326
4  team e   992.162723


Find the most common payment mode among fans.



In [None]:
most_common_payment = df["Payment_Mode"].value_counts().idxmax()
print(most_common_payment)

upi


Identify the player with the highest single match score.




In [None]:
top_player= df.loc[df["Score"].idxmax()]
print(top_player["Player_ID"], top_player["Score"])


p9830 149


Find players who have played more than 15 matches.



In [None]:
players_15plus = (df.groupby("Player_ID").size().reset_index(name="Match_count").query("Match_count > 15"))
print(players_15plus)


Empty DataFrame
Columns: [Player_ID, Match_count]
Index: []


Sort players by their total minutes played (descending).



In [None]:
players_sorted = (df.groupby("Player_ID")["Minutes_Played"].sum().reset_index().sort_values("Minutes_Played", ascending=False))
print(players_sorted)


     Player_ID  Minutes_Played
1020     p2629           119.0
4517     p7960           119.0
933      p2486           119.0
4659     p8183           119.0
1036     p2651           119.0
...        ...             ...
3594     p6546             0.0
3595     p6548             0.0
1195     p2876             0.0
2766     p5295             0.0
2881     p5469             0.0

[5827 rows x 2 columns]


Identify the most frequently played venue.



In [None]:
t = df['Venue'].value_counts().head(1)
print(t)


Venue
stadium 1    1411
Name: count, dtype: int64


Group data by Country and Sport_Type, and calculate total ticket sales.




In [None]:
grouped = df.groupby(['Country', 'Sport_Type'])['Ticket_Sales'].sum().reset_index()
print(grouped)


       Country  Sport_Type  Ticket_Sales
0    australia  basketball     979545.35
1    australia     cricket     941828.64
2    australia    football     817791.85
3    australia      tennis    1019670.77
4    australia  xasketball       3960.60
..         ...         ...           ...
172        zsa      tennis      11272.42
173  zustralia  basketball      17699.68
174  zustralia     cricket      13724.13
175  zustralia    football       9917.30
176  zustralia      tennis      26850.39

[177 rows x 3 columns]


Detect inconsistent date formats in Match_Date and fix them to YYYY-MM-DD.



In [None]:
df['Match_Date'] = pd.to_datetime(df['Match_Date'], errors='coerce').dt.strftime('%Y-%m-%d')
df['Match_Date']

Unnamed: 0,Match_Date
0,2023-05-05
1,
2,
3,
4,
...,...
5822,
5823,
5824,2023-08-08
5825,


Identify players who appear with different Countries (data inconsistency).



In [None]:
inconsistent_players = df.groupby('Player_ID')['Country'].nunique().gt(1)

player_names = inconsistent_players[inconsistent_players].index
print(player_names)



Index([], dtype='object', name='Player_ID')


Detect outliers in Sponsorship_Amount using IQR method.



In [None]:
Q1 = df['Sponsorship_Amount'].quantile(0.25)
Q3 = df['Sponsorship_Amount'].quantile(0.75)
IQR = Q3 - Q1

outlier_values = df.loc[
    (df['Sponsorship_Amount'] < Q1 - 1.5 * IQR) |
    (df['Sponsorship_Amount'] > Q3 + 1.5 * IQR),
    'Sponsorship_Amount']

print(outlier_values)


Series([], Name: Sponsorship_Amount, dtype: float64)


Replace blank Player_Name values with "Anonymous".



In [None]:
df['Player_Name'] = df['Player_Name'].replace('', 'Anonymous')
df['Player_Name']


Unnamed: 0,Player_Name
0,mike
1,maria
2,david
3,sara
4,liu
...,...
5822,maria
5823,john
5824,mike
5825,maria



Validate Payment_Mode entries (must be from the list ["Cash", "Credit Card", "UPI", "Online"]).



In [None]:
valid_modes = ["Cash", "Credit Card", "UPI", "Online"]
invalid_payments = df.loc[~df['Payment_Mode'].isin(valid_modes),['Player_Name', 'Payment_Mode']]
print(invalid_payments)


     Player_Name Payment_Mode
0           mike  credit card
1          maria         cash
2          david          upi
3           sara          upi
4            liu  credit card
...          ...          ...
5822       maria          upi
5823        john         cash
5824        mike       online
5825       maria          upi
5826       david       online

[5827 rows x 2 columns]


Identify and fix misspelled Sport_Type (e.g., “footbal”, “basktball”).



In [None]:
valid_sports = ["Cricket", "Football", "Basketball", "Tennis"]
misspelled = df[~df['Sport_Type'].str.title().isin(valid_sports)][['Player_Name', 'Sport_Type']]
print(misspelled)


     Player_Name  Sport_Type
46          mike  zasketball
56          sara      yennis
102        david    xootball
131         john     zricket
149         amit     xricket
...          ...         ...
5638        amit      zennis
5702       priya     xricket
5709        john  yasketball
5716        amit    yootball
5731       maria  zasketball

[169 rows x 2 columns]


Verify all Fan_ID values are unique.



In [None]:
df = df.drop_duplicates(subset='Fan_ID')
df['Fan_ID'].is_unique


True

Remove leading/trailing spaces from Player_Name, Coach_Name, and Country.



In [None]:
df['Player_Name'] = df['Player_Name'].str.strip()
df['Coach_Name'] = df['Coach_Name'].str.strip()
df['Country'] = df['Country'].str.strip()


Convert all Match_Date to datetime and sort chronologically.



In [None]:
df['Match_Date'] = pd.to_datetime(df['Match_Date'], errors='coerce')
df = df.sort_values(by='Match_Date')


Drop all irrelevant columns after cleaning (like Remarks, Seat_Number, etc.).

In [None]:
df=df.drop(columns=['Weather'])



Remove all column with null value from the data set

In [None]:
df=df.dropna(axis=0)

In [None]:
from google.colab import files
df.to_excel("cleaned_file.xlsx", index=False)
files.download("cleaned_file.xlsx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>