In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py 
import plotly.express as px

In [None]:
# Load the batting statistics; every later cell reads from this `ipl` frame.
ipl = pd.read_csv(filepath_or_buffer="ipl.csv")
ipl

Q1. What is the maximum number of matches played by an individual player in a season? Print the player name along with the number of matches played.

In [None]:
# Total matches per player (summed in case a player appears on several rows).
matches_per_player = ipl.groupby("Player")["Matches"].sum()

# Keep every player tied for the maximum instead of an arbitrary top-N cut,
# so the answer is correct whether one player or several share the record.
matches_per_player[matches_per_player == matches_per_player.max()].reset_index()

Q2. Top 2 players with the maximum average who have scored at least 2 half centuries?

In [None]:
# Restrict to players with two or more fifties, then take the two best averages.
ipl.loc[ipl["50"].ge(2)].sort_values("Avg", ascending=False).iloc[:2]

Q3. Create 2 new columns based on Player name. First column will have first name and second column will have last name. Eg: for the player Shikhar Dhawan, Shikhar will be the first name and Dhawan will be the last name.

In [None]:
# Split on the first space only: everything after it becomes the last name.
name_parts = ipl["Player"].str.split(" ", n=1, expand=True)
ipl[["First Name", "Last Name"]] = name_parts
ipl.head()

Q4. Create a new column (Cleaned_Highest_score) based on the Highest Score variable. Remove the asterisk (*) mark and convert the data type to INT.

In [None]:
# "*" marks a not-out innings. It is also a regex metacharacter, so the
# replacement is forced to be literal (regex=False) instead of relying on the
# pandas-version-dependent default. astype(str) keeps the .str accessor usable
# even if the column was parsed as numeric (i.e. no asterisks were present).
ipl["Cleaned_Highest_score"] = (
    ipl["Highest Score"]
    .astype(str)
    .str.replace("*", "", regex=False)
    .astype(int)
)

# Show only the source and cleaned columns rather than dumping the whole frame.
ipl[["Player", "Highest Score", "Cleaned_Highest_score"]].head()

Q5. Print the total number of centuries scored in the entire season.

In [None]:
# Season-wide century count: add up the per-player "100" column.
ipl.loc[:, "100"].sum()

Q6. Print all the player names whose strike rate is less than the average strike rate of all players in entire season. Print the player name, his strike rate and average strike rate.

In [None]:
# Overall strike rate: runs per 100 balls, pooled across all rows of `ipl`.
season_runs = ipl["Runs"].sum()
season_balls = ipl["Balls faced"].sum()
avg_run_rate = (season_runs / season_balls) * 100

In [None]:
# Per-player totals, computed with a single groupby pass.
player_totals = ipl.groupby("Player")[["Runs", "Balls faced"]].sum()

# Runs per 100 balls for each player; the resulting Series is unnamed, which
# is why the value column comes out of reset_index() labelled 0.
df_run_rate = (player_totals["Runs"] / player_totals["Balls faced"]) * 100

# Keep the players scoring slower than the pooled rate in avg_run_rate.
below_average = df_run_rate.lt(avg_run_rate)
filtered_df = (
    df_run_rate[below_average]
    .reset_index()
    .rename(columns={'index': 'Player', 0: 'His_Run_Rate'})
)
filtered_df["average strike rate"] = avg_run_rate
filtered_df

Q7. Please check the correlation between the features and create a heat map.

In [None]:
# Drop the identifier-like columns (if present), then keep int64/float64 only.
numeric_dtypes = ['int64', 'float64']
ipl_numeric = (
    ipl.drop(columns=['Jersey No', 'Player'], errors='ignore')
    .select_dtypes(include=numeric_dtypes)
)
correlation_matrix = ipl_numeric.corr()

# Heatmap styling gathered in one place: annotated cells with two decimals.
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=.5,
    cbar_kws={'label': 'Correlation Coefficient'},
)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of IPL Player Statistics (2020)', fontsize=16)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

Q8. Check the list of players who have an average greater than 50 as well as a strike rate above 120. Print player name, average and strike rate.

In [None]:
# Both conditions must hold: average above 50 and strike rate above 120.
ipl.loc[
    ipl["Avg"].gt(50) & ipl["Strike rate"].gt(120),
    ["Player", "Avg", "Strike rate"],
]

Q9. Please check the list of players who have an average greater than 40 and balls faced above 100. Print player name, average and balls faced.

In [None]:
# Players averaging above 40 who also faced more than 100 balls.
# The displayed columns match the filter: "Balls faced" is shown (not
# "Strike rate") because that is what the question asks to print.
ipl.loc[
    (ipl["Avg"] > 40) & (ipl["Balls faced"] > 100),
    ["Player", "Avg", "Balls faced"],
]

Q10. Players who scored at least one century in this season. Create a visualization.

In [None]:
# Players with one or more centuries, most centuries first.
scored_century = ipl["100"].ge(1)
ipl_century = ipl.loc[scored_century].sort_values("100", ascending=False)
plot_data = ipl_century.loc[:, ['Player', '100']]

plt.figure(figsize=(10, 6))
plt.bar(x=plot_data['Player'], height=plot_data['100'], color='darkorange')
plt.title('Players Who Scored at Least One Century (IPL Season)', fontsize=14)
plt.xlabel('Player Name', fontsize=12)
plt.ylabel('Number of Centuries (100s)', fontsize=12)
plt.xticks(rotation=45, ha='right')
# Tick only at the century counts that actually occur in the data.
century_counts = plot_data['100'].unique()
plt.yticks(century_counts)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

Q11. Players who scored at least 4 half centuries in this season.

In [None]:
# Rows for players with four or more half centuries.
ipl.loc[ipl["50"].ge(4)]

Q12. Check the list of players who hit more than 45 boundaries and more than 10 sixes in this season.

In [None]:
# "Boundaries" is taken here to mean fours and sixes combined.
ipl["Total_Boundary"] = ipl["4s"].add(ipl["6s"])

In [None]:
# More than 45 total boundaries and, separately, more than 10 sixes.
ipl.loc[ipl["Total_Boundary"].gt(45) & ipl["6s"].gt(10)]

Q13. Plot a histogram of number of matches played in a season by players.

In [None]:
matches_played = ipl['Matches']

# Match counts are integers, so use one bin per integer value, centred on it
# (edges at k - 0.5). Deriving the edges from the data avoids a hard-coded bin
# count, which can misalign bars with the integer ticks or merge two
# neighbouring counts into a single bar.
fewest_matches = int(matches_played.min())
most_matches = int(matches_played.max())
bin_edges = np.arange(fewest_matches, most_matches + 2) - 0.5

# Create the histogram
plt.figure(figsize=(10, 6))
plt.hist(matches_played, bins=bin_edges, edgecolor='black', color='skyblue', rwidth=0.8)

plt.title('Distribution of Matches Played by Players in the Season', fontsize=14)
plt.xlabel('Number of Matches Played', fontsize=12)
plt.ylabel('Number of Players', fontsize=12)
plt.grid(axis='y', alpha=0.6)
# Integer x-ticks so each bar sits directly over its match count.
plt.xticks(range(0, most_matches + 1))

plt.tight_layout()

Q14. Plot the histogram of balls faced by players.

In [None]:
balls_faced = ipl['Balls faced']

# Twenty equal-width bins across the observed range of balls faced.
n_bins = 20
bar_style = dict(edgecolor='black', color='lightcoral', rwidth=0.8)

plt.figure(figsize=(10, 6))
plt.hist(balls_faced, bins=n_bins, **bar_style)

# Labels and a light horizontal grid so the chart reads on its own.
plt.title('Distribution of Balls Faced by Players in the Season', fontsize=14)
plt.xlabel('Number of Balls Faced', fontsize=12)
plt.ylabel('Number of Players', fontsize=12)
plt.grid(axis='y', alpha=0.6, linestyle='--')

plt.tight_layout()


Q15. Top 10 players with most runs in a season.

In [None]:
# Rank by runs, highest first, and keep the first ten rows.
ipl.sort_values("Runs", ascending=False).iloc[:10]

Q16. Print the players who played a match but did not get to bat.

In [None]:
# Played at least one match but has zero innings recorded.
appeared = ipl['Matches'].gt(0)
never_batted = ipl['Inns'].eq(0)
players_no_batting = ipl.loc[appeared & never_batted]
players_no_batting

Q17. Create a new column to show the percentage of total runs scored in 4s and 6s. Then print the top 5 players with maximum percentage.

In [None]:
# Runs from boundaries: four per four, six per six.
boundary_runs = ipl["4s"].mul(4).add(ipl["6s"].mul(6))
ipl["total runs scored in 4s and 6s"] = boundary_runs

# Share of each player's runs that came from boundaries, as a percentage.
ipl["%age"] = ipl["total runs scored in 4s and 6s"].div(ipl["Runs"]).mul(100)

# Names of the five players with the largest share.
ipl.sort_values("%age", ascending=False)["Player"].head(5)

Q18. Print the players with top 5 Not out percentages (Not Out percentage can be calculated as number of Not outs divided by Innings).

In [None]:
# Not-out percentage: not-outs divided by innings, scaled to 100.
ipl["Not_out_%age"] = ipl["Not Out"].div(ipl["Inns"]).mul(100)

# Names of the five players with the highest not-out percentage.
ipl.sort_values("Not_out_%age", ascending=False)["Player"].head(5)

Q19. Create visualization of top 10 players with highest number of sixes.

In [None]:
# Ten players with the most sixes, highest first.
top_sixes = ipl.sort_values('6s', ascending=False).iloc[:10]

# Only the name and the six count are needed for the chart.
plot_data = top_sixes.loc[:, ['Player', '6s']]

# Horizontal bars: six count on the x-axis, one row per player.
plt.figure(figsize=(10, 6))
sns.barplot(data=plot_data, x='6s', y='Player')

plt.title('Top 10 Players with the Highest Number of Sixes', fontsize=16)
plt.xlabel('Number of Sixes (6s)', fontsize=12)
plt.ylabel('Player Name', fontsize=12)

plt.tight_layout()

Q20. Scatter plot of runs scored by a player v/s balls faced in a season. Then find the relationship between these 2 variables.

In [None]:

# One point per row of `ipl`: balls faced on x, runs scored on y.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=ipl, x='Balls faced', y='Runs', alpha=0.7)

plt.title('Runs Scored vs. Balls Faced by Players in a Season', fontsize=14)
plt.xlabel('Balls Faced', fontsize=12)
plt.ylabel('Runs Scored', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)

In [None]:
# Quantify the relationship with the Pearson correlation coefficient
# (the default method of Series.corr, spelled out here for clarity).
correlation = ipl['Runs'].corr(other=ipl['Balls faced'], method='pearson')
print(correlation)