# **1. Data Collection**

**1.1 Importing File Formats**

In [2]:
import pandas as pd

# Importing CSV file
df = pd.read_csv('/content/ShakeupAnalysis.csv')

**1.21 Checking Data Types**

In [3]:
data_types = df.dtypes
print("Data Types:\n", data_types)

Data Types:
 Private Rank             int64
TeamId                   int64
TeamName                object
LastSubmissionDate      object
Private Score          float64
SubmissionCount          int64
TeamMemberUserNames     object
Public Score           float64
Public Rank              int64
Unnamed: 9             float64
dtype: object


**1.22 Checking Duplicates**

In [4]:
duplicates = df.duplicated().sum()
print("Number of Duplicates:\n", duplicates)

Number of Duplicates:
 0


# **2. Data Exploration**

**2.1 Understanding the Structure**

**2.11 View the First Few Rows**

In [5]:
df.head()

Unnamed: 0,Private Rank,TeamId,TeamName,LastSubmissionDate,Private Score,SubmissionCount,TeamMemberUserNames,Public Score,Public Rank,Unnamed: 9
0,1,10820725,Tom Wright-Anderson,05-09-2023 09:02,1.00294,51,thomaswrightanderson,1.39531,496,
1,2,10891496,rdebona,10-09-2023 21:40,1.00964,2,rdebona,1.41269,779,
2,3,10884347,Sando Breezez,10-09-2023 19:34,1.01086,1,sandobreezez,1.40443,546,
3,4,10883372,Maxime Perez,11-09-2023 09:42,1.01111,5,maximeperez,1.40479,552,
4,5,10819987,Faysal Miah,31-08-2023 16:41,1.01132,1,faysalmiah1721758,1.38375,453,


**2.12 View DataFrame Information**

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955 entries, 0 to 954
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Private Rank         955 non-null    int64  
 1   TeamId               955 non-null    int64  
 2   TeamName             955 non-null    object 
 3   LastSubmissionDate   955 non-null    object 
 4   Private Score        955 non-null    float64
 5   SubmissionCount      955 non-null    int64  
 6   TeamMemberUserNames  955 non-null    object 
 7   Public Score         955 non-null    float64
 8   Public Rank          955 non-null    int64  
 9   Unnamed: 9           0 non-null      float64
dtypes: float64(3), int64(4), object(3)
memory usage: 74.7+ KB


**2.13 Summary Statistics**

In [7]:
df.describe()

Unnamed: 0,Private Rank,TeamId,Private Score,SubmissionCount,Public Score,Public Rank,Unnamed: 9
count,955.0,955.0,955.0,955.0,955.0,955.0,0.0
mean,478.0,10839770.0,1.22232,10.831414,1.52941,478.0,
std,275.829053,20864.63,1.285462,13.741849,1.255847,275.829053,
min,1.0,10818990.0,1.00294,1.0,1.23336,1.0,
25%,239.5,10822320.0,1.02648,2.0,1.3282,239.5,
50%,478.0,10832670.0,1.0489,5.0,1.39151,478.0,
75%,716.5,10851820.0,1.077965,14.0,1.40776,716.5,
max,955.0,10899200.0,30.20613,105.0,30.20645,955.0,


**2.14 View Column Names**

In [8]:
df.columns

Index(['Private Rank', 'TeamId', 'TeamName', 'LastSubmissionDate',
       'Private Score', 'SubmissionCount', 'TeamMemberUserNames',
       'Public Score', 'Public Rank', 'Unnamed: 9'],
      dtype='object')

**2.15 Shape of the Data**

In [9]:
df.shape

(955, 10)

**2.16 View a Desired Sample of 5 Rows**

In [10]:
df.sample(5)

Unnamed: 0,Private Rank,TeamId,TeamName,LastSubmissionDate,Private Score,SubmissionCount,TeamMemberUserNames,Public Score,Public Rank,Unnamed: 9
456,457,10846025,gondayo,30-08-2023 10:52,1.04714,3,kanayamisato,1.31559,189,
263,264,10832265,Rlxxnnn,08-09-2023 09:22,1.02648,2,rlxxnnn,1.352,312,
507,508,10826578,Seikou Ryo,23-08-2023 15:26,1.05008,1,seikouryo,1.37859,417,
70,71,10822122,haruakif,22-08-2023 16:04,1.02393,3,haruakif,1.40402,542,
469,470,10864896,muhammad adeel badar,09-09-2023 05:33,1.04846,5,madeelbadar,1.41814,793,


**2.2 Missing Values**

**2.21 Identifying Missing Values**

In [11]:
print("Missing Values:\n", df.isnull().sum())

Missing Values:
 Private Rank             0
TeamId                   0
TeamName                 0
LastSubmissionDate       0
Private Score            0
SubmissionCount          0
TeamMemberUserNames      0
Public Score             0
Public Rank              0
Unnamed: 9             955
dtype: int64


**2.22 Dropping Rows or Columns with Missing Values**

In [12]:
# Drop rows with any missing values
df_cleaned_rows = df.dropna()

# Drop columns with any missing values
df_cleaned_columns = df.dropna(axis=1)

# **3. Data Cleaning**

**3.1 Identifying Outliers using Z-Score**

In [13]:
from scipy import stats

# Calculate Z-scores of each value in the DataFrame
z_scores = stats.zscore(df.select_dtypes(include=['float64', 'int64']))

# Identify outliers (Z-score > 3 or < -3)
df_outliers = df[(z_scores > 3).any(axis=1) | (z_scores < -3).any(axis=1)]
print("Outliers:\n", df_outliers)

Outliers:
      Private Rank    TeamId                    TeamName LastSubmissionDate  \
11             12  10823072              Hıdır BozkurtT   11-09-2023 19:28   
31             32  10819189            Samvel Kocharyan   11-09-2023 04:44   
308           309  10819817  Luck Processing Unit (TPU)   07-09-2023 06:50   
314           315  10823617                  Jim Gruman   10-09-2023 20:33   
363           364  10820096                 Oppenheimer   11-09-2023 06:25   
373           374  10819137                       Lafoi   11-09-2023 04:23   
387           388  10821168                   Dmitri K.   09-09-2023 22:42   
398           399  10820005         KLYUSHNIK-Alexsandr   11-09-2023 11:51   
463           464  10825670           PRASHANT SHUKLA91   11-09-2023 06:58   
506           507  10819608               Itachi Uchiha   11-09-2023 09:57   
621           622  10820110              Dr. Alvinleenh   10-09-2023 08:47   
631           632  10819160                       Boy

**3.2 Remove Outliers**

In [14]:
# Remove rows with outliers based on Z-score
df_no_outliers = df[(z_scores < 3).all(axis=1) & (z_scores > -3).all(axis=1)]
print("DataFrame without Outliers:\n", df_no_outliers)

DataFrame without Outliers:
 Empty DataFrame
Columns: [Private Rank, TeamId, TeamName, LastSubmissionDate, Private Score, SubmissionCount, TeamMemberUserNames, Public Score, Public Rank, Unnamed: 9]
Index: []


# **In Depth Detailed Analysis**

**How do 'Private Score' and 'Public Score' relate to each other?**

In [15]:
import plotly.express as px

fig = px.scatter(df, x='Private Score', y='Public Score', title='Private Score vs Public Score')
fig.show()

**How many submissions did each team make?**

In [16]:
fig = px.bar(df, x='TeamName', y='SubmissionCount', title='Submission Count by Team Name')
fig.update_layout(xaxis_tickangle=-90, height=800, width=2000)
fig.show()

**What is the relationship between 'SubmissionCount' and 'Public Score' for each team?**

In [17]:
fig = px.scatter(df, x='SubmissionCount', y='Public Score', color='TeamName',
                 title='Submission Count vs Public Score by Team Name',
                 labels={'SubmissionCount': 'Submission Count', 'Public Score': 'Public Score'})
fig.show()

**How do 'Private Score' and 'Public Score' compare within the dataset?**

In [18]:
fig = px.scatter(df, x='Private Score', y='Public Score', color='TeamName', trendline='ols',
                 title='Private Score vs Public Score with Trend Line',
                 labels={'Private Score': 'Private Score', 'Public Score': 'Public Score'})
fig.show()

**What are the top 10 teams by 'SubmissionCount'?**

In [19]:
top_teams = df.nlargest(10, 'SubmissionCount')
fig = px.bar(top_teams, x='SubmissionCount', y='TeamName', color='SubmissionCount',
             title='Top 10 Teams by Submission Count',
             labels={'SubmissionCount': 'Submission Count', 'TeamName': 'Team Name'},
             orientation='h')
fig.show()

**What is the average 'Private Score' for each 'TeamName'?**

In [20]:
import plotly.express as px

average_private_score = df.groupby('TeamName')['Private Score'].mean().reset_index()
fig = px.bar(average_private_score, x='TeamName', y='Private Score', color='Private Score',
             title='Average Private Score by Team Name',
             labels={'TeamName': 'Team Name', 'Private Score': 'Average Private Score'})
fig.update_layout(xaxis_tickangle=-90, width=8000, height=800)
fig.show()

**What is the distribution of 'SubmissionCount' across different 'TeamName'?**

In [21]:
total_submission_count = df.groupby('TeamName')['SubmissionCount'].sum().reset_index()
fig = px.bar(total_submission_count, x='TeamName', y='SubmissionCount', color='SubmissionCount',
             title='Total Submission Count by Team Name',
             labels={'TeamName': 'Team Name', 'SubmissionCount': 'Total Submission Count'})
fig.update_layout(xaxis_tickangle=-90, width=8000, height=800)
fig.show()

**How do the 'Public Score' and 'Private Score' averages compare for the top 5 teams with the highest 'SubmissionCount'?**

In [22]:
top_5_teams = df.nlargest(5, 'SubmissionCount')
average_scores = top_5_teams.groupby('TeamName').agg({'Private Score': 'mean', 'Public Score': 'mean'}).reset_index()
fig = px.bar(average_scores, x='TeamName', y=['Private Score', 'Public Score'],
             title='Average Private and Public Scores for Top 5 Teams by Submission Count',
             labels={'value': 'Average Score', 'variable': 'Score Type'})
fig.update_layout(xaxis_tickangle=-45)
fig.show()

**What is the total number of submissions by 'TeamName' grouped by 'Private Score' range?**

In [23]:
import warnings
warnings.filterwarnings('ignore')
# Create a new column for Private Score ranges
df['Private Score Range'] = pd.cut(df['Private Score'], bins=[0, 50, 100, 150, 200, 250, 300],
                                   labels=['0-50', '51-100', '101-150', '151-200', '201-250', '251-300'])

submission_count_by_score_range = df.groupby('Private Score Range')['SubmissionCount'].sum().reset_index()
fig = px.bar(submission_count_by_score_range, x='Private Score Range', y='SubmissionCount', color='SubmissionCount',
             title='Total Submission Count by Private Score Range',
             labels={'Private Score Range': 'Private Score Range', 'SubmissionCount': 'Total Submission Count'})
fig.show()

**What is the distribution of 'Private Score'?**

In [24]:
import plotly.express as px

fig = px.histogram(df, x='Private Score', nbins=30, color_discrete_sequence=['#1f77b4'],
                   title='Distribution of Private Score',
                   labels={'Private Score': 'Private Score'})
fig.update_layout(xaxis_title='Private Score', yaxis_title='Frequency')
fig.show()

**How is the 'Public Score' distributed?**

In [25]:
fig = px.histogram(df, x='Public Score', nbins=30, color_discrete_sequence=['#ff7f0e'],
                   title='Distribution of Public Score',
                   labels={'Public Score': 'Public Score'})
fig.update_layout(xaxis_title='Public Score', yaxis_title='Frequency')
fig.show()

**What are the distributions of 'SubmissionCount' across different ranges?**

In [26]:
fig = px.histogram(df, x='SubmissionCount', nbins=20, color_discrete_sequence=['#2ca02c'],
                   title='Distribution of Submission Count',
                   labels={'SubmissionCount': 'Submission Count'})
fig.update_layout(xaxis_title='Submission Count', yaxis_title='Frequency')
fig.show()

**How does the distribution of 'Private Score' vary by 'TeamName'?**

In [27]:
fig = px.histogram(df, x='Private Score', color='TeamName', nbins=30,
                   title='Distribution of Private Score by Team Name',
                   labels={'Private Score': 'Private Score', 'TeamName': 'Team Name'})
fig.update_layout(xaxis_title='Private Score', yaxis_title='Frequency')
fig.show()

**What is the distribution of 'Public Score' for the top 10 teams by 'SubmissionCount'?**

In [28]:
top_10_teams = df.nlargest(10, 'SubmissionCount')
fig = px.histogram(top_10_teams, x='Public Score', nbins=20, color='TeamName',
                   title='Distribution of Public Score for Top 10 Teams by Submission Count',
                   labels={'Public Score': 'Public Score', 'TeamName': 'Team Name'})
fig.update_layout(xaxis_title='Public Score', yaxis_title='Frequency')
fig.show()

**What is the distribution of 'Private Score' for teams with high vs. low 'SubmissionCount'?**

In [29]:
df['SubmissionCategory'] = df['SubmissionCount'].apply(lambda x: 'High' if x > df['SubmissionCount'].median() else 'Low')

fig = px.histogram(df, x='Private Score', color='SubmissionCategory', nbins=30,
                   title='Distribution of Private Score by Submission Category',
                   labels={'Private Score': 'Private Score', 'SubmissionCategory': 'Submission Category'})
fig.update_layout(xaxis_title='Private Score', yaxis_title='Frequency')
fig.show()

**What is the distribution of 'SubmissionCount' categories?**

In [30]:
# Create a new column for Submission Count categories
df['Submission Category'] = pd.cut(df['SubmissionCount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                   labels=['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100'])

submission_category_counts = df['Submission Category'].value_counts().reset_index()
submission_category_counts.columns = ['Submission Category', 'Count']

fig = px.pie(submission_category_counts, names='Submission Category', values='Count',
             title='Distribution of Submission Count Categories',
             labels={'Submission Category': 'Submission Count Category', 'Count': 'Number of Teams'})
fig.show()


**What is the proportion of 'TeamName' with respect to 'Private Score'?**

In [31]:
# Summarize Private Score by TeamName
team_private_scores = df.groupby('TeamName')['Private Score'].sum().reset_index()
team_private_scores = team_private_scores.sort_values(by='Private Score', ascending=False).head(10)  # Top 10 Teams

fig = px.pie(team_private_scores, names='TeamName', values='Private Score',
             title='Proportion of Private Score by Team Name',
             labels={'TeamName': 'Team Name', 'Private Score': 'Private Score'})
fig.show()


**How is the distribution of 'Public Score' across different 'TeamName'?**

In [32]:
# Summarize Public Score by TeamName
team_public_scores = df.groupby('TeamName')['Public Score'].sum().reset_index()
team_public_scores = team_public_scores.sort_values(by='Public Score', ascending=False).head(10)  # Top 10 Teams

fig = px.pie(team_public_scores, names='TeamName', values='Public Score',
             title='Distribution of Public Score by Team Name',
             labels={'TeamName': 'Team Name', 'Public Score': 'Public Score'})
fig.show()

**What is the distribution of 'TeamName' in terms of 'Private Score'?**

In [33]:
# Summarize Private Score by TeamName
team_private_scores = df.groupby('TeamName')['Private Score'].sum().reset_index()
team_private_scores = team_private_scores.sort_values(by='Private Score', ascending=False).head(10)  # Top 10 Teams

fig = px.pie(team_private_scores, names='TeamName', values='Private Score',
             title='Distribution of Private Score Among Top 10 Teams',
             labels={'TeamName': 'Team Name', 'Private Score': 'Private Score'})
fig.show()

**How is 'Public Score' distributed across different 'SubmissionCount' ranges?**

In [34]:
# Define submission count ranges
submission_ranges = [0, 20, 40, 60, 80, 100]

# Create a new column for Submission Count ranges
df['Submission Range'] = pd.cut(df['SubmissionCount'], bins=submission_ranges,
                                 labels=[f'{submission_ranges[i]}-{submission_ranges[i+1]}' for i in range(len(submission_ranges)-1)])

# Calculate average public score for each submission range
submission_range_scores = df.groupby('Submission Range')['Public Score'].mean().reset_index()

fig = px.pie(submission_range_scores, names='Submission Range', values='Public Score',
             title='Average Public Score Distribution by Submission Count Range',
             labels={'Submission Range': 'Submission Count Range', 'Public Score': 'Average Public Score'})
fig.show()

**What is the distribution of 'SubmissionCount' across different 'TeamName'?**

In [35]:
# Summarize Submission Count by TeamName
team_submission_counts = df.groupby('TeamName')['SubmissionCount'].sum().reset_index()
team_submission_counts = team_submission_counts.sort_values(by='SubmissionCount', ascending=False).head(10)  # Top 10 Teams

fig = px.pie(team_submission_counts, names='TeamName', values='SubmissionCount',
             title='Distribution of Submission Count Among Top 10 Teams',
             labels={'TeamName': 'Team Name', 'SubmissionCount': 'Submission Count'})
fig.show()

**How do 'Private Score', 'Public Score', and 'SubmissionCount' interact with each other?**

In [36]:
import plotly.express as px

fig = px.scatter_matrix(df, dimensions=['Private Score', 'Public Score', 'SubmissionCount'],
                       title='Pair Plot of Private Score, Public Score, and Submission Count',
                       labels={'Private Score': 'Private Score', 'Public Score': 'Public Score', 'SubmissionCount': 'Submission Count'})
fig.update_layout(xaxis_title='Variable', yaxis_title='Variable')
fig.show()

**How do 'Private Score', 'Public Score', and 'SubmissionCount' interact across different 'TeamId' categories?**

In [37]:
import plotly.express as px

# Assuming 'TeamId' is categorical, use it for coloring
fig = px.scatter_matrix(df, dimensions=['Private Score', 'Public Score', 'SubmissionCount'],
                       color='TeamId',
                       title='Pair Plot of Private Score, Public Score, and Submission Count by TeamId',
                       labels={'Private Score': 'Private Score', 'Public Score': 'Public Score', 'SubmissionCount': 'Submission Count'})
fig.update_layout(xaxis_title='Variable', yaxis_title='Variable')
fig.show()

**How do 'Private Score' and 'Public Score' relate to each other across different 'SubmissionCount' ranges?**

In [38]:
import plotly.express as px

# Define submission count ranges
submission_ranges = [0, 20, 40, 60, 80, 100]

# Create a new column for Submission Count ranges
df['Submission Range'] = pd.cut(df['SubmissionCount'], bins=submission_ranges,
                                 labels=[f'{submission_ranges[i]}-{submission_ranges[i+1]}' for i in range(len(submission_ranges)-1)])

fig = px.scatter_matrix(df, dimensions=['Private Score', 'Public Score'],
                       color='Submission Range',
                       title='Pair Plot of Private Score and Public Score by Submission Count Range',
                       labels={'Private Score': 'Private Score', 'Public Score': 'Public Score', 'Submission Range': 'Submission Count Range'})
fig.update_layout(xaxis_title='Variable', yaxis_title='Variable')
fig.show()
