# Clash of Clans - Exploratory Data Analysis (Chosen for larger Dataset size)

Let's start by importing the libraries and see where the data will take us! First, explore the data.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the clans CSV; index_col=0 makes the file's first column the row index.
csv_path = '../input/coc_clans_dataset.csv'
df = pd.read_csv(csv_path, index_col=0)

In [3]:
# Inflate the frame toward a target in-memory footprint by stacking whole copies of it.
intended_df_size_in_MB = 256
# memory_usage() is called without deep=True, so this is pandas' shallow byte count.
current_size_in_bytes = df.memory_usage(index=True).sum()
# Number of whole copies that fit in the target; 0 when the frame already exceeds it.
factor = intended_df_size_in_MB * (2**20) // current_size_in_bytes
if factor > 0:
    # NOTE: ignore_index=True swaps the original index for a fresh 0..n-1 range.
    df = pd.concat([df] * factor, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3559743 entries, #UQVQRJQ0 to #2YRU29GGL
Data columns (total 26 columns):
 #   Column                          Dtype 
---  ------                          ----- 
 0   clan_name                       object
 1   clan_type                       object
 2   clan_description                object
 3   clan_location                   object
 4   isFamilyFriendly                bool  
 5   clan_badge_url                  object
 6   clan_level                      int64 
 7   clan_points                     int64 
 8   clan_builder_base_points        int64 
 9   clan_versus_points              int64 
 10  required_trophies               int64 
 11  war_frequency                   object
 12  war_win_streak                  int64 
 13  war_wins                        int64 
 14  war_ties                        int64 
 15  war_losses                      int64 
 16  clan_war_league                 object
 17  num_members                     int64 
 

### I try to recognize which columns I will use and which columns are unnecessary for me.

In [4]:
# Preview the first five rows to get a feel for the columns.
df.head(n=5)

Unnamed: 0_level_0,clan_name,clan_type,clan_description,clan_location,isFamilyFriendly,clan_badge_url,clan_level,clan_points,clan_builder_base_points,clan_versus_points,...,clan_war_league,num_members,required_builder_base_trophies,required_versus_trophies,required_townhall_level,clan_capital_hall_level,clan_capital_points,capital_league,mean_member_level,mean_member_trophies
clan_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#UQVQRJQ0,KOJIS' CLAN,closed,STRATEGY IS MUST...RESPECT THE OTHERS...FORGIV...,International,True,https://api-assets.clashofclans.com/badges/200...,6,6887,5213,5213,...,Unranked,11,1000,1000,1,1,0,Unranked,83,1254
#2QC9Y0CQU,uye,open,,Indonesia,True,https://api-assets.clashofclans.com/badges/200...,1,876,1019,1019,...,Unranked,1,0,0,1,0,0,Unranked,90,1752
#202CJRP2U,Uprising rivals,open,💥official uprising rivals clan💥we bring 10% lu...,Indonesia,True,https://api-assets.clashofclans.com/badges/200...,1,732,670,670,...,Unranked,2,0,0,1,0,0,Unranked,30,733
#2Y89RRGLY,2inchersonly,open,"Whom is not packing the meat, Shall be packing...",International,True,https://api-assets.clashofclans.com/badges/200...,2,1156,457,457,...,Bronze League I,2,0,0,1,1,0,Unranked,61,1156
#99PU9QPY,aymil,open,regla1=no empezar la guerra sin mi permiso reg...,International,True,https://api-assets.clashofclans.com/badges/200...,2,674,399,399,...,Unranked,2,0,0,1,0,0,Unranked,23,674


In [5]:
# Print the frame's index range, column names, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3559743 entries, #UQVQRJQ0 to #2YRU29GGL
Data columns (total 26 columns):
 #   Column                          Dtype 
---  ------                          ----- 
 0   clan_name                       object
 1   clan_type                       object
 2   clan_description                object
 3   clan_location                   object
 4   isFamilyFriendly                bool  
 5   clan_badge_url                  object
 6   clan_level                      int64 
 7   clan_points                     int64 
 8   clan_builder_base_points        int64 
 9   clan_versus_points              int64 
 10  required_trophies               int64 
 11  war_frequency                   object
 12  war_win_streak                  int64 
 13  war_wins                        int64 
 14  war_ties                        int64 
 15  war_losses                      int64 
 16  clan_war_league                 object
 17  num_members                     int64 
 

#### I like to use visualizations for pretty much everything :) Let's see how much null data we have.


## Let's explore one of those columns with empty cells, "clan_location".
## I want to see how many empty cells we have, after that explore its distribution.


In [6]:
# Tally empty vs. populated 'clan_location' cells from a single null mask.
is_missing = df['clan_location'].isna()
missing_count = is_missing.sum()
non_missing_count = (~is_missing).sum()

# Paired label/size lists (presumably consumed by a chart in a later cell).
labels = ['Missing Data', 'Non-Missing Data']
sizes = [missing_count, non_missing_count]

In [7]:
# Show the distinct locations alongside how often each one occurs.
locations = df['clan_location']
locations.unique(), locations.value_counts()

(array(['International', 'Indonesia', 'Pakistan', nan, 'Türkiye',
        'Bahamas', 'Chile', 'Brazil', 'Philippines', 'Italy', 'Canada',
        'Mexico', 'Malaysia', 'India', 'Afghanistan', 'United States',
        'Iran', 'Germany', 'Uruguay', 'American Samoa', 'Thailand',
        'Vietnam', 'Greenland', 'Latvia', 'South Africa', 'Albania',
        'Japan', 'Hungary', 'Bangladesh', 'Norway', 'France',
        'Myanmar (Burma)', 'Kuwait', 'Australia', 'Bosnia and Herzegovina',
        'Ecuador', 'Kazakhstan', 'North Korea', 'Saudi Arabia',
        'New Zealand', 'Jamaica', 'China', 'Hong Kong', 'Panama',
        'United Kingdom', 'Barbados', 'Netherlands', 'Belgium', 'Russia',
        'Fiji', 'Uzbekistan', 'Swaziland', 'Dominican Republic', 'Iraq',
        'Portugal', 'Egypt', 'Israel', 'Western Sahara', 'Ireland',
        'Côte d’Ivoire', 'Heard & McDonald Islands', 'Åland Islands',
        'Bolivia', 'Samoa', 'Honduras', 'French Polynesia', 'Azerbaijan',
        'Andorra', 'Algeria

In [8]:
# Number of rows whose location is exactly 'Türkiye'.
count = df['clan_location'].eq('Türkiye').sum()
count

68730

## This was a simple exploration of a single column. Let's dive deeper into the rest of the data.

### Is Family Friendly? Let's find out.

### When we look into the clan_level column, we see that most clans are just level 1 and the frequency decreases sharply. So even with the bins, visualization is not very pleasant, but I'll keep it either way :)

In [9]:
# Frequency of each clan level, most common first.
clan_levels = df['clan_level']
clan_levels.value_counts()

1     2452736
2      371010
3      191613
4      122426
5       83669
6       62030
7       47044
8       36859
9       32036
10      24932
11      19981
12      15919
14      14728
13      12963
15      11791
16       9317
17       8247
19       7421
18       6851
20       6067
21       5276
22       4427
23       3528
24       3067
25       2295
26       1463
27        906
28        524
29        298
30        160
31         75
32         39
33         29
34         13
35          2
36          1
Name: clan_level, dtype: int64

In [10]:
# Edges of the level bands. pd.cut's default right-closed intervals turn these into
# (0, 6], (6, 12], ... i.e. levels 1-6, 7-12, 13-18, and so on.
bins = [0, 6, 12, 18, 24, 30, 36]

# Derive the band labels ('1-6', '7-12', ...) from the edges so the two cannot drift apart.
level_group_labels = [f'{lower + 1}-{upper}' for lower, upper in zip(bins[:-1], bins[1:])]

# Attach each clan's band as a new categorical column; levels outside (0, 36] become NaN.
df['clan_level_group'] = pd.cut(df['clan_level'], bins=bins, labels=level_group_labels)

#### Let's show something obvious:

### This lmplot took some time to process, most probably because of the size of the data. 

## Distribution of Clan Leagues

In [11]:
# How many rows fall into each capital league, most common first.
capital_leagues = df['capital_league']
capital_leagues.value_counts()

Unranked               3386262
Bronze League III        23486
Bronze League II         20708
Bronze League I          16887
Silver League III        14662
Silver League II         13419
Silver League I          12484
Gold League III          11442
Gold League II           10251
Gold League I             9006
Crystal League III        7919
Crystal League II         6753
Crystal League I          5617
Master League III         4594
Master League II          3675
Champion League III       3480
Master League I           2932
Champion League II        2342
Champion League I         1582
Titan League III          1018
Titan League II            605
Titan League I             353
Legend League              266
Name: capital_league, dtype: int64

### So many clans are unranked; let's exclude them for the sake of the graph.

In [12]:
# Keep only the rows whose capital league is something other than 'Unranked'.
is_ranked = df['capital_league'].ne('Unranked')
df_ranked = df.loc[is_ranked]

## Correlation Map

In [13]:
# Print the schema again; the frame now has 27 columns because 'clan_level_group' was added.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3559743 entries, #UQVQRJQ0 to #2YRU29GGL
Data columns (total 27 columns):
 #   Column                          Dtype   
---  ------                          -----   
 0   clan_name                       object  
 1   clan_type                       object  
 2   clan_description                object  
 3   clan_location                   object  
 4   isFamilyFriendly                bool    
 5   clan_badge_url                  object  
 6   clan_level                      int64   
 7   clan_points                     int64   
 8   clan_builder_base_points        int64   
 9   clan_versus_points              int64   
 10  required_trophies               int64   
 11  war_frequency                   object  
 12  war_win_streak                  int64   
 13  war_wins                        int64   
 14  war_ties                        int64   
 15  war_losses                      int64   
 16  clan_war_league                 object  
 17  nu

In [14]:
# Columns fed into the correlation map, one per line for easy editing.
selected_columns = [
    'clan_level',
    'clan_points',
    'clan_builder_base_points',
    'clan_versus_points',
    'required_trophies',
    'war_win_streak',
    'war_wins',
    'war_ties',
    'war_losses',
    'num_members',
    'required_builder_base_trophies',
    'required_versus_trophies',
    'required_townhall_level',
    'clan_capital_hall_level',
    'clan_capital_points',
    'mean_member_level',
    'mean_member_trophies',
]

# Pairwise correlation (DataFrame.corr defaults) across the selected columns.
numeric_subset = df.loc[:, selected_columns]
correlation_matrix = numeric_subset.corr()