## IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import re

import warnings
warnings.filterwarnings('ignore')

## OVERVIEW

In [2]:
df = pd.read_csv('dataset\evnts.csv')

In [3]:
df.head(3)

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
0,4f3e46a6e94452b566553485b619d2f305bd9d875d4f6d...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:04:32,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,,Jakarta,none,,75000,2
1,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:11:44,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,1
2,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:11:50,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54032 entries, 0 to 54031
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   user_id            54032 non-null  object 
 1   event_id           54032 non-null  object 
 2   organization_id    54032 non-null  object 
 3   transaction_date   54032 non-null  object 
 4   event_type         54032 non-null  object 
 5   organization_type  54032 non-null  object 
 6   event_category     54026 non-null  object 
 7   user_type          53979 non-null  object 
 8   user_location      9166 non-null   object 
 9   event_location     54026 non-null  object 
 10  gender             50205 non-null  object 
 11  age_when_register  22228 non-null  float64
 12  price              54032 non-null  int64  
 13  total_ticket_sold  54032 non-null  int64  
dtypes: float64(1), int64(2), object(11)
memory usage: 5.8+ MB


In [5]:
pd.DataFrame({'dataFeatures' : df.columns, 'dataType' : df.dtypes.values, 
              'null' : [df[i].isna().sum() for i in df.columns],
              'nullPct' : [((df[i].isna().sum()/len(df[i]))*100).round(1) for i in df.columns],
              'Nunique' : [df[i].nunique() for i in df.columns],
              'uniqueSample' : [list(pd.Series(df[i].unique()).sample()) for i in df.columns]}).reset_index(drop = True)

Unnamed: 0,dataFeatures,dataType,null,nullPct,Nunique,uniqueSample
0,user_id,object,0,0.0,32178,[241637b050ea7ad605072a056a49506c7cc96a6416165...
1,event_id,object,0,0.0,811,[160b8561b0489f54d7bee7fefea388357e5acc1db657c...
2,organization_id,object,0,0.0,274,[1a0317b905124a447f5bbc88a2d3e83eb9e988671f593...
3,transaction_date,object,0,0.0,49308,[2020-05-04 15:24:47]
4,event_type,object,0,0.0,1,[Event]
5,organization_type,object,0,0.0,7,[Event Organizer]
6,event_category,object,6,0.0,296,[Webinar - Networking - Seminar]
7,user_type,object,53,0.1,2,[Guest]
8,user_location,object,44866,83.0,34,[Probolinggo]
9,event_location,object,6,0.0,23,[Palembang]


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age_when_register,22228.0,24.080079,8.330055,0.0,18.0,22.0,29.0,120.0
price,54032.0,82539.488729,310605.878561,0.0,0.0,0.0,70000.0,8100000.0
total_ticket_sold,54032.0,1.623556,2.734496,1.0,1.0,1.0,1.0,134.0


In [7]:
df.describe(include = 'object').T

Unnamed: 0,count,unique,top,freq
user_id,54032,32178,74b15dc9dd3a61432731b8edd5588b6cb35ac12ba720d9...,804
event_id,54032,811,9441a919384943d46915a2efaaad637bb0ca08cf0de0d9...,4397
organization_id,54032,274,1af57e2c0b4cd813117b114b25c36938b219e172830653...,4413
transaction_date,54032,49308,2020-05-22 13:56:48,31
event_type,54032,1,Event,54032
organization_type,54032,7,Event Organizer and Promotor,26751
event_category,54026,296,Webinar,5619
user_type,53979,2,Guest,30854
user_location,9166,34,Jakarta,5577
event_location,54026,23,Jakarta,25347


## PREPROCESSING

> First we start by fixing the **event_category**.

In [8]:
df[df['event_category'].isna()]

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
21800,20f1f78d5eb02b09a503b26e7294399b6b7b3ca7d1565f...,83cadc8cf1544e253e3e75fdf140d38badbc1461124e4b...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2020-02-15 06:08:50,Event,Event Organizer and Promotor,,Guest,,,none,,3750000,1
21920,d706ae4c83455692d099e6255583f51bcaee466c01b602...,a51ab54c9c7394a052dc73b3adc164af628dcb1049e759...,bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd...,2020-02-17 09:41:04,Event,Event Organizer,,Registered,Jakarta,,male,26.0,0,1
23056,5a5bad93c516903217429296f41e7c72b270159da54da5...,a51ab54c9c7394a052dc73b3adc164af628dcb1049e759...,bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd...,2020-04-17 03:09:18,Event,Event Organizer,,Registered,Jakarta,,female,20.0,0,5
45189,7c6a0a99b02e47badc12b2f4ac52db93a5d8c789986912...,a51ab54c9c7394a052dc73b3adc164af628dcb1049e759...,bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd...,2020-04-17 03:09:18,Event,Event Organizer,,Guest,,,female,,0,3
45190,1b710ade9432a036d79a44bf79368326b21919b6973616...,a51ab54c9c7394a052dc73b3adc164af628dcb1049e759...,bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd...,2020-04-17 03:09:18,Event,Event Organizer,,Guest,,,none,,0,2
45191,917378748f44c1b205af7a0eaa97c8e049857931c25b2e...,a51ab54c9c7394a052dc73b3adc164af628dcb1049e759...,bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd...,2020-04-17 03:09:19,Event,Event Organizer,,Registered,Jakarta,,female,22.0,0,3


> As we can see that the missing values are only on two events and belongs to two different organizations. We will fill it by using this information.

In [9]:
df[df['event_category'].isna()].groupby('event_id')['organization_id'].value_counts()

event_id                                                          organization_id                                                 
83cadc8cf1544e253e3e75fdf140d38badbc1461124e4b75f6022d97747e8672  bf98663c9884ad409695575399cd63a9caf2217c742ae795bd82268011ef4432    1
a51ab54c9c7394a052dc73b3adc164af628dcb1049e75912ecfb702b1024cd9a  bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd6b12a02c914e914fc0    5
Name: organization_id, dtype: int64

In [10]:
df[df['event_id']=='a51ab54c9c7394a052dc73b3adc164af628dcb1049e75912ecfb702b1024cd9a']['event_category']

21920    NaN
23056    NaN
45189    NaN
45190    NaN
45191    NaN
Name: event_category, dtype: object

> Seems that we cannot rely on **event id**, let's try the **organization id**.

In [11]:
df[df['organization_id']=='bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd6b12a02c914e914fc0']['event_category'].value_counts()

Online Event - Webinar                                                              13
Webinar - Art                                                                       12
Music                                                                                6
Travel - Culture - Adventure - Outdoor - Open Trip - Tour Package - Tourism Site     6
Name: event_category, dtype: int64

> We can see that using this method, we can get a clearer view on how to fill these values. 

In [12]:
df[df['organization_id']=='bf98663c9884ad409695575399cd63a9caf2217c742ae795bd82268011ef4432']['event_category'].value_counts()

Festival - Music    34
Music - Concert      5
Name: event_category, dtype: int64

In [13]:
# Locate the organization id

event_fill = df[df['organization_id']=='bd3e4020cae3d9d7e8817cd8fdbf351b59faaf02a150fd6b12a02c914e914fc0']
event_fill2 = df[df['organization_id']=='bf98663c9884ad409695575399cd63a9caf2217c742ae795bd82268011ef4432']

# Filling missing event category

event_fill['event_category'] = event_fill['event_category'].fillna('Online Event - Webinar')
event_fill2['event_category'] = event_fill2['event_category'].fillna('Festival - Music')

In [14]:
event_fills = pd.concat([event_fill, event_fill2])

In [15]:
# Merging to original dataframe

df.fillna(event_fills, inplace=True)

In [16]:
df.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type               53
user_location        44866
event_location           6
gender                3827
age_when_register    31804
price                    0
total_ticket_sold        0
dtype: int64

> Now moving to **user type** and **user location**.

In [17]:
df[df['user_type'].isna()]['event_category'].value_counts()

Webinar                                                                                34
Webinar - Game                                                                         12
Webinar - Education - Networking - Seminar                                              2
Webinar - Conference - Education - Networking - Seminar                                 2
Webinar - Networking - Seminar                                                          1
Webinar - Conference - Education - Education Expo - Networking - Seminar - Workshop     1
Webinar - Education                                                                     1
Name: event_category, dtype: int64

In [18]:
df[df['event_category']=='Webinar']['user_type'].value_counts()

Guest         3751
Registered    1834
Name: user_type, dtype: int64

In [19]:
df['user_type'].fillna('Guest', inplace=True)

In [20]:
df.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        44866
event_location           6
gender                3827
age_when_register    31804
price                    0
total_ticket_sold        0
dtype: int64

> Moving to **user location**.

In [21]:
df.groupby('user_type')['user_location'].value_counts().head(20)

user_type   user_location
Guest       Yogyakarta          5
            Bekasi              1
            Jakarta             1
Registered  Jakarta          5576
            Tangerang        1044
            Bandung           651
            Depok             520
            Makassar          368
            Bekasi            302
            Yogyakarta        296
            Surabaya          180
            Bali               52
            Medan              25
            Cirebon            17
            Malang             17
            Lampung            14
            Karawang           12
            Palembang          12
            Semarang           12
            Balikpapan          7
Name: user_location, dtype: int64

In [22]:
df[df['user_location'].isna()]['user_type'].value_counts()

Guest         30900
Registered    13966
Name: user_type, dtype: int64

> As we can see, almost all of the **Guest** users are mostly using website to book the ticket, therefore it might cause the system to not record their location, resulted in these missing values. As for the **Registered** users, I think they're might not filled all of their profile page for various reasons. Logically by this theory, I would rather fill these missing values as **'Unknown Location'**. But if we fill all these missing values as **'Unknown Location'**, it would probably distort the data as we can see the missing values in this features are 83% of total data. To mitigate this, I'll try to manually find the location by matching it with their location. It might not help by much, but I think it's better than leave it just as it is.

In [23]:
# Guest user

loc_guest_isna = df[(df['user_location'].isna()) & (df['user_type']=='Guest')]
loc_guest_notna = df[(df['user_location'].notna()) & (df['user_type']=='Guest')]

In [24]:
loc_guest_isna.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        30900
event_location           3
gender                3724
age_when_register    30887
price                    0
total_ticket_sold        0
dtype: int64

In [25]:
loc_guest_isna['event_category'].value_counts()

Webinar - Concert - Education                      3937
Webinar                                            3785
Workshop                                           2187
Webinar - Tech Startup                             1882
Talkshow                                           1241
                                                   ... 
Webinar - Education - Seminar                         1
Conference - Festival - Performance                   1
Sport - Concert - Art - Competition                   1
Seminar - Workshop - Tech Startup - Education         1
Music - Concert - Art - Performance - Orchestra       1
Name: event_category, Length: 242, dtype: int64

> Notice that in this category, we find that there are many events that categorized as **Webinar**, I think we cannot find the specific location for user who were attending the webinar events as it occured online, therefore I'll try to filter it out using *regular expression* to help pinpoint this problem.

In [26]:
# Filter the non webinar event 

guest_not_web_isna = loc_guest_isna[(~loc_guest_isna['event_category'].str.contains('Webinar')==True) & (loc_guest_isna['event_location'] != 'Indonesia')]

In [27]:
guest_not_web_isna.head()

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
0,4f3e46a6e94452b566553485b619d2f305bd9d875d4f6d...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:04:32,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,,Jakarta,none,,75000,2
3,b1fc4cf5297d6639937d7806cf8ff626e4fdef6548c892...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:24:59,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,,Jakarta,none,,75000,6
4,1da4ae70dde95b9d83c7586d1af62599cb01c4fc660f63...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:53:11,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,,Jakarta,none,,75000,2
5,7dddb95cc3da557d3c0b993d4ad148182cdccd5833387e...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 05:12:57,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,,Jakarta,none,,75000,4
6,ebee53b4d8aa0e1d82385d191b9a6b27c21125689f99f7...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-01 05:13:45,Event,Event Organizer,Wedding Expo - Exhibition,Guest,,Jakarta,none,,15000,2


> Since event location : **'Indonesia'** are considered as online event, therefore we will filter that too.

In [28]:
print(len(guest_not_web_isna))

17907


> There are 17907 events filtered, hopefully we can reduce this number by filling out with the values in the not missing categories.

In [29]:
guest_not_web_isna.groupby('event_location')['event_category'].value_counts()

event_location  event_category                                                  
Bali            Tech Startup - Group Activities - Conference                        69
                Competition                                                         11
                Workshop - Seminar - Festival - Travel                               4
                Concert - Music - Festival                                           3
                Workshop - Travel - Seminar - Conference                             2
                                                                                    ..
Yogyakarta      Music                                                               71
                Conference                                                          47
                Movies - Bazaar                                                      3
                Networking                                                           3
                Travel - Education - University S

In [30]:
guest_not_web_isna['price'].value_counts().head(5)

0         8532
50000     1330
250000     807
232000     701
175000     682
Name: price, dtype: int64

In [31]:
guest_not_web_isna[guest_not_web_isna['price']==0]

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
157,67de3e9d4e18bdcaa28e43ed01f6f3365b2a80fc13b8df...,69def6ad4b68084a98a62b8974a3a8bc06f56982199bac...,28471a21135fc8615295eeebb675e09f915e0b8731eb90...,2019-01-04 13:00:31,Event,Event Organizer,Workshop,Guest,,Makassar,none,,0,1
163,9b2a9add88a0a944f9570ad5fe51dc77f4717bb2ce4681...,4043c87f9b90f7100a9fec2b3e7e9119d76a1f410a628e...,28471a21135fc8615295eeebb675e09f915e0b8731eb90...,2019-01-04 13:51:17,Event,Event Organizer,Seminar - Workshop,Guest,,Makassar,none,,0,1
166,a37a2419d24e21ee4ef278d6b23badc366792f274f76a5...,69def6ad4b68084a98a62b8974a3a8bc06f56982199bac...,28471a21135fc8615295eeebb675e09f915e0b8731eb90...,2019-01-04 14:34:36,Event,Event Organizer,Workshop,Guest,,Makassar,female,,0,1
167,9a8759ac98052d967687cceff79e2d89bcedb92e8c76f9...,69def6ad4b68084a98a62b8974a3a8bc06f56982199bac...,28471a21135fc8615295eeebb675e09f915e0b8731eb90...,2019-01-04 14:35:48,Event,Event Organizer,Workshop,Guest,,Makassar,none,,0,1
168,a37a2419d24e21ee4ef278d6b23badc366792f274f76a5...,4043c87f9b90f7100a9fec2b3e7e9119d76a1f410a628e...,28471a21135fc8615295eeebb675e09f915e0b8731eb90...,2019-01-04 14:39:13,Event,Event Organizer,Seminar - Workshop,Guest,,Makassar,female,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51395,d8b7e30ac3b76f280a6efc8629dc74e9d337fcf4fe273e...,937bddf6c441aabe01578fb57f35e0774b344fc81d36d1...,4a3fcfd7c45cffae0a7bb4fa7a39a228b41e57318a4515...,2020-06-01 11:50:01,Event,Event Organizer and Promotor,Group Activities,Guest,,Jakarta,none,,0,4
51400,7cbd5ca37c4bb7f9989e783279b282a9e982f3e403254e...,937bddf6c441aabe01578fb57f35e0774b344fc81d36d1...,4a3fcfd7c45cffae0a7bb4fa7a39a228b41e57318a4515...,2020-06-01 12:53:30,Event,Event Organizer and Promotor,Group Activities,Guest,,Jakarta,none,,0,1
53506,84b68f07a14a13395b4e5c6c77a6025aa4bc8a538c8149...,6776582fb496f1cfb82be60916be60bb735dd028cd2cdb...,a78539c39f39c635d2a128d26662f3a2af30de53b08a9c...,2020-06-23 03:57:42,Event,Event Organizer,Tech Startup,Guest,,Yogyakarta,male,,0,1
53529,44985cfad052079d2398b64c70f06127cc74ab3d74f004...,6776582fb496f1cfb82be60916be60bb735dd028cd2cdb...,a78539c39f39c635d2a128d26662f3a2af30de53b08a9c...,2020-06-23 07:02:10,Event,Event Organizer,Tech Startup,Guest,,Yogyakarta,male,,0,1


> As we can see, there are 8532 events with **Guest** users. That are *free*. And these events are not considered as **Webinars** or **Online events**, therefore logically it's safe to assume, there are high chance that the location of these users are from the same location of the event held. 

In [32]:
# Filling the user location

loc_guest_isna['user_location'] = guest_not_web_isna['event_location']

In [33]:
loc_guest_isna.head()

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
0,4f3e46a6e94452b566553485b619d2f305bd9d875d4f6d...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:04:32,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,2
3,b1fc4cf5297d6639937d7806cf8ff626e4fdef6548c892...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:24:59,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,6
4,1da4ae70dde95b9d83c7586d1af62599cb01c4fc660f63...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:53:11,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,2
5,7dddb95cc3da557d3c0b993d4ad148182cdccd5833387e...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 05:12:57,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,4
6,ebee53b4d8aa0e1d82385d191b9a6b27c21125689f99f7...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-01 05:13:45,Event,Event Organizer,Wedding Expo - Exhibition,Guest,Jakarta,Jakarta,none,,15000,2


In [34]:
loc_guest_notna

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
15487,acda42628433d3ace7246178d403f71cf3dd892125f7db...,ea9af3b765d1ca781f3ba325e52cde149e7d7ed98c61fb...,ccc11d9b108e2c73c4954d0fc68a56ad0d74e4d175cfde...,2019-10-10 04:49:20,Event,Event Organizer and Promotor,Music - Art - Festival - Concert,Guest,Bekasi,Bekasi,none,,0,1
26895,c9d8e3ee303dddf7dbb29da1dd79eea0d21b4e6278c26e...,9441a919384943d46915a2efaaad637bb0ca08cf0de0d9...,1af57e2c0b4cd813117b114b25c36938b219e172830653...,2020-05-20 04:39:54,Event,Event Organizer and Promotor,Webinar - Concert - Education,Guest,Yogyakarta,Indonesia,none,,0,1
27236,c9d8e3ee303dddf7dbb29da1dd79eea0d21b4e6278c26e...,9441a919384943d46915a2efaaad637bb0ca08cf0de0d9...,1af57e2c0b4cd813117b114b25c36938b219e172830653...,2020-05-21 04:29:48,Event,Event Organizer and Promotor,Webinar - Concert - Education,Guest,Yogyakarta,Indonesia,none,,0,1
27673,c9d8e3ee303dddf7dbb29da1dd79eea0d21b4e6278c26e...,9441a919384943d46915a2efaaad637bb0ca08cf0de0d9...,1af57e2c0b4cd813117b114b25c36938b219e172830653...,2020-05-21 05:52:33,Event,Event Organizer and Promotor,Webinar - Concert - Education,Guest,Yogyakarta,Indonesia,none,,0,1
28118,c9d8e3ee303dddf7dbb29da1dd79eea0d21b4e6278c26e...,9441a919384943d46915a2efaaad637bb0ca08cf0de0d9...,1af57e2c0b4cd813117b114b25c36938b219e172830653...,2020-05-21 11:05:36,Event,Event Organizer and Promotor,Webinar - Concert - Education,Guest,Yogyakarta,Indonesia,none,,0,1
28631,c9d8e3ee303dddf7dbb29da1dd79eea0d21b4e6278c26e...,9441a919384943d46915a2efaaad637bb0ca08cf0de0d9...,1af57e2c0b4cd813117b114b25c36938b219e172830653...,2020-05-22 13:58:06,Event,Event Organizer and Promotor,Webinar - Concert - Education,Guest,Yogyakarta,Indonesia,none,,0,1
30287,641706ca6b6502c33eda0b99361d99d5886902878cab14...,552dcd9035c4d520da6318e41c831953b3f84c4a7a9cfc...,39d13f95b71eb21f7b80bb1fcc3795548688fd1a8e8f27...,2020-06-12 04:57:02,Event,Event Organizer and Promotor,Webinar - Education,Guest,Jakarta,Indonesia,none,,0,1


> Even in this categories, we can still see the webinar events are still dominating.

In [35]:
loc_guest_notna.groupby('event_id')['event_type'].value_counts()

event_id                                                          event_type
552dcd9035c4d520da6318e41c831953b3f84c4a7a9cfc79f127c911da52a76a  Event         1
9441a919384943d46915a2efaaad637bb0ca08cf0de0d96a19597aa72ba8d6b2  Event         5
ea9af3b765d1ca781f3ba325e52cde149e7d7ed98c61fb6c1996431588ffeda8  Event         1
Name: event_type, dtype: int64

In [36]:
loc_guest_notna.groupby('event_id')['user_location'].value_counts()

event_id                                                          user_location
552dcd9035c4d520da6318e41c831953b3f84c4a7a9cfc79f127c911da52a76a  Jakarta          1
9441a919384943d46915a2efaaad637bb0ca08cf0de0d96a19597aa72ba8d6b2  Yogyakarta       5
ea9af3b765d1ca781f3ba325e52cde149e7d7ed98c61fb6c1996431588ffeda8  Bekasi           1
Name: user_location, dtype: int64

In [37]:
print('Bekasi ' + str(len(loc_guest_isna[loc_guest_isna['event_id']=='ea9af3b765d1ca781f3ba325e52cde149e7d7ed98c61fb6c1996431588ffeda8']))) # Bekasi

Bekasi 212


In [38]:
# Locating the event location based on event id

guest_fill = loc_guest_isna[loc_guest_isna['event_id']=='ea9af3b765d1ca781f3ba325e52cde149e7d7ed98c61fb6c1996431588ffeda8']
guest_fill['user_location'] = guest_fill['user_location'].fillna('Bekasi')

In [39]:
loc_guest_isna['user_location'] = loc_guest_isna['user_location'].fillna(guest_fill['user_location'])

In [40]:
loc_guest_isna.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        12994
event_location           3
gender                3724
age_when_register    30887
price                    0
total_ticket_sold        0
dtype: int64

> Now moving to registered user. We will use the same method as before.

In [41]:
# Registered user

loc_regist_isna = df[(df['user_location'].isna()) & (df['user_type']=='Registered')]
loc_regist_notna = df[(df['user_location'].notna()) & (df['user_type']=='Registered')]

In [42]:
loc_regist_isna['event_category'].value_counts()

Music - Highschool Student - Concert - Pensi                                                                  1810
Music - Food - Fashion - Festival                                                                             1754
Webinar                                                                                                       1571
Music - Concert - Live Music - Pensi                                                                           933
Music - Concert                                                                                                812
                                                                                                              ... 
Seminar - Conference - Talkshow - Career Expo                                                                    1
Webinar - Education - Networking                                                                                 1
Sport - Concert - Art - Competition                                             

In [43]:
loc_regist_isna.head(3)

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
1,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:11:44,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,1
2,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:11:50,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,1
7,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 05:15:37,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,3


In [44]:
loc_regist_isna.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        13966
event_location           0
gender                 101
age_when_register      742
price                    0
total_ticket_sold        0
dtype: int64

In [45]:
regist_not_web_isna = loc_regist_isna[(~loc_regist_isna['event_category'].str.contains('Webinar')==True) & (loc_regist_isna['event_location']!= 'Indonesia')]

In [46]:
regist_not_web_isna.head()

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
1,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:11:44,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,1
2,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:11:50,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,1
7,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 05:15:37,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,3
8,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 05:58:08,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,3
9,23c44b20dafb34672868945ce6428ead219eb8bb5312c5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 06:22:07,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,,Jakarta,female,29.0,100000,5


In [47]:
len(regist_not_web_isna)

9768

> There are 9768 rows of user with unknown location. We will fill it by previously stated logic.

In [48]:
loc_regist_isna['user_location'] = regist_not_web_isna['event_location']

In [49]:
loc_regist_notna.head(3)

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
43,d0903036489a613b93c0e7037227ea8784c3605cf9a3d4...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 13:30:03,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,Jakarta,Jakarta,female,28.0,50000,2
45,40c77d899a2e3d450ddfa0b992cf9ec08899ae838fa278...,698c42fe0a65f35b77acbc485109e982eb4f72e313f29d...,d560043d425e34bd6b17fea670e7d087f293404fc370a4...,2019-01-01 17:19:47,Event,Event Organizer,Seminar - Workshop - Networking - Education,Registered,Medan,Jakarta,female,19.0,0,1
69,321cfe8c4510fb3fc9b9c4be89d0ca56a797308dff53c3...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-02 08:46:07,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,Jakarta,Jakarta,female,26.0,50000,4


In [50]:
regist_not_web_notna = loc_regist_notna[(~loc_regist_notna['event_category'].str.contains('Webinar')==True) & (loc_regist_notna['event_location'] != 'Indonesia')]

In [51]:
regist_not_web_notna.groupby('user_location')['event_id'].value_counts()

user_location  event_id                                                        
Bali           e6771360e4d6834825c98aaf0b31b7417ae00b89e4e13dd6f17a9f5bdfb37032    9
               09cd82981be844fbd7b3fd9eba0bf48f569531a77a2601963c4e634878a87669    6
               1f9dadc037de809cf2ecebab04fa3533aadf7476bb99e5c7e6592e4d8ea4624c    5
               14ab144da2643b8a484f07971151f0cf501ba12e69daaae5e129bc81ae3bf4c9    4
               6bacbebd8d909ba3dfb1a47bf8d272ad1a083701d625ccc2f20bc365de2857e4    4
                                                                                  ..
Yogyakarta     953e8635cc23864639d0530fa19639bd6250dcb47f363f6c0c604fbf686e0f71    1
               9f51168136fa5fbef34f00b364328d9d6b393589fde86616d489d95e1cfcc2df    1
               a2ca1b9ecb351b1dd4891c02d8e2958ac2e839cb7893fe134dd08cf5ae9a2b92    1
               d703662dac6a3f47d6789be421c14222e24e44c7034aed83865955fbab694889    1
               f1be90db3a6d7abb3b58491e482836dde80fcb1d7e8f892f467d057

In [52]:
regist_not_web_notna.groupby('event_id')['user_location'].unique().value_counts().head(10)

[Jakarta]                                                 179
[Makassar]                                                 38
[Bandung]                                                  29
[Yogyakarta]                                               11
[Tangerang]                                                10
[Bekasi]                                                    8
[Depok]                                                     5
[Surabaya]                                                  5
[Lampung]                                                   3
[Jakarta, Semarang, Bandung, Bekasi, Tangerang, Depok]      1
Name: user_location, dtype: int64

In [53]:
# Identify location

loc_jkt = regist_not_web_notna[regist_not_web_notna['user_location']=='Jakarta']['event_id'].tolist()
loc_mks = regist_not_web_notna[regist_not_web_notna['user_location']=='Makassar']['event_id'].tolist()
loc_bdg = regist_not_web_notna[regist_not_web_notna['user_location']=='Bandung']['event_id'].tolist()

In [54]:
# Jakarta

fill_jkt = loc_regist_isna[loc_regist_isna['event_id'].isin(loc_jkt)]
fill_jkt['user_location'] = fill_jkt['user_location'].fillna('Jakarta')

In [55]:
# Makassar

fill_mks = loc_regist_isna[loc_regist_isna['event_id'].isin(loc_mks)]
fill_mks['user_location'] = fill_mks['user_location'].fillna('Makassar')

In [56]:
# Bandung

fill_bdg = loc_regist_isna[loc_regist_isna['event_id'].isin(loc_bdg)]
fill_bdg['user_location'] = fill_bdg['user_location'].fillna('Bandung')

In [57]:
loc_regist_isna.fillna(fill_jkt, inplace=True)
loc_regist_isna.fillna(fill_mks, inplace=True)
loc_regist_isna.fillna(fill_bdg, inplace=True)

In [58]:
loc_regist_isna.isna().sum()

user_id                 0
event_id                0
organization_id         0
transaction_date        0
event_type              0
organization_type       0
event_category          0
user_type               0
user_location        4198
event_location          0
gender                101
age_when_register     742
price                   0
total_ticket_sold       0
dtype: int64

In [59]:
loc_guest_filled = pd.concat([loc_guest_isna, loc_guest_notna])
loc_regist_filled = pd.concat([loc_regist_isna, loc_regist_notna])
loc_filled = pd.concat([loc_guest_filled, loc_regist_filled])

In [60]:
# Merging to original dataframe

df.fillna(loc_filled, inplace=True)

In [61]:
df.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        17192
event_location           6
gender                3827
age_when_register    31804
price                    0
total_ticket_sold        0
dtype: int64

> We move to **event location** and **gender** feature.

In [62]:
df[df['event_location'].isna()].groupby('user_type')['event_category'].value_counts()

user_type   event_category        
Guest       Online Event - Webinar    2
            Festival - Music          1
Registered  Online Event - Webinar    3
Name: event_category, dtype: int64

> Seems that in this feature the online events and the guest type are causing the data to be missing. The one with **'Webinar'** category, we can fill it with **Indonesia**.

In [63]:
df[df['event_location'].isna()].groupby('user_type')['event_id'].value_counts()

user_type   event_id                                                        
Guest       a51ab54c9c7394a052dc73b3adc164af628dcb1049e75912ecfb702b1024cd9a    2
            83cadc8cf1544e253e3e75fdf140d38badbc1461124e4b75f6022d97747e8672    1
Registered  a51ab54c9c7394a052dc73b3adc164af628dcb1049e75912ecfb702b1024cd9a    3
Name: event_id, dtype: int64

In [64]:
df[df['event_id'] == '83cadc8cf1544e253e3e75fdf140d38badbc1461124e4b75f6022d97747e8672']

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
21800,20f1f78d5eb02b09a503b26e7294399b6b7b3ca7d1565f...,83cadc8cf1544e253e3e75fdf140d38badbc1461124e4b...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2020-02-15 06:08:50,Event,Event Organizer and Promotor,Festival - Music,Guest,,,none,,3750000,1


> Interestingly, this event are the only events that are avaliable, and the price are also quite expensive. But I think this might be a reasonable price, if we see the categories. Therefore let's try to explore further, maybe I'll narrow it down to typical concert ticket price range.

In [65]:
df[(df['price']>=2000000) & (df['price']<4000000)]['event_category'].value_counts()

Networking - Conference                                     353
Conference                                                   25
Festival - Music                                              8
Seminar                                                       4
Workshop - Seminar - Exibition - Networking - Conference      1
Workshop - Travel - Seminar - Conference                      1
Name: event_category, dtype: int64

> By doing this method, we found that there are 8 similar **Festival - Music** categories. And 7 of them are placed in **Jakarta**. Therefore, we'll fill out this missing values as **Jakarta**.

In [66]:
df[(df['price']>=2000000) & (df['price']<4000000) & (df['event_category']=='Festival - Music')]

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
14209,29b6e82d15f0f2916603012e411fc051ffd5f846743e87...,066fd583bb78d0125199ab4a8b5e4efb2953f0a24e2a39...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2019-09-20 17:20:03,Event,Event Organizer and Promotor,Festival - Music,Guest,Jakarta,Jakarta,none,,2100000,2
14210,6ad74168b85bad3c51762d130a3b3717bef7e969822863...,066fd583bb78d0125199ab4a8b5e4efb2953f0a24e2a39...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2019-09-20 19:16:09,Event,Event Organizer and Promotor,Festival - Music,Guest,Jakarta,Jakarta,none,,2100000,4
14370,7f2e9b1590a2f0dd4632ea3f5eebae5fa393c866de12c3...,066fd583bb78d0125199ab4a8b5e4efb2953f0a24e2a39...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2019-09-24 07:58:23,Event,Event Organizer and Promotor,Festival - Music,Guest,Jakarta,Jakarta,none,,2100000,4
14420,e5a5c2b89a0ea5691fb41cce94cdade05865969a9b32f1...,066fd583bb78d0125199ab4a8b5e4efb2953f0a24e2a39...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2019-09-25 05:30:18,Event,Event Organizer and Promotor,Festival - Music,Guest,Jakarta,Jakarta,none,,2100000,4
18029,ef44cb258a8e237518afc2d97e6912d4a0720e24c1f377...,066fd583bb78d0125199ab4a8b5e4efb2953f0a24e2a39...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2019-11-13 14:55:37,Event,Event Organizer and Promotor,Festival - Music,Guest,Jakarta,Jakarta,none,,2100000,1
21800,20f1f78d5eb02b09a503b26e7294399b6b7b3ca7d1565f...,83cadc8cf1544e253e3e75fdf140d38badbc1461124e4b...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2020-02-15 06:08:50,Event,Event Organizer and Promotor,Festival - Music,Guest,,,none,,3750000,1
39311,3b525e5bd4143c0609e7385869dcecd16f1dd39590347d...,066fd583bb78d0125199ab4a8b5e4efb2953f0a24e2a39...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2019-09-24 17:49:46,Event,Event Organizer and Promotor,Festival - Music,Guest,Jakarta,Jakarta,none,,2100000,1
39385,871d9a4d6c9c4b21caeb18c52d8b64c588e67e19af8f6f...,066fd583bb78d0125199ab4a8b5e4efb2953f0a24e2a39...,bf98663c9884ad409695575399cd63a9caf2217c742ae7...,2019-09-26 03:53:06,Event,Event Organizer and Promotor,Festival - Music,Guest,Jakarta,Jakarta,none,,2100000,4


In [67]:
# Fill the null values

df[(df['price']>=2000000) & (df['price']<4000000) & (df['event_category']=='Festival - Music')].fillna(method = 'ffill', inplace=True)

In [68]:
# Merging to original dataframe

df['event_location'] = df['event_location'].fillna('Indonesia')

In [69]:
df.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        17192
event_location           0
gender                3827
age_when_register    31804
price                    0
total_ticket_sold        0
dtype: int64

> Now that's cleared up. We're going to move to **gender** feature.

In [70]:
df[df['gender'].isna()].groupby('user_type')['event_category'].value_counts()

user_type   event_category                                                                     
Guest       Webinar - Concert - Education                                                          3462
            Group Activities                                                                        160
            Webinar                                                                                  49
            Webinar - Game                                                                           13
            Music - Concert                                                                           7
            Music - Performance - Highschool Student - Pensi                                          4
            Workshop                                                                                  4
            Webinar - Tech Startup                                                                    3
            Nightlife - Music                                           

In [71]:
df[df['gender'].notna()].groupby('user_type')['event_category'].value_counts()

user_type   event_category                                           
Guest       Webinar                                                      3736
            Workshop                                                     2183
            Webinar - Tech Startup                                       1879
            Talkshow                                                     1241
            Seminar                                                      1165
                                                                         ... 
Registered  Workshop - Seminar - Talkshow - Education - Networking          1
            Workshop - Seminar - Talkshow - Networking - Conference         1
            Workshop - Seminar - Talkshow - Tech Startup - Networking       1
            Workshop - Seminar - Tech StArtup                               1
            Workshop - Webinar - Online Event                               1
Name: event_category, Length: 503, dtype: int64

In [72]:
df[df['gender'].notna()].groupby('event_category')['gender'].value_counts()

event_category                                                                                                  gender
Art                                                                                                             male        1
Art Exhibition - Family - Art - Exhibition - Group Activities - Museum - Photography - Art Space - Go to Malls  female    979
                                                                                                                none      776
                                                                                                                male       44
Bazaar - Music - Culture - Festival - University Student - Competition                                          none        2
                                                                                                                         ... 
Workshop - Webinar - Education                                                                                  male        8

In [73]:
df[df['gender'].notna()].groupby('user_type')['gender'].value_counts()

user_type   gender
Guest       none      23026
            male       2165
            female     1992
Registered  female    14580
            male       7600
            none        842
Name: gender, dtype: int64

In [74]:
# Fill guest user gender

gender_guest = df[(df['gender'].isna()) & (df['user_type']=='Guest')]

# Fill registered user gender

gender_regist = df[(df['gender'].isna()) & (df['user_type']!='Guest')]

In [75]:
gender_guest['gender'] = gender_guest['gender'].fillna('none')
gender_regist['gender'] = gender_regist['gender'].fillna('female')

In [76]:
gender_fills = pd.concat([gender_guest, gender_regist])

In [77]:
# Merging to original dataframe

df['gender'] = df['gender'].fillna(gender_fills['gender'])

In [78]:
df.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        17192
event_location           0
gender                   0
age_when_register    31804
price                    0
total_ticket_sold        0
dtype: int64

> Now we go to next feature **age when register**. We will implement the same method as before.

In [79]:
df[df['age_when_register'].isna()]['user_type'].value_counts()

Guest         30894
Registered      910
Name: user_type, dtype: int64

In [80]:
df[df['age_when_register'].notna()]['user_type'].value_counts()

Registered    22215
Guest            13
Name: user_type, dtype: int64

> From above information, we can see that the missing values in this category are dominated by the **Guest** users. There are only 13 values filled and more than 30000 users are leaving this feature blanks. We will try to explore and mitigate this below. 

In [81]:
df[(df['user_type']=='Registered') & (df['event_location'] != 'Indonesia')].groupby('age_when_register')['event_category'].value_counts()

age_when_register  event_category                                               
0.0                Webinar                                                           2
                   Online Event - Webinar - Tech Startup                             1
                   Seminar - University Student - Networking                         1
9.0                Music - Food - Fashion - Festival                                13
                   Travel - Group Activities - Water Park                           11
                                                                                    ..
89.0               Music - Food - Fashion - Festival                                 1
                   Seminar - Tech Startup                                            1
90.0               Seminar - Art - Culture - Conference - Networking - Education     1
                   Seminar - Tech Startup                                            1
120.0              Music - Concert               

> Seeing above data, it seems a little bit odd since there are user with 0 age, and user with 89 to 120 age. I'll try to explore it below.

In [82]:
df['age_when_register'].value_counts().head(20)

29.0    3386
17.0    3267
19.0    2198
18.0    1562
20.0    1094
21.0    1057
22.0     936
24.0     920
49.0     913
27.0     809
23.0     750
25.0     704
26.0     645
28.0     627
16.0     596
14.0     492
30.0     242
15.0     229
32.0     202
31.0     182
Name: age_when_register, dtype: int64

In [83]:
df['age_when_register'].value_counts(ascending=True).head(20)

63.0      1
3.0       1
61.0      1
56.0      1
58.0      1
72.0      1
120.0     1
57.0      2
90.0      2
55.0      2
54.0      3
62.0      3
59.0      3
11.0      4
89.0      5
0.0       6
52.0      8
45.0     15
53.0     16
10.0     16
Name: age_when_register, dtype: int64

In [84]:
df['age_when_register'].sort_values(ascending=False).head(15)

8502     120.0
5882      90.0
18555     90.0
915       89.0
37464     89.0
2033      89.0
1198      89.0
41450     89.0
30296     72.0
43301     63.0
51646     62.0
53303     62.0
21906     62.0
33166     61.0
12164     59.0
Name: age_when_register, dtype: float64

In [85]:
df['age_when_register'].sort_values(ascending=True).head(15)

23136    0.0
31331    0.0
24931    0.0
33261    0.0
31330    0.0
31327    0.0
52998    3.0
33260    9.0
37748    9.0
37680    9.0
38546    9.0
37678    9.0
11446    9.0
12183    9.0
20309    9.0
Name: age_when_register, dtype: float64

In [86]:
df[df['age_when_register']>65]

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
915,b2fb2c1d094c9fcdd279474b12170d648b70eeecc6fb87...,f4393b4283998e6d191a75984dc889277f0b245b673e5e...,82ae7d8f032587ee2cd1f43ef733b1176d02aa13dc7132...,2019-01-21 14:41:38,Event,Event Organizer,Workshop,Registered,Jakarta,Jakarta,female,89.0,50000,3
1198,b2fb2c1d094c9fcdd279474b12170d648b70eeecc6fb87...,c4b31210e5fecaffc236fee3a858e92a395e2a79b0866c...,82ae7d8f032587ee2cd1f43ef733b1176d02aa13dc7132...,2019-02-18 08:23:33,Event,Event Organizer,Workshop,Registered,Jakarta,Jakarta,female,89.0,50000,1
2033,b2fb2c1d094c9fcdd279474b12170d648b70eeecc6fb87...,c4b31210e5fecaffc236fee3a858e92a395e2a79b0866c...,82ae7d8f032587ee2cd1f43ef733b1176d02aa13dc7132...,2019-02-19 10:37:24,Event,Event Organizer,Workshop,Registered,Jakarta,Jakarta,female,89.0,50000,1
5882,2a216ca0f56f5d5e8d17586efc24b3dee1073c4d3ad8c9...,b5376c5e81bdd49acab2233e50d3fe4907332e72e8e5be...,181f3c6fb709aceefd52c408ed503d9f970211a11d8c61...,2019-07-28 09:54:32,Event,Event Organizer and Promotor,Seminar - Art - Culture - Conference - Network...,Registered,Tangerang,Jakarta,male,90.0,419000,1
8502,8c71a58e0bf9f82be88b17929c63384ba32b58a6be7a84...,b5bfc041db54a1f27c6f6563f1b42706e994b4b8f6e82d...,db5b1e7d8aabff82a502dfdac3f701cd5daed066c65766...,2019-08-25 23:09:25,Event,Event Organizer and Promotor,Music - Concert,Registered,Jakarta,Jakarta,female,120.0,0,1
18555,05c8dc87d4c865d05646617bfc4f0c42c4bb51ca5923a4...,8a8c40cba849b7f31635583539dbc80fc7469592387bfc...,dfc7c637af4a6e92c6a22e4dabed4f17b7a4be8ea8d567...,2019-12-04 14:03:27,Event,Event Organizer,Seminar - Tech Startup,Registered,Bandung,Bandung,male,90.0,0,1
30296,34bf65b3b0f5ab834f8961cf148a66d6d97cde49fd8804...,552dcd9035c4d520da6318e41c831953b3f84c4a7a9cfc...,39d13f95b71eb21f7b80bb1fcc3795548688fd1a8e8f27...,2020-06-12 06:42:14,Event,Event Organizer and Promotor,Webinar - Education,Registered,,Indonesia,male,72.0,0,1
37464,ddc23e9940f3894dae65cb130b90a0a15efc59b0f2899d...,9f51168136fa5fbef34f00b364328d9d6b393589fde866...,00426f657147432f752353af9b2db7f19b52beefb261f6...,2019-09-02 13:03:52,Event,Event Organizer,Music - Food - Fashion - Festival,Registered,Tangerang,Tangerang,female,89.0,25000,1
41450,6577762c90899409835d111cee022d0df2fdc0187db325...,8a8c40cba849b7f31635583539dbc80fc7469592387bfc...,dfc7c637af4a6e92c6a22e4dabed4f17b7a4be8ea8d567...,2019-12-04 10:18:22,Event,Event Organizer,Seminar - Tech Startup,Registered,Bandung,Bandung,male,89.0,0,1


In [87]:
len(df)

54032

In [88]:
# Drop value higher than 65 years

df = df.drop(df[df['age_when_register']>65].index)

In [89]:
len(df)

54023

In [90]:
df[df['age_when_register']<18]

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
122,973254b7d9f271787318df5dffd2cbb39137570517f8e5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-04 06:06:05,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,Jakarta,Jakarta,female,9.0,50000,3
286,01ec7f5c6166ecce5e79c05cc64b21e56a743d5543af8f...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-07 07:40:20,Event,Event Organizer,Wedding Expo - Exhibition,Registered,Jakarta,Jakarta,female,16.0,15000,1
287,01ec7f5c6166ecce5e79c05cc64b21e56a743d5543af8f...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-07 07:43:31,Event,Event Organizer,Wedding Expo - Exhibition,Registered,Jakarta,Jakarta,female,16.0,15000,1
438,fa85b7e099050ee35665fcd45d7dc1a802ce7379e3a9e8...,6283d5326523497874fd69beb5f4482054c44a08a0b28b...,88b11b0e7ba1e23b5af6878605cc5afb77b4d36c88db7a...,2019-01-10 11:40:59,Event,Event Organizer and Promotor,University Student - Seminar,Registered,Yogyakarta,Bandung,male,16.0,150000,1
488,2daae7d143097267afb41fbdb9f5779f8091c4dab45e2c...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-11 06:46:38,Event,Event Organizer,Wedding Expo - Exhibition,Registered,Jakarta,Jakarta,male,13.0,15000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53837,bb447d8f7fa559afd5832227ffedf31a1641321d15073a...,215cc964404a1c632341a439e448bd5f1fd4db8ef781bf...,edcd25f0614671feef2c5546910ce05a85396907974305...,2020-06-26 09:36:06,Event,Event Organizer and Promotor,Webinar,Registered,Jakarta,Indonesia,female,17.0,0,1
53838,bb447d8f7fa559afd5832227ffedf31a1641321d15073a...,215cc964404a1c632341a439e448bd5f1fd4db8ef781bf...,edcd25f0614671feef2c5546910ce05a85396907974305...,2020-06-26 09:36:29,Event,Event Organizer and Promotor,Webinar,Registered,Jakarta,Indonesia,female,17.0,0,1
53839,bb447d8f7fa559afd5832227ffedf31a1641321d15073a...,215cc964404a1c632341a439e448bd5f1fd4db8ef781bf...,edcd25f0614671feef2c5546910ce05a85396907974305...,2020-06-26 09:36:53,Event,Event Organizer and Promotor,Webinar,Registered,Jakarta,Indonesia,female,17.0,0,1
53963,d2dd6035c1f5df9ff9af2cae002cf3e1f79cb0cafed659...,0af23f9ed047e4068cfe24d0e75a7b5f641a9ae1ba0958...,c049c2410454768bc6f63910b4234ab909017bd06eea24...,2020-06-29 08:57:48,Event,Event Promotor,Webinar,Registered,,Jakarta,female,17.0,0,1


In [91]:
# Fill value lower than 18 to 18 based on TOC

df.loc[df['age_when_register']<18, 'age_when_register']=18

In [92]:
df[df['age_when_register']==18]

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
120,65bc210110ba137cd2a36c03104b2b1bbcdd286071da7a...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-04 05:42:16,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,Jakarta,Jakarta,female,18.0,50000,2
122,973254b7d9f271787318df5dffd2cbb39137570517f8e5...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-04 06:06:05,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,Jakarta,Jakarta,female,18.0,50000,3
286,01ec7f5c6166ecce5e79c05cc64b21e56a743d5543af8f...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-07 07:40:20,Event,Event Organizer,Wedding Expo - Exhibition,Registered,Jakarta,Jakarta,female,18.0,15000,1
287,01ec7f5c6166ecce5e79c05cc64b21e56a743d5543af8f...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-07 07:43:31,Event,Event Organizer,Wedding Expo - Exhibition,Registered,Jakarta,Jakarta,female,18.0,15000,1
390,a220b54c23a892a78ca07f2ff441ece11d0f2ca86c2ad6...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-09 17:07:04,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Registered,Lampung,Jakarta,female,18.0,50000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53982,24a966bb7fdd0e54529ad245776fd776e2469cf476f4c5...,13b9749de18eafbbedc6d3e04071f1b0c02e454b91a11c...,50540ee80dbc052ba69dbdfb8f61afc9b1cda2228d1926...,2020-06-29 12:06:54,Event,Community,Webinar,Registered,,Jakarta,female,18.0,0,1
53983,24a966bb7fdd0e54529ad245776fd776e2469cf476f4c5...,17a40b515b6a53a300ee81c87ccd3a35d33fdcc0bfa605...,98ed8887cc16897a4dd73a422e2a0ae27c93f76269f0cc...,2020-06-29 12:07:26,Event,Event Organizer and Promotor,Webinar - Education - Networking - Sport,Registered,,Bandung,female,18.0,0,1
53984,24a966bb7fdd0e54529ad245776fd776e2469cf476f4c5...,e8f3cb7ee3d9227a5300eb4546e1fded1c59cdd6801d64...,98ed8887cc16897a4dd73a422e2a0ae27c93f76269f0cc...,2020-06-29 12:07:56,Event,Event Organizer and Promotor,Webinar - Education - Networking - Seminar,Registered,,Jakarta,female,18.0,0,1
53994,5bacb5c19594bc0a0bf19b1402d09e35e84c7736b77846...,d161f9f74fa109cc6642db193903181cc6031357e71999...,d8c53de2e0a2cab13623ddec1b968f3a88db68d8eceef7...,2020-06-29 14:21:34,Event,Event Organizer and Promotor,Webinar,Registered,,Indonesia,male,18.0,0,1


In [93]:
df.groupby('event_id')['age_when_register'].value_counts()

event_id                                                          age_when_register
001a53ec299ccc475e255a8297c76a34a3ada44db051518ac05567cff904f4ff  25.0                 2
                                                                  18.0                 1
                                                                  19.0                 1
0076ded2b0908462ecf5753646cf20ab41b26b3ec9b144c39f90b40b941655a7  23.0                 2
                                                                  26.0                 2
                                                                                      ..
ffbfd3c819ce1dc283e6ed6cb1a7172f6f4c7e45eba417cc9bb16528f60e9be5  31.0                 1
                                                                  32.0                 1
                                                                  35.0                 1
                                                                  48.0                 1
                          

In [94]:
age_isna = df[df['age_when_register'].isna()]
age_notna = df[df['age_when_register'].notna()]

In [95]:
age_notna.groupby('event_id')['age_when_register'].value_counts().head(30)

event_id                                                          age_when_register
001a53ec299ccc475e255a8297c76a34a3ada44db051518ac05567cff904f4ff  25.0                 2
                                                                  18.0                 1
                                                                  19.0                 1
0076ded2b0908462ecf5753646cf20ab41b26b3ec9b144c39f90b40b941655a7  23.0                 2
                                                                  26.0                 2
                                                                  24.0                 1
                                                                  27.0                 1
                                                                  32.0                 1
015b8977189eb613d24ffaa9c53fbf9bcb42a31a005f5e11aea6e03eccc650a5  21.0                 1
                                                                  39.0                 1
017859023da7d47af5353d9dde

In [96]:
age_isna[age_isna['event_id'].isin(age_notna['event_id'])]

Unnamed: 0,user_id,event_id,organization_id,transaction_date,event_type,organization_type,event_category,user_type,user_location,event_location,gender,age_when_register,price,total_ticket_sold
0,4f3e46a6e94452b566553485b619d2f305bd9d875d4f6d...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:04:32,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,2
3,b1fc4cf5297d6639937d7806cf8ff626e4fdef6548c892...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:24:59,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,6
4,1da4ae70dde95b9d83c7586d1af62599cb01c4fc660f63...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 03:53:11,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,2
5,7dddb95cc3da557d3c0b993d4ad148182cdccd5833387e...,6629d5a73302852ef915dbda22fa1994213eb36e41c893...,9bace67f3bda20854668f8b8a6756d2541c6ec49223803...,2019-01-01 05:12:57,Event,Event Organizer,Art Exhibition - Family - Art - Exhibition - G...,Guest,Jakarta,Jakarta,none,,75000,4
6,ebee53b4d8aa0e1d82385d191b9a6b27c21125689f99f7...,42fbc58b0793f7a78da3aed5023efa91bdfbf219963614...,58b192b21f26e839d98afc2dce1e03e354756c73ed3f57...,2019-01-01 05:13:45,Event,Event Organizer,Wedding Expo - Exhibition,Guest,Jakarta,Jakarta,none,,15000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54025,2c44415066b2c29532d3a64ecb065bffc89474dc5ccb64...,215cc964404a1c632341a439e448bd5f1fd4db8ef781bf...,edcd25f0614671feef2c5546910ce05a85396907974305...,2020-06-30 10:16:43,Event,Event Organizer and Promotor,Webinar,Guest,,Indonesia,none,,0,1
54026,883e0a9bf3352ebb582b83db4e806702562ff243586f55...,98a5e21b6cdcaa5e7d83bdb710b437afdb98904c79a899...,307d540ca5f39eeeb489168598da8625d45ab0fa8fb14a...,2020-06-30 12:51:51,Event,Event Organizer and Promotor,Webinar - Game,Guest,,Indonesia,none,,0,1
54027,533449881c251c76163bcf060a0b6f5a086b563506d825...,215cc964404a1c632341a439e448bd5f1fd4db8ef781bf...,edcd25f0614671feef2c5546910ce05a85396907974305...,2020-06-30 13:06:28,Event,Event Organizer and Promotor,Webinar,Guest,,Indonesia,none,,0,1
54029,0c4a0bcdf8904e211ca9ef5ed3612f5239ff4ccf9fe21b...,091083ed26d4daa659a2b11d91dccc33d4048977ac0a62...,d54b1a5795940aeee05ad0d0232a74666ef1095ccec17a...,2020-06-30 16:15:58,Event,Event Organizer and Promotor,Webinar,Guest,,Indonesia,none,,50000,1


In [97]:
age = age_notna[age_notna['event_id'].isin(age_isna['event_id'])]

In [98]:
age['event_id'].value_counts().head(10)

c7463ebc8675f70869a51f85bc9b3a54dcb403a46b6d979a6ac8d7f925a553ad    2675
9f51168136fa5fbef34f00b364328d9d6b393589fde86616d489d95e1cfcc2df    2354
209839b3830376c3d0d56cc229843236518737303ccabcdd9c0c2199f27e34df    1412
0823bec079e5e7449851c82b4583fefa40e2dcc75c49494c593f3b1bb19919eb    1071
bbe4d5bcda2e9f0be5d73a917e00a660f62fb027cde81cde843101560789a00d    1044
6629d5a73302852ef915dbda22fa1994213eb36e41c893f4d9391f1e2215aee7    1010
7d23fdf40253bc8aab93fb807851daa96da6b1b02f6ea53fcf92e1991c5f76be     867
9441a919384943d46915a2efaaad637bb0ca08cf0de0d96a19597aa72ba8d6b2     431
f0dc0a7123bedcec5464c2518e47491fa6baa9970850c70fa5067200d9d7284d     350
b5376c5e81bdd49acab2233e50d3fe4907332e72e8e5bee67a7a059ad65a6ec7     345
Name: event_id, dtype: int64

In [99]:
# Locating event_id

age_evnt1 = age_isna[age_isna['event_id'] == 'c7463ebc8675f70869a51f85bc9b3a54dcb403a46b6d979a6ac8d7f925a553ad']
age_evnt2 = age_isna[age_isna['event_id'] == '9f51168136fa5fbef34f00b364328d9d6b393589fde86616d489d95e1cfcc2df']
age_evnt3 = age_isna[age_isna['event_id'] == '209839b3830376c3d0d56cc229843236518737303ccabcdd9c0c2199f27e34df']
age_evnt4 = age_isna[age_isna['event_id'] == '0823bec079e5e7449851c82b4583fefa40e2dcc75c49494c593f3b1bb19919eb']
age_evnt5 = age_isna[age_isna['event_id'] == 'bbe4d5bcda2e9f0be5d73a917e00a660f62fb027cde81cde843101560789a00d']
age_evnt6 = age_isna[age_isna['event_id'] == '6629d5a73302852ef915dbda22fa1994213eb36e41c893f4d9391f1e2215aee7']

In [100]:
# Measure filled values

age_fill1 = round(age[age['event_id']=='c7463ebc8675f70869a51f85bc9b3a54dcb403a46b6d979a6ac8d7f925a553ad']['age_when_register'].mean())
age_fill2 = round(age[age['event_id']=='9f51168136fa5fbef34f00b364328d9d6b393589fde86616d489d95e1cfcc2df']['age_when_register'].mean())
age_fill3 = round(age[age['event_id']=='209839b3830376c3d0d56cc229843236518737303ccabcdd9c0c2199f27e34df']['age_when_register'].mean())
age_fill4 = round(age[age['event_id']=='0823bec079e5e7449851c82b4583fefa40e2dcc75c49494c593f3b1bb19919eb']['age_when_register'].mean())
age_fill5 = round(age[age['event_id']=='bbe4d5bcda2e9f0be5d73a917e00a660f62fb027cde81cde843101560789a00d']['age_when_register'].mean())
age_fill6 = round(age[age['event_id']=='6629d5a73302852ef915dbda22fa1994213eb36e41c893f4d9391f1e2215aee7']['age_when_register'].mean())

In [101]:
# Filling missing value

age_evnt1['age_when_register'] = age_evnt1['age_when_register'].fillna(age_fill1)
age_evnt2['age_when_register'] = age_evnt2['age_when_register'].fillna(age_fill2)
age_evnt3['age_when_register'] = age_evnt3['age_when_register'].fillna(age_fill3)
age_evnt4['age_when_register'] = age_evnt4['age_when_register'].fillna(age_fill4)
age_evnt5['age_when_register'] = age_evnt5['age_when_register'].fillna(age_fill5)
age_evnt6['age_when_register'] = age_evnt6['age_when_register'].fillna(age_fill6)

In [102]:
age_filled = pd.concat([age_evnt1, age_evnt2, age_evnt3, age_evnt4, age_evnt5, age_evnt6])

In [103]:
age_isna['age_when_register'] = age_isna['age_when_register'].fillna(age_filled['age_when_register'])

In [104]:
age_fill = pd.concat([age_isna, age_notna])

In [105]:
# Merging to original dataframe

df['age_when_register'] = df['age_when_register'].fillna(age_fill['age_when_register'])

In [106]:
df.isna().sum()

user_id                  0
event_id                 0
organization_id          0
transaction_date         0
event_type               0
organization_type        0
event_category           0
user_type                0
user_location        17191
event_location           0
gender                   0
age_when_register    28876
price                    0
total_ticket_sold        0
dtype: int64

In [107]:
df[df['age_when_register'].isna()]['user_type'].value_counts()

Guest         28037
Registered      839
Name: user_type, dtype: int64

In [108]:
df[df['age_when_register'].notna()]['user_type'].value_counts()

Registered    22277
Guest          2870
Name: user_type, dtype: int64

In [109]:
pd.DataFrame({'dataFeatures' : df.columns, 'dataType' : df.dtypes.values, 
              'null' : [df[i].isna().sum() for i in df.columns],
              'nullPct' : [((df[i].isna().sum()/len(df[i]))*100).round(1) for i in df.columns],
              'Nunique' : [df[i].nunique() for i in df.columns],
              'uniqueSample' : [list(pd.Series(df[i].unique()).sample()) for i in df.columns]}).reset_index(drop = True)

Unnamed: 0,dataFeatures,dataType,null,nullPct,Nunique,uniqueSample
0,user_id,object,0,0.0,32171,[c74259c0557fd505c19b66142a29e5d5e610ee565eca3...
1,event_id,object,0,0.0,811,[07a2d00608bde67f29b56ece7ce7f129a19c5a1d6596b...
2,organization_id,object,0,0.0,274,[9000a2da973b96eae18bba26566e768e5daf64fd228d8...
3,transaction_date,object,0,0.0,49299,[2019-03-08 07:21:51]
4,event_type,object,0,0.0,1,[Event]
5,organization_type,object,0,0.0,7,[Event Organizer]
6,event_category,object,0,0.0,296,[Travel - Adventure]
7,user_type,object,0,0.0,2,[Guest]
8,user_location,object,17191,31.8,37,[Balikpapan]
9,event_location,object,0,0.0,23,[Madiun]


> Now we will fix the other datatype.

In [110]:
df[df['user_type']=='Guest']['age_when_register'].median()

21.0

In [111]:
# Fill the rest of missing values

df['user_location'] = df['user_location'].fillna('Unknown')
df['age_when_register'] = df['age_when_register'].fillna(21)

In [112]:
# transaction date

df['transaction_date'] = pd.to_datetime(df['transaction_date'])

In [113]:
# age when register

df['age_when_register'] = df['age_when_register'].astype('int64')

In [114]:
# price 

df['price'] = df['price'].astype(float)

In [115]:
pd.DataFrame({'dataFeatures' : df.columns, 'dataType' : df.dtypes.values, 
              'null' : [df[i].isna().sum() for i in df.columns],
              'nullPct' : [((df[i].isna().sum()/len(df[i]))*100).round(1) for i in df.columns],
              'Nunique' : [df[i].nunique() for i in df.columns],
              'uniqueSample' : [list(pd.Series(df[i].unique()).sample()) for i in df.columns]}).reset_index(drop = True)

Unnamed: 0,dataFeatures,dataType,null,nullPct,Nunique,uniqueSample
0,user_id,object,0,0.0,32171,[18052f82ea9700978f93bf06b0eeb50613ac476d16609...
1,event_id,object,0,0.0,811,[bfb01498d470b4af8cf39f9ad81adec8df953b361c57f...
2,organization_id,object,0,0.0,274,[1c0b8a0321e01fe2082f5cf29cbced800b5e7ba60e077...
3,transaction_date,datetime64[ns],0,0.0,49299,[2020-02-09 06:34:59]
4,event_type,object,0,0.0,1,[Event]
5,organization_type,object,0,0.0,7,[Event Organizer and Promotor]
6,event_category,object,0,0.0,296,[Seminar - Education - Talkshow]
7,user_type,object,0,0.0,2,[Guest]
8,user_location,object,0,0.0,38,[Makassar]
9,event_location,object,0,0.0,23,[Medan]


In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54023 entries, 0 to 54031
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   user_id            54023 non-null  object        
 1   event_id           54023 non-null  object        
 2   organization_id    54023 non-null  object        
 3   transaction_date   54023 non-null  datetime64[ns]
 4   event_type         54023 non-null  object        
 5   organization_type  54023 non-null  object        
 6   event_category     54023 non-null  object        
 7   user_type          54023 non-null  object        
 8   user_location      54023 non-null  object        
 9   event_location     54023 non-null  object        
 10  gender             54023 non-null  object        
 11  age_when_register  54023 non-null  int64         
 12  price              54023 non-null  float64       
 13  total_ticket_sold  54023 non-null  int64         
dtypes: dat

> Now that the data have been properly cleaned. I'll pass it into new dataframe to later use it for EDA.

In [117]:
# Pass cleaned data to new dataframe

df.to_csv('dataset\evnts_clean.csv', index=False)

In [118]:
df_new = pd.read_csv('dataset\evnts_clean.csv')

In [120]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54023 entries, 0 to 54022
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   user_id            54023 non-null  object 
 1   event_id           54023 non-null  object 
 2   organization_id    54023 non-null  object 
 3   transaction_date   54023 non-null  object 
 4   event_type         54023 non-null  object 
 5   organization_type  54023 non-null  object 
 6   event_category     54023 non-null  object 
 7   user_type          54023 non-null  object 
 8   user_location      54023 non-null  object 
 9   event_location     54023 non-null  object 
 10  gender             54023 non-null  object 
 11  age_when_register  54023 non-null  int64  
 12  price              54023 non-null  float64
 13  total_ticket_sold  54023 non-null  int64  
dtypes: float64(1), int64(2), object(11)
memory usage: 5.8+ MB
