# Analisis Kepadatan Penumpang KRL Rute Rangkasbitung - Tanah Abang dengan Pendekatan Exploratory Data Analysis

### Collecting Data

#### Import Library

In [1]:
import pandas as pd
import numpy as np
from glob import glob

#### Mengubah CSV files menjadi DataFrame

Mengkombinasikan beberapa CSV files

In [2]:
# Collect the separate CSV exports into a single list named stock_files,
# ordered alphabetically by path.
# All files to be merged must share the same 'RESULT PART <n>.csv' filename pattern.
stock_files = glob('./dataset/RESULT PART *.csv')
stock_files.sort()
stock_files

['./dataset\\RESULT PART I.csv',
 './dataset\\RESULT PART II.csv',
 './dataset\\RESULT PART III.csv']

In [3]:
# Read every CSV listed in stock_files and stack the results into one DataFrame.
# ignore_index=True discards the per-file row labels and renumbers the rows.
krl = pd.concat(map(pd.read_csv, stock_files), ignore_index=True)
krl

Unnamed: 0,t_d_gatein_gateout_arah_perminutes_result,c_station,d_date,t_minute,vol_in,vol_out,selisih,arah
0,1,CCY,1/1/2020,00:00,0,0,0,\tRK
1,2,CCY,1/1/2020,00:00,1,0,1,THB
2,3,CCY,1/1/2020,00:01,0,0,0,\tRK
3,4,CCY,1/1/2020,00:01,0,0,0,THB
4,5,CCY,1/1/2020,00:02,0,0,0,\tRK
...,...,...,...,...,...,...,...,...
3296007,3296008,TIG,29/2/2020,23:57,0,0,0,THB
3296008,3296009,TIG,29/2/2020,23:58,0,0,0,\tRK
3296009,3296010,TIG,29/2/2020,23:58,0,0,0,THB
3296010,3296011,TIG,29/2/2020,23:59,0,0,0,\tRK


### Preprocessing Data

Menghapus Kolom

In [4]:
# Remove the 't_d_gatein_gateout_arah_perminutes_result' column.
krl = krl.drop(columns=['t_d_gatein_gateout_arah_perminutes_result'])

# Preview the first 5 rows.
krl.head()

Unnamed: 0,c_station,d_date,t_minute,vol_in,vol_out,selisih,arah
0,CCY,1/1/2020,00:00,0,0,0,\tRK
1,CCY,1/1/2020,00:00,1,0,1,THB
2,CCY,1/1/2020,00:01,0,0,0,\tRK
3,CCY,1/1/2020,00:01,0,0,0,THB
4,CCY,1/1/2020,00:02,0,0,0,\tRK


In [5]:
# Remove the 'selisih' column.
krl = krl.drop(columns=['selisih'])

# Preview the first 5 rows.
krl.head()

Unnamed: 0,c_station,d_date,t_minute,vol_in,vol_out,arah
0,CCY,1/1/2020,00:00,0,0,\tRK
1,CCY,1/1/2020,00:00,1,0,THB
2,CCY,1/1/2020,00:01,0,0,\tRK
3,CCY,1/1/2020,00:01,0,0,THB
4,CCY,1/1/2020,00:02,0,0,\tRK


Membersihkan whitespace

In [6]:
# Trim leading and trailing whitespace from the c_station values.
krl = krl.assign(c_station=krl['c_station'].str.strip())
krl

Unnamed: 0,c_station,d_date,t_minute,vol_in,vol_out,arah
0,CCY,1/1/2020,00:00,0,0,\tRK
1,CCY,1/1/2020,00:00,1,0,THB
2,CCY,1/1/2020,00:01,0,0,\tRK
3,CCY,1/1/2020,00:01,0,0,THB
4,CCY,1/1/2020,00:02,0,0,\tRK
...,...,...,...,...,...,...
3296007,TIG,29/2/2020,23:57,0,0,THB
3296008,TIG,29/2/2020,23:58,0,0,\tRK
3296009,TIG,29/2/2020,23:58,0,0,THB
3296010,TIG,29/2/2020,23:59,0,0,\tRK


In [7]:
# Remove the tab character from the 'arah' values (the raw data holds '\tRK').
# regex=False forces a literal substring replacement, so the result does not
# depend on the installed pandas version's default for the `regex` argument.
krl['arah'] = krl['arah'].str.replace('\t', '', regex=False)
krl

Unnamed: 0,c_station,d_date,t_minute,vol_in,vol_out,arah
0,CCY,1/1/2020,00:00,0,0,RK
1,CCY,1/1/2020,00:00,1,0,THB
2,CCY,1/1/2020,00:01,0,0,RK
3,CCY,1/1/2020,00:01,0,0,THB
4,CCY,1/1/2020,00:02,0,0,RK
...,...,...,...,...,...,...
3296007,TIG,29/2/2020,23:57,0,0,THB
3296008,TIG,29/2/2020,23:58,0,0,RK
3296009,TIG,29/2/2020,23:58,0,0,THB
3296010,TIG,29/2/2020,23:59,0,0,RK


In [8]:
# Print a summary of the dataset: index range, column names, dtypes and memory usage.
krl.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3296012 entries, 0 to 3296011
Data columns (total 6 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   c_station  object
 1   d_date     object
 2   t_minute   object
 3   vol_in     int64 
 4   vol_out    int64 
 5   arah       object
dtypes: int64(2), object(4)
memory usage: 150.9+ MB


Menggabungkan kolom `d_date` dan `t_minute` menjadi satu, lalu menyimpan hasil penggabungan tersebut dalam kolom baru bernama `timestamps`

In [9]:
# Build the 'timestamps' column by joining the date string and the time string
# with a single space, e.g. '1/1/2020' and '00:00' become '1/1/2020 00:00'.
krl['timestamps'] = krl['d_date'].str.cat(krl['t_minute'], sep=' ')
krl

Unnamed: 0,c_station,d_date,t_minute,vol_in,vol_out,arah,timestamps
0,CCY,1/1/2020,00:00,0,0,RK,1/1/2020 00:00
1,CCY,1/1/2020,00:00,1,0,THB,1/1/2020 00:00
2,CCY,1/1/2020,00:01,0,0,RK,1/1/2020 00:01
3,CCY,1/1/2020,00:01,0,0,THB,1/1/2020 00:01
4,CCY,1/1/2020,00:02,0,0,RK,1/1/2020 00:02
...,...,...,...,...,...,...,...
3296007,TIG,29/2/2020,23:57,0,0,THB,29/2/2020 23:57
3296008,TIG,29/2/2020,23:58,0,0,RK,29/2/2020 23:58
3296009,TIG,29/2/2020,23:58,0,0,THB,29/2/2020 23:58
3296010,TIG,29/2/2020,23:59,0,0,RK,29/2/2020 23:59


Membuang kolom dan baris yang tidak digunakan

In [10]:
# Remove the 'd_date' column; its content is already part of 'timestamps'.
krl = krl.drop(columns=['d_date'])

# Preview the first 5 rows.
krl.head()

Unnamed: 0,c_station,t_minute,vol_in,vol_out,arah,timestamps
0,CCY,00:00,0,0,RK,1/1/2020 00:00
1,CCY,00:00,1,0,THB,1/1/2020 00:00
2,CCY,00:01,0,0,RK,1/1/2020 00:01
3,CCY,00:01,0,0,THB,1/1/2020 00:01
4,CCY,00:02,0,0,RK,1/1/2020 00:02


In [11]:
# Remove the 't_minute' column; its content is already part of 'timestamps'.
krl = krl.drop(columns=['t_minute'])

# Preview the first 5 rows.
krl.head()

Unnamed: 0,c_station,vol_in,vol_out,arah,timestamps
0,CCY,0,0,RK,1/1/2020 00:00
1,CCY,1,0,THB,1/1/2020 00:00
2,CCY,0,0,RK,1/1/2020 00:01
3,CCY,0,0,THB,1/1/2020 00:01
4,CCY,0,0,RK,1/1/2020 00:02


In [12]:
# Drop every row whose 'arah' value is 'RK': collect the labels of those rows
# and remove them. The remaining rows keep their original row labels.
krl_df = krl.drop(index=krl.index[krl['arah'].eq('RK')])
krl_df

Unnamed: 0,c_station,vol_in,vol_out,arah,timestamps
1,CCY,1,0,THB,1/1/2020 00:00
3,CCY,0,0,THB,1/1/2020 00:01
5,CCY,0,0,THB,1/1/2020 00:02
7,CCY,0,0,THB,1/1/2020 00:03
9,CCY,0,0,THB,1/1/2020 00:04
...,...,...,...,...,...
3296003,TIG,0,0,THB,29/2/2020 23:55
3296005,TIG,0,0,THB,29/2/2020 23:56
3296007,TIG,0,0,THB,29/2/2020 23:57
3296009,TIG,0,0,THB,29/2/2020 23:58


Mengubah tipe data kolom `timestamps` menjadi datetime, lalu menjadikannya index DataFrame

In [13]:
# Convert the 'timestamps' strings to datetime values.
# The explicit format is day/month/year followed by hour:minute.
krl_df = krl_df.assign(
    timestamps=pd.to_datetime(krl_df['timestamps'], format='%d/%m/%Y %H:%M')
)
krl_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1654412 entries, 1 to 3296011
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   c_station   1654412 non-null  object        
 1   vol_in      1654412 non-null  int64         
 2   vol_out     1654412 non-null  int64         
 3   arah        1654412 non-null  object        
 4   timestamps  1654412 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 75.7+ MB


In [14]:
# Display the frame to inspect the converted 'timestamps' column.
krl_df

Unnamed: 0,c_station,vol_in,vol_out,arah,timestamps
1,CCY,1,0,THB,2020-01-01 00:00:00
3,CCY,0,0,THB,2020-01-01 00:01:00
5,CCY,0,0,THB,2020-01-01 00:02:00
7,CCY,0,0,THB,2020-01-01 00:03:00
9,CCY,0,0,THB,2020-01-01 00:04:00
...,...,...,...,...,...
3296003,TIG,0,0,THB,2020-02-29 23:55:00
3296005,TIG,0,0,THB,2020-02-29 23:56:00
3296007,TIG,0,0,THB,2020-02-29 23:57:00
3296009,TIG,0,0,THB,2020-02-29 23:58:00


In [15]:
# membuat timestamps sebagai index
vis = krl_df = krl_df.set_index('timestamps')
vis.head(3)

Unnamed: 0_level_0,c_station,vol_in,vol_out,arah
timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01 00:00:00,CCY,1,0,THB
2020-01-01 00:01:00,CCY,0,0,THB
2020-01-01 00:02:00,CCY,0,0,THB


In [16]:
# Display the timestamp-indexed frame.
vis

Unnamed: 0_level_0,c_station,vol_in,vol_out,arah
timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01 00:00:00,CCY,1,0,THB
2020-01-01 00:01:00,CCY,0,0,THB
2020-01-01 00:02:00,CCY,0,0,THB
2020-01-01 00:03:00,CCY,0,0,THB
2020-01-01 00:04:00,CCY,0,0,THB
...,...,...,...,...
2020-02-29 23:55:00,TIG,0,0,THB
2020-02-29 23:56:00,TIG,0,0,THB
2020-02-29 23:57:00,TIG,0,0,THB
2020-02-29 23:58:00,TIG,0,0,THB


Menyimpan hasil `set_index('timestamps')` ke dalam DataFrame `vis`, lalu mengekspornya ke dalam berkas CSV

In [None]:
# Export the timestamp-indexed frame to CSV; the index is written out as well.
# NOTE(review): presumably this file is the input of the follow-up notebook - confirm.
vis.to_csv(path_or_buf='./dataset/timestamps_krl.csv')

#### berlanjut ke notebook tubes2.ipynb...