# Basic Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Sample records for the first DataFrame: one (name, age, city) entry per person,
# converted to lists so `data` stays a list of lists.
data = [
    list(record)
    for record in (
        ("Rosi", 25, "Jakarta"),
        ("Selvy", 24, "Bandung"),
        ("Rita", 27, "Malang"),
        ("Sony", 26, "Bandung"),
    )
]

In [3]:
# Build the frame with explicit column names and a letter index (A-D)
# so rows can be addressed by label later on.
df = pd.DataFrame(
    data,
    index=list('ABCD'),
    columns=['nama', 'usia', 'kota'],
)
df

Unnamed: 0,nama,usia,kota
A,Rosi,25,Jakarta
B,Selvy,24,Bandung
C,Rita,27,Malang
D,Sony,26,Bandung


In [8]:
df['nama']

A     Rosi
B    Selvy
C     Rita
D     Sony
Name: nama, dtype: object

In [9]:
df[['nama', 'kota']]

Unnamed: 0,nama,kota
A,Rosi,Jakarta
B,Selvy,Bandung
C,Rita,Malang
D,Sony,Bandung


In [11]:
# Single-step label lookup (row 'B', column 'nama') instead of chained indexing.
df.loc['B', 'nama']

'Selvy'

In [14]:
# Alternative: assigning a scalar broadcasts the same value to every row.
# df['gaji'] = 20000000
# Assigning a list sets one salary per row, in index order.
df['gaji'] = [20000000, 15000000, 13000000, 30000000]

In [15]:
df

Unnamed: 0,nama,usia,kota,gaji
A,Rosi,25,Jakarta,20000000
B,Selvy,24,Bandung,15000000
C,Rita,27,Malang,13000000
D,Sony,26,Bandung,30000000


In [None]:
# NumPy helpers commonly used to generate values for new columns:
# np.nan            -> marker for a missing value
# np.linspace       -> evenly spaced values over an interval
# np.arange         -> evenly spaced values with a fixed step
# np.random.randint -> random integers
# np.random         -> random sampling module
# np.logspace       -> values evenly spaced on a log scale

In [18]:
df['gaji'].mean()

19500000.0

In [19]:
df['gaji'].median()

17500000.0

In [20]:
df['gaji'].sum()

78000000

In [22]:
df['gaji'].std()

7593857.166596345

In [25]:
# pandas .std() defaults to the sample standard deviation (ddof=1, divides by n - 1).
# NumPy np.std() defaults to the population standard deviation (ddof=0, divides by n).

# In notation: pandas reports s (sample), NumPy reports sigma (population).

In [24]:
df.describe()

# Exercise: if the 'gaji' column held only one value, what would its std be?
# (pandas uses ddof=1, so the std of a single observation is NaN.)

Unnamed: 0,usia,gaji
count,4.0,4.0
mean,25.5,19500000.0
std,1.290994,7593857.0
min,24.0,13000000.0
25%,24.75,14500000.0
50%,25.5,17500000.0
75%,26.25,22500000.0
max,27.0,30000000.0


In [29]:
data = np.random.randn(5,4)

# np.random.rand  -> uniform samples in the half-open interval [0, 1)
# np.random.randn -> samples from the standard normal distribution (mean 0, std 1);
#                    most values fall between -3 and 3, but the range is not bounded
# NOTE(review): no seed is set, so these values change on every run.

In [30]:
data

array([[ 1.28444497, -0.29603684, -0.9079835 , -1.16755479],
       [-0.41979232,  0.72392747,  1.03608883, -1.34316703],
       [ 1.89336334,  0.7027015 ,  1.23568426, -1.45947893],
       [ 2.30248672,  0.11350007,  0.71224314,  0.83641139],
       [ 0.90659413, -0.74322109,  0.75367363, -0.84577748]])

### Accessing Data

In [31]:
# Wrap the random matrix in a DataFrame with columns W-Z and row labels A-E.
df = pd.DataFrame(data, index=list('ABCDE'), columns=list('WXYZ'))
df

Unnamed: 0,W,X,Y,Z
A,1.284445,-0.296037,-0.907984,-1.167555
B,-0.419792,0.723927,1.036089,-1.343167
C,1.893363,0.702701,1.235684,-1.459479
D,2.302487,0.1135,0.712243,0.836411
E,0.906594,-0.743221,0.753674,-0.845777


In [32]:
df[['X', 'Y']]

Unnamed: 0,X,Y
A,-0.296037,-0.907984
B,0.723927,1.036089
C,0.702701,1.235684
D,0.1135,0.712243
E,-0.743221,0.753674


### Add Columns

In [33]:
# Add Columns
df['col_1'] = df['X'] * df['Y']
df

Unnamed: 0,W,X,Y,Z,col_1
A,1.284445,-0.296037,-0.907984,-1.167555,0.268797
B,-0.419792,0.723927,1.036089,-1.343167,0.750053
C,1.893363,0.702701,1.235684,-1.459479,0.868317
D,2.302487,0.1135,0.712243,0.836411,0.08084
E,0.906594,-0.743221,0.753674,-0.845777,-0.560146


In [36]:
# Squares of 1..n, sized from the frame itself so the assignment keeps working
# if the number of rows changes (the list length must match the row count).
df['col_2'] = [i ** 2 for i in range(1, len(df) + 1)]
df

Unnamed: 0,W,X,Y,Z,col_1,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,0.268797,1
B,-0.419792,0.723927,1.036089,-1.343167,0.750053,4
C,1.893363,0.702701,1.235684,-1.459479,0.868317,9
D,2.302487,0.1135,0.712243,0.836411,0.08084,16
E,0.906594,-0.743221,0.753674,-0.845777,-0.560146,25


### Remove Data

In [37]:
df.drop('C')

Unnamed: 0,W,X,Y,Z,col_1,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,0.268797,1
B,-0.419792,0.723927,1.036089,-1.343167,0.750053,4
D,2.302487,0.1135,0.712243,0.836411,0.08084,16
E,0.906594,-0.743221,0.753674,-0.845777,-0.560146,25


In [38]:
df.drop('col_2', axis=1)

Unnamed: 0,W,X,Y,Z,col_1
A,1.284445,-0.296037,-0.907984,-1.167555,0.268797
B,-0.419792,0.723927,1.036089,-1.343167,0.750053
C,1.893363,0.702701,1.235684,-1.459479,0.868317
D,2.302487,0.1135,0.712243,0.836411,0.08084
E,0.906594,-0.743221,0.753674,-0.845777,-0.560146


In [39]:
df.drop('C', axis=0)

Unnamed: 0,W,X,Y,Z,col_1,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,0.268797,1
B,-0.419792,0.723927,1.036089,-1.343167,0.750053,4
D,2.302487,0.1135,0.712243,0.836411,0.08084,16
E,0.906594,-0.743221,0.753674,-0.845777,-0.560146,25


In [None]:
# Secara default, nilai axis adalah 0.
#
# axis=0 => baris
# axis=1 => kolom

In [41]:
df.drop(columns=['col_1', 'X'])

Unnamed: 0,W,Y,Z,col_2
A,1.284445,-0.907984,-1.167555,1
B,-0.419792,1.036089,-1.343167,4
C,1.893363,1.235684,-1.459479,9
D,2.302487,0.712243,0.836411,16
E,0.906594,0.753674,-0.845777,25


Agar fungsi mengubah data yang ada, pilih salah satu:
- assign hasilnya kembali ke nama DataFrame asli, mis. `df = df.drop(...)`
- gunakan parameter `inplace=True`

In [42]:
df_del = df.drop(columns='col_1')
df_del

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1
B,-0.419792,0.723927,1.036089,-1.343167,4
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,-0.743221,0.753674,-0.845777,25


In [43]:
# inplace=True mutates df directly (the call itself returns None).
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because 'col_1' has already been removed.
df.drop(columns='col_1', inplace=True, errors='ignore')
df

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1
B,-0.419792,0.723927,1.036089,-1.343167,4
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,-0.743221,0.753674,-0.845777,25


In [44]:
df['X']

A   -0.296037
B    0.723927
C    0.702701
D    0.113500
E   -0.743221
Name: X, dtype: float64

In [46]:
# Rows 'B' and 'D' of column 'X' in one .loc call instead of chained indexing.
df.loc[['B', 'D'], 'X']

B    0.723927
D    0.113500
Name: X, dtype: float64

In [53]:
df

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1
B,-0.419792,0.723927,1.036089,-1.343167,4
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,-0.743221,0.753674,-0.845777,25


### Accessing Data Rows
`.loc` digunakan untuk mengakses baris data menggunakan nama/label index

In [57]:
# Select a single row by its index label; the result is a Series keyed by column name.
# The saved output for this cell is row 'A', so the code now matches what is displayed.
df.loc['A']

W        1.284445
X       -0.296037
Y       -0.907984
Z       -1.167555
col_2    1.000000
Name: A, dtype: float64

In [50]:
df.loc['E']

W         0.906594
X        -0.743221
Y         0.753674
Z        -0.845777
col_2    25.000000
Name: E, dtype: float64

`.iloc` digunakan untuk mengakses baris data menggunakan **Urutan Index**

In [51]:
df.iloc[2]

W        1.893363
X        0.702701
Y        1.235684
Z       -1.459479
col_2    9.000000
Name: C, dtype: float64

In [52]:
df.iloc[4]

W         0.906594
X        -0.743221
Y         0.753674
Z        -0.845777
col_2    25.000000
Name: E, dtype: float64

In [58]:
df.loc[['C', 'D']]

Unnamed: 0,W,X,Y,Z,col_2
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16


In [59]:
# Select rows and columns together with .loc[rows, columns] rather than chaining two lookups.
df.loc[['C', 'D'], ['Y', 'Z']]

Unnamed: 0,Y,Z
C,1.235684,-1.459479
D,0.712243,0.836411


In [61]:
# Same selection in one step; rows come back in the order requested ('D' before 'C').
df.loc[['D', 'C'], ['Y', 'Z']]

Unnamed: 0,Y,Z
D,0.712243,0.836411
C,1.235684,-1.459479


In [None]:
# List = Slicing
# pandas = Subsetting
## Conditional Subsetting

### Subsetting

In [63]:
df[ df > 0 ]
# Cells that fail the condition are shown as NaN (Not a Number), i.e. missing values.

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,,,,1
B,,0.723927,1.036089,,4
C,1.893363,0.702701,1.235684,,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,,0.753674,,25


In [64]:
df1 = df[df>0]

In [67]:
df

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1
B,-0.419792,0.723927,1.036089,-1.343167,4
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,-0.743221,0.753674,-0.845777,25


In [72]:
df.iloc[1:5][['X', 'Z']]

Unnamed: 0,X,Z
B,0.723927,-1.343167
C,0.702701,-1.459479
D,0.1135,0.836411
E,-0.743221,-0.845777


In [75]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,-0.743221,0.753674,-0.845777,25


In [77]:
# Boolean row filter and column selection in a single .loc call.
df.loc[df['W'] > 0, ['Y', 'Z']]

Unnamed: 0,Y,Z
A,-0.907984,-1.167555
C,1.235684,-1.459479
D,0.712243,0.836411
E,0.753674,-0.845777


In [81]:
# Filter rows where W > 0 and keep columns Y/Z in one step, then pick rows 'C' and 'D'
# from the filtered result (this raises KeyError if either label was filtered out).
df.loc[df['W'] > 0, ['Y', 'Z']].loc[['C', 'D']]

Unnamed: 0,Y,Z
C,1.235684,-1.459479
D,0.712243,0.836411


In [82]:
df

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1
B,-0.419792,0.723927,1.036089,-1.343167,4
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,-0.743221,0.753674,-0.845777,25


In [83]:
df[df['Y'] < 0]

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1


In [84]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1
C,1.893363,0.702701,1.235684,-1.459479,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,-0.743221,0.753674,-0.845777,25


In [88]:
df[(df['W'] > 0) & (df['Y'] < 0)]

# Combining conditions when subsetting:
# 1) use the element-wise operators & (and) / | (or) instead of the `and` / `or` keywords
# 2) wrap each individual condition in parentheses

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,-0.296037,-0.907984,-1.167555,1


In [91]:
# Rows where W > 0 or Y < 0, restricted to columns X/Z in the same .loc call,
# then narrowed to the labels 'C' and 'D'.
df.loc[(df['W'] > 0) | (df['Y'] < 0), ['X', 'Z']].loc[['C', 'D']]

Unnamed: 0,X,Z
C,0.702701,-1.459479
D,0.1135,0.836411


In [92]:
df1

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,,,,1
B,,0.723927,1.036089,,4
C,1.893363,0.702701,1.235684,,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,,0.753674,,25


### Handling NaN

In [94]:
df1.isna().sum()

W        1
X        2
Y        1
Z        4
col_2    0
dtype: int64

### Technical Approach

- `.dropna()` ==> menghapus data NaN (baris)
- `.fillna()` ==> membuat/mengisi dengan data sintetis

In [97]:
df1

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,,,,1
B,,0.723927,1.036089,,4
C,1.893363,0.702701,1.235684,,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,,0.753674,,25


In [96]:
df1.dropna()

Unnamed: 0,W,X,Y,Z,col_2
D,2.302487,0.1135,0.712243,0.836411,16


In [100]:
df1.fillna(value='empty data')

Unnamed: 0,W,X,Y,Z,col_2
A,1.28444,empty data,empty data,empty data,1
B,empty data,0.723927,1.03609,empty data,4
C,1.89336,0.702701,1.23568,empty data,9
D,2.30249,0.1135,0.712243,0.836411,16
E,0.906594,empty data,0.753674,empty data,25


In [102]:
df1['W'] = df1['W'].fillna(df1['W'].mean())
df1

Unnamed: 0,W,X,Y,Z,col_2
A,1.284445,,,,1
B,1.596722,0.723927,1.036089,,4
C,1.893363,0.702701,1.235684,,9
D,2.302487,0.1135,0.712243,0.836411,16
E,0.906594,,0.753674,,25


# EDA with dataset

In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##### Handling Outliers ==> DROP or KEEP

1. Kesalahan Input: kesalahan input usia typo 2500 - seharusnya 25 --> DROP

2. 15 - 20 - 10 - 9 - 12 - 250 - 175 - 33 - 12 - 10 - 300 --> formula membuat lele besar, angka besar yang justru dicari. di sini Outliers yang datanya justru dipakai

3. Data Benar: Harga Mobil
    Berdasarkan kebutuhan
    - Analisa Mobil Rakyat ==> mobil premium di Drop
    - Analisa Mobil Premium ==> mobil rakyat di Drop
    - Analisa keduanya ==> 
      - keduanya di KEEP. 
      - Tapi tidak bisa dianalisa bersama, harus dibuat 2 dataframe berbeda
          1. df_rakyat = df[df['harga'] < 1M]
          2. df_premium = df[df['harga'] >= 1M]

##### Handling Missing Values ==> DROP or FILL
1. DROP:
    - kolom
      
      syarat: 
          - terdapat kolom pengganti:
              - Missing Value ada di kolom jarak (m), kita memiliki kolom jarak (km)
              - Missing Value di Gaji (Rp), kita memiliki Gaji (USD)
          - missing value nya mencapai 70-80%
    - baris
      
      syarat:
          - lihat berapa banyak kolom yang kosong di baris itu
          - pastikan jumlah datanya banyak 
              - data macet: 990, tidak macet: 10 ==> features & target
          - jika ada missing value pada kolom target ==> langsung drop baris
          
2. FILLNA: mengisi data sintetis dengan
    - Harus melihat kolom/features lain
    - Mean ==> data numerik dan tidak ada outliers
    - Median ==> data numerik
    - Modus ==> data kategorikal
    
    
    contoh: jika kolom gaji kosong -> mengisinya harus melihat kolom kota/lokasi dan jabatan

In [None]:
# Exploratory Data Analysis (EDA)
# (Harus menguasai Domain/Business Knowledge)
#
# - Pengecekan Tipe Data
# - Pengecekan Missing Value, lalu di-handle
# - Describe Data
# - Pengecekan Data Outliers, lalu di-handle
# - Pengecekan dan Ekstraksi Data datetime/Tanggal jika diperlukan
#
# Analisa Data
# - Univariate (Kolom Tunggal)
# - Bivariate / Multivariate (Multi Kolom)
# - Ambil Insight: Interpretasi dari Hasil Analisa
#
# Data Visualization
# - Univariate
# - Bivariate / Multivariate (Multi Kolom)
# - Ambil Insight: Interpretasi dari Grafik
#
# Kesimpulan & Saran

In [116]:
# Load the receipts data once, using 'receipt_id' as the index.
# (index_col=0 selects the same column by position; reading the file a second
# time only to overwrite df was redundant.)
df = pd.read_csv('../data/sample_data.csv', index_col='receipt_id')
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,7/22/2018 21:19,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,7/15/2018 16:17,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,7/15/2018 12:12,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,7/24/2018 8:27,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,7/26/2018 11:28,Rice,Rice,supermarket,124500.0,0,1


#### Pengecekan Tipe Data

In [117]:
# Alt 1
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72000 entries, 9622257 to 5808147
Data columns (total 8 columns):
receipts_item_id    72000 non-null int64
purchase_time       72000 non-null object
category            72000 non-null object
sub_category        72000 non-null object
format              72000 non-null object
unit_price          72000 non-null float64
discount            72000 non-null int64
quantity            72000 non-null int64
dtypes: float64(1), int64(3), object(4)
memory usage: 4.9+ MB


#### Cek Missing Value

In [118]:
# Alt 1
df.isnull().sum()

receipts_item_id    0
purchase_time       0
category            0
sub_category        0
format              0
unit_price          0
discount            0
quantity            0
dtype: int64

In [119]:
# Alt 2
df.isna().sum()

receipts_item_id    0
purchase_time       0
category            0
sub_category        0
format              0
unit_price          0
discount            0
quantity            0
dtype: int64

#### Describe Data

In [120]:
df.describe() # descriptive stats untuk tipe data numerik

Unnamed: 0,receipts_item_id,unit_price,discount,quantity
count,72000.0,72000.0,72000.0,72000.0
mean,24590020.0,24830.776334,852.195139,1.482861
std,7169571.0,24496.419282,4051.061578,1.289382
min,8904708.0,2505.0,-49600.0,1.0
25%,18199820.0,12500.0,0.0,1.0
50%,22365860.0,15472.5,0.0,1.0
75%,31052280.0,25100.0,0.0,2.0
max,38501660.0,219400.0,320000.0,19.0


In [None]:
# count = jumlah data
# mean  = rata-rata
# std   = standar deviasi (sampel)
# min   = nilai minimum
# 25%   = Q1 = Kuartil 1
# 50%   = Q2 = Kuartil 2 = Median
# 75%   = Q3 = Kuartil 3
# max   = nilai maksimum

##### Describe Data Object

In [121]:
# Alt 1
# Describe only the object (string) columns. The builtin `object` is used because the
# `np.object` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
df.describe(include=object)

Unnamed: 0,purchase_time,category,sub_category,format
count,72000,72000,72000,72000
unique,62072,3,3,3
top,10/22/2017 12:00,Fabric Care,Detergent,minimarket
freq,12,36000,36000,46803


In [123]:
# Alt 2
df.describe(include = 'O')

Unnamed: 0,purchase_time,category,sub_category,format
count,72000,72000,72000,72000
unique,62072,3,3,3
top,10/22/2017 12:00,Fabric Care,Detergent,minimarket
freq,12,36000,36000,46803


In [125]:
# Alt 3
df.describe(include = ['object'])

Unnamed: 0,purchase_time,category,sub_category,format
count,72000,72000,72000,72000
unique,62072,3,3,3
top,10/22/2017 12:00,Fabric Care,Detergent,minimarket
freq,12,36000,36000,46803


In [None]:
# count  = jumlah data
# unique = jumlah nilai unik
# top    = data yang paling sering muncul (modus)
# freq   = berapa kali modus muncul

In [127]:
# Alt 4
# Summarise every column (numeric and object) and transpose so each column becomes a row.
# Only the last expression of a cell is displayed, so the untransposed call, whose
# result was computed and thrown away, has been removed.
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
receipts_item_id,72000,,,,24590000.0,7169570.0,8904710.0,18199800.0,22365900.0,31052300.0,38501700.0
purchase_time,72000,62072.0,10/22/2017 12:00,12.0,,,,,,,
category,72000,3.0,Fabric Care,36000.0,,,,,,,
sub_category,72000,3.0,Detergent,36000.0,,,,,,,
format,72000,3.0,minimarket,46803.0,,,,,,,
unit_price,72000,,,,24830.8,24496.4,2505.0,12500.0,15472.5,25100.0,219400.0
discount,72000,,,,852.195,4051.06,-49600.0,0.0,0.0,0.0,320000.0
quantity,72000,,,,1.48286,1.28938,1.0,1.0,1.0,2.0,19.0


#### Pengecekan Outliers
Outliers adalah data di atas Upper Fence atau di bawah Lower Fence

- Upper Fence - Batas Atas = Q3 + (1.5 * IQR)
- Lower Fence - Batas Bawah = Q1 - (1.5 * IQR)

IQR = Q3 - Q1
    
    - Q3 = kuartil 3 = nilai yang membatasi 75% data terbawah (median dari separuh data bagian atas)
    - Q1 = Kuartil 1 = nilai yang membatasi 25% data terbawah (median dari separuh data bagian bawah)

In [128]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
receipts_item_id,72000.0,24590020.0,7169571.0,8904708.0,18199818.75,22365857.0,31052282.25,38501658.0
unit_price,72000.0,24830.78,24496.42,2505.0,12500.0,15472.5,25100.0,219400.0
discount,72000.0,852.1951,4051.062,-49600.0,0.0,0.0,0.0,320000.0
quantity,72000.0,1.482861,1.289382,1.0,1.0,1.0,2.0,19.0


In [129]:
# mencari Outliers untuk Unit Price
df.describe()['unit_price']

count     72000.000000
mean      24830.776334
std       24496.419282
min        2505.000000
25%       12500.000000
50%       15472.500000
75%       25100.000000
max      219400.000000
Name: unit_price, dtype: float64

In [131]:
# Tukey fences for unit_price: values more than 1.5 * IQR beyond the quartiles
# are treated as outliers.
# quantile() returns the same 25% / 75% values that describe() reports, without
# building the full summary table for every numeric column twice.
Q1 = df['unit_price'].quantile(0.25)
Q3 = df['unit_price'].quantile(0.75)
IQR = Q3 - Q1

Upper_Fence = Q3 + (1.5 * IQR)
Lower_Fence = Q1 - (1.5 * IQR)

In [132]:
# Outlier rows: unit_price strictly below the lower fence or strictly above the upper fence.
df[(df['unit_price'] < Lower_Fence) | (df['unit_price'] > Upper_Fence)]

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,7/22/2018 21:19,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,7/15/2018 16:17,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,7/15/2018 12:12,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,7/24/2018 8:27,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,7/26/2018 11:28,Rice,Rice,supermarket,124500.0,0,1
...,...,...,...,...,...,...,...,...
5965798,18136614,12/31/2017 17:50,Sugar/Flavored Syrup,Sugar,hypermarket,55130.0,0,1
5742503,17449270,12/12/2017 15:57,Sugar/Flavored Syrup,Sugar,hypermarket,59900.0,0,2
5648387,17146348,12/06/2017 14:14,Sugar/Flavored Syrup,Sugar,supermarket,48600.0,0,1
5763026,17508220,12/14/2017 17:29,Sugar/Flavored Syrup,Sugar,supermarket,59950.0,0,1


In [145]:
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,7/22/2018 21:19,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,7/15/2018 16:17,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,7/15/2018 12:12,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,7/24/2018 8:27,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,7/26/2018 11:28,Rice,Rice,supermarket,124500.0,0,1


In [134]:
df['category'].unique()

array(['Rice', 'Fabric Care', 'Sugar/Flavored Syrup'], dtype=object)

In [135]:
df['category'].nunique()

3

In [136]:
df['category'].dtypes

dtype('O')

In [137]:
df['category'].isna().sum()

0

In [139]:
round((((df['category'].isna().sum()) / len(df)) * 100), 2)

0.0

In [140]:
df['category'].drop_duplicates().sample(2).values

array(['Rice', 'Sugar/Flavored Syrup'], dtype=object)

In [141]:
df.columns

Index(['receipts_item_id', 'purchase_time', 'category', 'sub_category',
       'format', 'unit_price', 'discount', 'quantity'],
      dtype='object')

![image.png](attachment:image.png)

In [148]:
# Per-column summary rows: name, dtype, null count, null percentage,
# number of distinct values, and up to two example values.
dataDesc = []

for col in df.columns:
    null_count = df[col].isna().sum()          # computed once, reused for the percentage
    unique_values = df[col].drop_duplicates()
    dataDesc.append([
        col,
        df[col].dtypes,
        null_count,
        round((null_count / len(df)) * 100, 2),
        df[col].nunique(),
        # sample(2) raises ValueError when a column has fewer than two distinct
        # values, so cap the sample size; random_state makes the examples repeatable.
        unique_values.sample(min(2, len(unique_values)), random_state=42).values,
    ])

In [149]:
pd.DataFrame(dataDesc, columns=[
    'Data Feature',
    'Data Types',
    'Null',
    'Null Percentage',
    'Unique',
    'Unique Sample'
])

Unnamed: 0,Data Feature,Data Types,Null,Null Percentage,Unique,Unique Sample
0,receipts_item_id,int64,0,0.0,72000,"[32329845, 16070893]"
1,purchase_time,object,0,0.0,62072,"[7/18/2018 7:58, 12/08/2017 19:20]"
2,category,object,0,0.0,3,"[Rice, Fabric Care]"
3,sub_category,object,0,0.0,3,"[Rice, Sugar]"
4,format,object,0,0.0,3,"[hypermarket, minimarket]"
5,unit_price,float64,0,0.0,3884,"[21525.0, 94490.0]"
6,discount,int64,0,0.0,1329,"[15160, 29560]"
7,quantity,int64,0,0.0,19,"[18, 13]"


#### Proses konversi tanggal
Jika kolom tanggal bertipe object, harus dikonversi menjadi datetime

In [None]:
# Alt 1
# Menggunakan `parse_dates=[nama kolom]` saat membaca file di awal

In [151]:
df = pd.read_csv('../data/sample_data.csv', index_col=0, parse_dates=['purchase_time'])
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1


In [152]:
df.dtypes

receipts_item_id             int64
purchase_time       datetime64[ns]
category                    object
sub_category                object
format                      object
unit_price                 float64
discount                     int64
quantity                     int64
dtype: object

In [None]:
# Alt 2
# - menggunakan fungsi `.astype('datetime64[ns]')`
# - astype digunakan untuk mengubah tipe data, tidak terbatas pada tanggal

In [154]:
df = pd.read_csv('../data/sample_data.csv', index_col=0)
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,7/22/2018 21:19,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,7/15/2018 16:17,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,7/15/2018 12:12,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,7/24/2018 8:27,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,7/26/2018 11:28,Rice,Rice,supermarket,124500.0,0,1


In [155]:
df.dtypes

receipts_item_id      int64
purchase_time        object
category             object
sub_category         object
format               object
unit_price          float64
discount              int64
quantity              int64
dtype: object

In [156]:
# Convert the string column to datetime via astype. The unit must be explicit:
# pandas 2.x rejects the unit-less 'datetime64' dtype and asks for e.g. 'datetime64[ns]'.
df['purchase_time'] = df['purchase_time'].astype('datetime64[ns]')
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1


In [158]:
df.dtypes

receipts_item_id             int64
purchase_time       datetime64[ns]
category                    object
sub_category                object
format                      object
unit_price                 float64
discount                     int64
quantity                     int64
dtype: object

In [None]:
# Alt 3
# - menggunakan fungsi `pd.to_datetime(df['nama kolom'])`

In [159]:
df = pd.read_csv('../data/sample_data.csv', index_col=0)
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,7/22/2018 21:19,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,7/15/2018 16:17,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,7/15/2018 12:12,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,7/24/2018 8:27,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,7/26/2018 11:28,Rice,Rice,supermarket,124500.0,0,1


In [160]:
df['purchase_time'] = pd.to_datetime(df['purchase_time'])
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1


####  Proses Ekstraksi Tanggal


In [162]:
purchase_dt = df['purchase_time'].dt

In [163]:
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1


In [164]:
# Extract the month number (1-12) from each purchase timestamp.
purchase_dt.month.head()

receipt_id
9622257    7
9446359    7
9470290    7
9643416    7
9692093    7
Name: purchase_time, dtype: int64

In [165]:
purchase_dt.month_name().head()

receipt_id
9622257    July
9446359    July
9470290    July
9643416    July
9692093    July
Name: purchase_time, dtype: object

In [166]:
purchase_dt.year.head()

receipt_id
9622257    2018
9446359    2018
9470290    2018
9643416    2018
9692093    2018
Name: purchase_time, dtype: int64

In [167]:
df['month'] = purchase_dt.month_name()

In [168]:
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity,month
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1,July
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1,July
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3,July
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1,July
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1,July


In [170]:
df['year'] = purchase_dt.year

In [171]:
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity,month,year
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1,July,2018
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1,July,2018
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3,July,2018
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1,July,2018
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1,July,2018


In [172]:
purchase_dt.day.head() ## extract the day of the month

receipt_id
9622257    22
9446359    15
9470290    15
9643416    24
9692093    26
Name: purchase_time, dtype: int64

In [173]:
purchase_dt.dayofweek.head() ## day of the week as an integer, 0 = Monday

receipt_id
9622257    6
9446359    6
9470290    6
9643416    1
9692093    3
Name: purchase_time, dtype: int64

In [177]:
purchase_dt.hour.head()

receipt_id
9622257    21
9446359    16
9470290    12
9643416     8
9692093    11
Name: purchase_time, dtype: int64

In [178]:
purchase_dt.day_name().head()

receipt_id
9622257      Sunday
9446359      Sunday
9470290      Sunday
9643416     Tuesday
9692093    Thursday
Name: purchase_time, dtype: object

In [179]:
df['day'] = purchase_dt.day_name()

In [180]:
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity,month,year,day
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1,July,2018,Sunday
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1,July,2018,Sunday
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3,July,2018,Sunday
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1,July,2018,Tuesday
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1,July,2018,Thursday


In [181]:
purchase_dt.to_period('D').head()

receipt_id
9622257    2018-07-22
9446359    2018-07-15
9470290    2018-07-15
9643416    2018-07-24
9692093    2018-07-26
Name: purchase_time, dtype: period[D]

In [182]:
purchase_dt.to_period('M').head()

receipt_id
9622257    2018-07
9446359    2018-07
9470290    2018-07
9643416    2018-07
9692093    2018-07
Name: purchase_time, dtype: period[M]

In [183]:
# Map each timestamp to its weekly period, useful for week-by-week comparisons.
purchase_dt.to_period('W').head()

receipt_id
9622257    2018-07-16/2018-07-22
9446359    2018-07-09/2018-07-15
9470290    2018-07-09/2018-07-15
9643416    2018-07-23/2018-07-29
9692093    2018-07-23/2018-07-29
Name: purchase_time, dtype: period[W-SUN]

In [184]:
purchase_dt.to_period('Q').head()

receipt_id
9622257    2018Q3
9446359    2018Q3
9470290    2018Q3
9643416    2018Q3
9692093    2018Q3
Name: purchase_time, dtype: period[Q-DEC]

In [185]:
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity,month,year,day
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1,July,2018,Sunday
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1,July,2018,Sunday
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3,July,2018,Sunday
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1,July,2018,Tuesday
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1,July,2018,Thursday


In [186]:
df['quarter'] = purchase_dt.to_period('Q')
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity,month,year,day,quarter
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1,July,2018,Sunday,2018Q3
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1,July,2018,Sunday,2018Q3
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3,July,2018,Sunday,2018Q3
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1,July,2018,Tuesday,2018Q3
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1,July,2018,Thursday,2018Q3


In [188]:
df.sample(5)

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity,month,year,day,quarter
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6264359,19014468,2018-01-20 18:26:00,Sugar/Flavored Syrup,Sugar,supermarket,5180.0,0,1,January,2018,Saturday,2018Q1
10817498,36108057,2018-09-08 12:51:00,Sugar/Flavored Syrup,Sugar,hypermarket,10485.0,0,4,September,2018,Saturday,2018Q3
7339292,21939518,2018-03-25 21:08:00,Fabric Care,Detergent,minimarket,18800.0,0,2,March,2018,Sunday,2018Q1
10214415,34169831,2018-08-16 00:07:00,Sugar/Flavored Syrup,Sugar,minimarket,12500.0,0,1,August,2018,Thursday,2018Q3
9478664,31957140,2018-07-15 19:49:00,Sugar/Flavored Syrup,Sugar,supermarket,11050.0,0,1,July,2018,Sunday,2018Q3


In [189]:
# Line total per row: unit price multiplied by the purchased quantity.
# NOTE(review): the 'discount' column is not applied here — confirm whether
# 'Total' is meant to be net of discount.
df['Total'] = df['unit_price'].mul(df['quantity'])
df.head()

Unnamed: 0_level_0,receipts_item_id,purchase_time,category,sub_category,format,unit_price,discount,quantity,month,year,day,quarter,Total
receipt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9622257,32369294,2018-07-22 21:19:00,Rice,Rice,supermarket,128000.0,0,1,July,2018,Sunday,2018Q3,128000.0
9446359,31885876,2018-07-15 16:17:00,Rice,Rice,minimarket,102750.0,0,1,July,2018,Sunday,2018Q3,102750.0
9470290,31930241,2018-07-15 12:12:00,Rice,Rice,supermarket,64000.0,0,3,July,2018,Sunday,2018Q3,192000.0
9643416,32418582,2018-07-24 08:27:00,Rice,Rice,minimarket,65000.0,0,1,July,2018,Tuesday,2018Q3,65000.0
9692093,32561236,2018-07-26 11:28:00,Rice,Rice,supermarket,124500.0,0,1,July,2018,Thursday,2018Q3,124500.0


## Belajar Numpy

In [8]:
# List Basic
a = [1,2,3,4,5]
a

[1, 2, 3, 4, 5]

In [10]:
### Array
b = np.array(a)
b

array([1, 2, 3, 4, 5])

In [11]:
type(b)

numpy.ndarray

`ndarray` = *n-dimensional array*:
- n = jumlah (banyaknya dimensi)
- d = dimensi
- array = larik

In [14]:
## Lists are NOT elementwise: `+` concatenates, so this appends 1 as a new item
a + [1]

[1, 2, 3, 4, 5, 1]

In [15]:
b + 1 ## Elementwise => the operation is applied to every element of the array

array([2, 3, 4, 5, 6])

In [16]:
# list * int repeats the whole list; it does not double each element
a * 2

[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]

In [17]:
# array * int multiplies every element by 2 (elementwise)
b * 2

array([ 2,  4,  6,  8, 10])

### Range dari Python

In [22]:
# Python's built-in range() accepts only integers, so a float step raises
# TypeError. The error is the point of this cell, so catch it and print the
# message instead of letting it abort "Restart Kernel -> Run All".
try:
    range(5, 10, 0.5)
    range_error = None
except TypeError as exc:
    range_error = str(exc)
    print(f"TypeError: {range_error}")

TypeError: 'float' object cannot be interpreted as an integer

### Range dari Numpy

In [20]:

np.arange(5,10, 0.5) ## NumPy's counterpart of range() ==> the STEP may be a FLOAT

array([5. , 5.5, 6. , 6.5, 7. , 7.5, 8. , 8.5, 9. , 9.5])

In [23]:
### Linspace => Linear space

In [24]:
# 4 evenly spaced values from 1 to 10, with both endpoints included
np.linspace(1, 10, 4)

array([ 1.,  4.,  7., 10.])

In [25]:
### Array Multi dimensi => Matrix

In [26]:
# Nested list: three rows of three consecutive integers (1..9), i.e. a 3x3 matrix.
a = [list(range(first, first + 3)) for first in (1, 4, 7)]

In [29]:
a

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [32]:
# Converting the nested list yields a 2-D array (a matrix)
aNum = np.array(a)
aNum

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [33]:
b

array([1, 2, 3, 4, 5])

In [35]:
# ndim = number of dimensions (axes); the flat array b has 1
b.ndim

1

In [36]:
aNum.ndim

2

In [37]:
b.size

5

In [38]:
# size = total number of elements (3 rows x 3 columns = 9)
aNum.size

9

In [42]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [41]:
# Passing a shape tuple gives a 2-D array: 5 rows x 5 columns of 0.0
np.zeros((5,5))

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [44]:
np.ones(5)

array([1., 1., 1., 1., 1.])

In [45]:
np.ones((5,5))

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [46]:
np.identity(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [47]:
# np.eye(5) produces the same 5x5 identity matrix as np.identity(5) here
np.eye(5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [48]:
# Two plain Python lists of five consecutive integers each (1..5 and 6..10),
# used to contrast list arithmetic with NumPy array arithmetic.
a = list(range(1, 6))
b = list(range(6, 11))

In [49]:
aNum = np.array(a)
bNum = np.array(b)

In [50]:
# list + list concatenates the two lists
a + b

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [51]:
# array + array adds the matching elements (elementwise)
aNum + bNum

array([ 7,  9, 11, 13, 15])

In [52]:
# Plain lists do not support elementwise multiplication: list * list raises
# TypeError. Catch and print it so the demonstration stays visible without
# stopping a top-to-bottom run of the notebook.
try:
    a * b
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: can't multiply sequence by non-int of type 'list'

In [53]:
aNum * bNum

array([ 6, 14, 24, 36, 50])

In [54]:
# Plain lists do not support subtraction: list - list raises TypeError.
# Catch and print it so the demonstration stays visible without stopping a
# top-to-bottom run of the notebook.
try:
    a - b
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [55]:
aNum - bNum

array([-5, -5, -5, -5, -5])

In [56]:
# Squares of the integers 0..9, computed elementwise.
aNum = np.square(np.arange(10))

In [57]:
aNum

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81], dtype=int32)

In [58]:
aNum[2]

4

In [59]:
# Negative indices count from the end: -1 is the last element
aNum[-1]

81

In [60]:
aNum[4]

16

In [61]:
# Slice from index 1 up to, but not including, index 4
aNum[1:4]

array([1, 4, 9], dtype=int32)

In [62]:
aNum[:5]

array([ 0,  1,  4,  9, 16], dtype=int32)

In [63]:
aNum[5:]

array([25, 36, 49, 64, 81], dtype=int32)

In [64]:
a = [
    [1,2,3],
    [4,5,6],
    [7,8,9]
]

In [65]:
aMatrix = np.array(a)

In [66]:
# Select row 0, column 2 in a single indexing step. For NumPy arrays the tuple
# form is preferred over chained aMatrix[0][2], which first builds a temporary
# row and then indexes into it.
aMatrix[0, 2]

3

In [67]:
A = [
    [2, 3, 4],
    [4, 5, 6]
]

In [68]:
AMt = np.array(A)

In [71]:
AMt.shape

(2, 3)

In [73]:
AMt

array([[2, 3, 4],
       [4, 5, 6]])

In [76]:
AMt.transpose()

array([[2, 4],
       [3, 5],
       [4, 6]])

In [77]:
np.transpose(AMt)

array([[2, 4],
       [3, 5],
       [4, 6]])

In [78]:
# .T is an attribute shorthand that gives the same result as AMt.transpose()
AMt.T

array([[2, 4],
       [3, 5],
       [4, 6]])

In [79]:
AMt.reshape(6,1)

array([[2],
       [3],
       [4],
       [4],
       [5],
       [6]])

In [80]:
AMt.reshape(1,6)

array([[2, 3, 4, 4, 5, 6]])

In [81]:
# reshape keeps the elements in their row-by-row order (2, 3, 4, 4, 5, 6) and only
# changes where the rows break, so the (3, 2) result differs from the transpose.
AMt.reshape(3, 2)

array([[2, 3],
       [4, 4],
       [5, 6]])

In [82]:
#### Statistical operations

In [84]:
# Sample data: 10 integers used by the statistics examples that follow
B = np.array([2, 50, 26, 23, 10, 3, 1, 100, 10, 8])

In [85]:
np.mean(B)

23.3

In [86]:
np.median(B)

10.0

In [87]:
np.percentile(B, 25) # Q1 => Quartile 1 = 25%

4.25

In [88]:
np.percentile(B, 75) # Q3 => Quartile 3 = 75%

25.25

In [89]:
np.max(B)

100

In [90]:
np.min(B)

1

In [91]:
np.sort(B)

array([  1,   2,   3,   8,  10,  10,  23,  26,  50, 100])

In [92]:
# Index (position) of the largest value: 100 sits at index 7
np.argmax(B)

7

In [93]:
# Index (position) of the smallest value: 1 sits at index 6
np.argmin(B)

6

In [94]:
##### Generating random numbers

In [95]:
np.random.rand(10) ## 10 uniformly distributed floats in the range 0-1 (1 itself excluded)

array([0.75331069, 0.37679627, 0.69229612, 0.60588473, 0.06495015,
       0.61421043, 0.05658903, 0.25120701, 0.17142313, 0.67376319])

In [97]:
np.random.randn(10) ## 10 samples from the standard normal distribution (mean 0, std 1, bell shaped); most fall between -3 and 3, but values are not bounded to that range

array([ 1.17957923,  0.17343628, -0.74246768,  0.05220565,  1.17046878,
       -1.51488645,  0.58247411, -0.24530143,  0.75923687, -0.02801411])

In [99]:
# 10 random integers drawn from 1 (inclusive) up to 10 (exclusive)
np.random.randint(1, 10, 10)

array([1, 1, 5, 3, 6, 3, 5, 2, 5, 3])

In [100]:
import pandas as pd

`pd` = alias (nama singkat) untuk library pandas, bukan singkatan dari "pandas dataframe"

In [101]:
##### Series

In [102]:
x = [9, 10, 8, 15, 3, 17]

In [104]:
x1 = np.array(x)
x1

array([ 9, 10,  8, 15,  3, 17])

In [105]:
# A Series built from a plain list gets a default integer index (0..5)
y = pd.Series(x)
y

0     9
1    10
2     8
3    15
4     3
5    17
dtype: int64

In [106]:
ind = ['a', 'b', 'c', 'd', 'e', 'f']

In [108]:
# Series with explicit index labels ('a'..'f') and a name; the values come from the NumPy array x1
pd.Series(data=x1, index=ind, name='Test')

a     9
b    10
c     8
d    15
e     3
f    17
Name: Test, dtype: int32

In [109]:
# Mapping of label -> value, used to build a Series whose index comes from the keys.
data = dict(a=50, b=75, c=85)

In [110]:
# Building a Series from a dict: the keys become the index, the values the data
pd.Series(data)

a    50
b    75
c    85
dtype: int64

In [111]:
#### DataFrame

In [112]:
x = [20, 50, 35, 13]
x1 = np.array(x)

In [113]:
# Build a single-column DataFrame from the 1-D array; the column is labelled 'Usia'
df = pd.DataFrame(x1, columns=['Usia'])
df

Unnamed: 0,Usia
0,20
1,50
2,35
3,13


In [114]:
data =[
    ["Rosi", 25, "Jakarta"],
    ["Selvy", 24, "Bandung"],
    ["Rita", 27, "Malang"],
    ["Sony", 26, "Bandung"]
]

In [115]:
df = pd.DataFrame(data, columns=['nama', 'usia', 'kota'])

In [116]:
df

Unnamed: 0,nama,usia,kota
0,Rosi,25,Jakarta
1,Selvy,24,Bandung
2,Rita,27,Malang
3,Sony,26,Bandung


In [117]:
# Load the sales data; the path is relative, so 'sales.csv' must sit in the working directory.
# NOTE: 'Date of Purchase' is read as plain strings (object dtype, see df.info() further down);
# convert it with pd.to_datetime before doing any date-based extraction.
df = pd.read_csv('sales.csv')
df

Unnamed: 0,Country,Region,Sales Person,Date of Purchase,Total,Quantity
0,India,East,John,3/1/2013 0:00:00,26000000,567
1,China,North,Bill,12/29/2016 0:00:00,100000,3000
2,UK,East,Thomas,1/13/2014 0:00:00,120000,345
3,Nepal,East,John,4/2/2010 0:00:00,200000,1000
4,Africa,East,Bill,10/16/2015 0:00:00,220000,123
5,Bhuthan,North,Thomas,4/1/2013 0:00:00,240000,1000
6,Mylasia,North,John,1/17/2016 0:00:00,260000,7890
7,India,North,Bill,7/11/2017 0:00:00,220000,200
8,Nepal,East,Thomas,7/7/2014 0:00:00,240000,1000
9,Korea,North,John,8/15/2013 0:00:00,260000,1000


In [118]:
df.head()

Unnamed: 0,Country,Region,Sales Person,Date of Purchase,Total,Quantity
0,India,East,John,3/1/2013 0:00:00,26000000,567
1,China,North,Bill,12/29/2016 0:00:00,100000,3000
2,UK,East,Thomas,1/13/2014 0:00:00,120000,345
3,Nepal,East,John,4/2/2010 0:00:00,200000,1000
4,Africa,East,Bill,10/16/2015 0:00:00,220000,123


In [119]:
df.tail()

Unnamed: 0,Country,Region,Sales Person,Date of Purchase,Total,Quantity
15,Toranto,West,John,9/13/2018 0:00:00,240000,1000
16,UK,West,Bill,8/5/2015 0:00:00,260000,123
17,Quatar,West,Thomas,6/30/2013 0:00:00,260000,1000
18,Africa,North,John,11/30/2015 0:00:00,140000,7890
19,Nepal,North,Bill,12/2/2016 0:00:00,150000,200


In [121]:
df.isna().sum()
## Count the missing values in each column

Country             0
Region              0
Sales Person        0
Date of Purchase    0
Total               0
Quantity            0
dtype: int64

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Country           20 non-null     object
 1   Region            20 non-null     object
 2   Sales Person      20 non-null     object
 3   Date of Purchase  20 non-null     object
 4   Total             20 non-null     int64 
 5   Quantity          20 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 1.1+ KB


In [123]:
# Summary statistics of the numeric columns, transposed so that each column becomes a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Total,20.0,1489000.0,5769521.0,100000.0,150000.0,220000.0,245000.0,26000000.0
Quantity,20.0,1479.3,2279.447,123.0,345.0,946.0,1000.0,7890.0


In [124]:
# include='O' summarises the object (string) columns: count, unique, top (most frequent value), freq
df.describe(include='O')

Unnamed: 0,Country,Region,Sales Person,Date of Purchase
count,20,20,20,20
unique,11,3,3,20
top,Nepal,East,Bill,12/2/2016 0:00:00
freq,4,9,7,1


In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Country           20 non-null     object
 1   Region            20 non-null     object
 2   Sales Person      20 non-null     object
 3   Date of Purchase  20 non-null     object
 4   Total             20 non-null     int64 
 5   Quantity          20 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 1.1+ KB


In [127]:
df.dtypes

Country             object
Region              object
Sales Person        object
Date of Purchase    object
Total                int64
Quantity             int64
dtype: object

In [128]:
df.isna().sum()

Country             0
Region              0
Sales Person        0
Date of Purchase    0
Total               0
Quantity            0
dtype: int64

In [None]:
Exploratory Data Analysis --- EDA
(Harus menguasai Domain/Business Knowledge)

- Pengecekan tipe data
- Pengecekan missing value, lalu penanganannya (handling)
- Deskripsi data (describe)
- Pengecekan data outlier, lalu penanganannya (handling)
- Pengecekan dan ekstraksi data datetime/tanggal jika diperlukan

### Analisa Data
- Univariate (Kolom Tunggal)
- BiVariate / Multivariate (Multi Kolom)
 Ambil Insight - Interpretasi dari Hasil Analisa 
    
Data Visualization
- Univariate
- Bivariate/Multivariate (Multikolom)
Ambil Insight - Interpretasi dari Grafik

-- Kesimpulan & Saran

##### Machine Learning
