## Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [169]:
# read dataset
data = pd.read_csv("../data/raws/ds_crptt.csv")
data.head()

Unnamed: 0,tanggal,jenis,qty,harga,jumlah
0,5/8/2023,CRPTT,900,16500.0,14850000.0
1,5/10/2023,CRPTT,13,16500.0,214500.0
2,5/11/2023,CRPTT,5,16500.0,82500.0
3,5/11/2023,CRPTT,500,17500.0,8750000.0
4,5/22/2023,CRPTT,6,16000.0,96000.0


In [170]:
# membaca informasi data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tanggal  171 non-null    object
 1   jenis    171 non-null    object
 2   qty      171 non-null    object
 3   harga    171 non-null    object
 4   jumlah   171 non-null    object
dtypes: object(5)
memory usage: 6.8+ KB


In [171]:
# copy data
df = data.copy()

In [172]:
# mengubah df type
df['qty'] = df['qty'].str.strip().str.replace(',', '').astype(float)
df['harga'] = df['harga'].str.strip().str.replace(',', '').astype(float)
df['jumlah'] = df['jumlah'].str.strip().str.replace(',', '').astype(float)
df['tanggal'] = pd.to_datetime(df['tanggal'], format='mixed')

In [173]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   tanggal  171 non-null    datetime64[ns]
 1   jenis    171 non-null    object        
 2   qty      171 non-null    float64       
 3   harga    171 non-null    float64       
 4   jumlah   171 non-null    float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 6.8+ KB


In [174]:
# mengecek missing value
df.isnull().sum()

tanggal    0
jenis      0
qty        0
harga      0
jumlah     0
dtype: int64

In [175]:
# mengecek duplicate value
df.duplicated().sum()

0

In [176]:
# group by tanggal
df_group = df.groupby(["tanggal", "jenis"]).agg({"qty": "sum",
                                                 "harga": "mean",
                                                 "jumlah": "sum"}).reset_index()
df_group.head()

Unnamed: 0,tanggal,jenis,qty,harga,jumlah
0,2023-05-08,CRPTT,900.0,16500.0,14850000.0
1,2023-05-10,CRPTT,13.0,16500.0,214500.0
2,2023-05-11,CRPTT,505.0,17000.0,8832500.0
3,2023-05-22,CRPTT,2688.0,16000.0,43008000.0
4,2023-05-23,CRPTT,318.0,16000.0,5088000.0


In [177]:
# create data range 
min_date = df_group['tanggal'].min()
max_date = df_group['tanggal'].max()

# buat range data tanggal
date_range = pd.date_range(start=min_date, end=max_date)
df_tanggal= pd.DataFrame(date_range, columns=['tanggal'])

In [178]:
df_tanggal.head()
df.shape

(171, 5)

In [180]:
# menggabungkan dataset
merged_df = pd.merge(df_tanggal, df_group, on='tanggal', how='left')
merged_df.head()

Unnamed: 0,tanggal,jenis,qty,harga,jumlah
0,2023-05-08,CRPTT,900.0,16500.0,14850000.0
1,2023-05-09,,,,
2,2023-05-10,CRPTT,13.0,16500.0,214500.0
3,2023-05-11,CRPTT,505.0,17000.0,8832500.0
4,2023-05-12,,,,


In [181]:
# mengisi nilai kosong
merged_df["jenis"] = merged_df["jenis"].fillna("CRPTT")
merged_df["qty"].interpolate(inplace=True)
merged_df["jumlah"].interpolate(inplace=True)
merged_df["harga"].interpolate(inplace=True)
merged_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["qty"].interpolate(inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df["jumlah"].interpolate(inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alwa

Unnamed: 0,tanggal,jenis,qty,harga,jumlah
0,2023-05-08,CRPTT,900.0,16500.0,14850000.0
1,2023-05-09,CRPTT,456.5,16500.0,7532250.0
2,2023-05-10,CRPTT,13.0,16500.0,214500.0
3,2023-05-11,CRPTT,505.0,17000.0,8832500.0
4,2023-05-12,CRPTT,703.454545,16909.090909,11939360.0


In [182]:
print(merged_df.isnull().sum())
print(merged_df.duplicated().sum())

tanggal    0
jenis      0
qty        0
harga      0
jumlah     0
dtype: int64
0


In [183]:
# save dataset
merged_df.to_csv("../data/cleans/dataset_crptt.csv", index=False)