#### Install Pandas

In [None]:
%pip install pandas

#### Memuat file `config.yaml`

In [None]:
import yaml

# Read the notebook settings from config.yaml; the 'dataset' entry is used
# later as the CSV path. The encoding is pinned to UTF-8 because open()
# otherwise falls back to the platform's default encoding, which can garble
# non-ASCII paths or values on some systems.
with open('./config.yaml', 'r', encoding='utf-8') as file:
    # safe_load only constructs plain Python objects (dicts, lists, scalars).
    config = yaml.safe_load(file)

#### Mengimpor pandas

In [None]:
import pandas as pd

#### Memuat dan menampilkan dataset

In [None]:
# Load the CSV whose path is stored under the 'dataset' key of the config.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type column inference at the cost of more memory.
dataframe = pd.read_csv(config['dataset'], low_memory=False)
# Last expression of the cell: preview the first rows.
dataframe.head()

#### Menampilkan dimensi dataset


In [None]:
dataframe.shape

#### Menampilkan daftar nama kolom

In [None]:
dataframe.columns

#### Ringkasan data

In [None]:
dataframe.describe()

#### Menampilkan info dataset

In [None]:
dataframe.info()

#### Membersihkan dataset yang null dan menyimpan pada `df_clean_rows`

In [None]:
# (column, placeholder) pairs whose rows are discarded, processed in order.
# These string placeholders are not real NaN values, so dropna() alone would
# not remove them.
placeholder_pairs = [
    ('hosp_yn', 'Missing'),
    ('icu_yn', 'Missing'),
    ('death_yn', 'Missing'),
    ('medcond_yn', 'Missing'),
    ('Race and ethnicity (combined)', 'Missing'),
    ('age_group', 'Unknown'),
    ('Race and ethnicity (combined)', 'Unknown'),
]

for column_name, placeholder in placeholder_pairs:
    # Drop the matching rows from `dataframe` itself (in place).
    flagged_index = dataframe.loc[dataframe[column_name] == placeholder].index
    dataframe.drop(flagged_index, inplace=True)

# Finally remove rows that still contain real missing values.
df_clean_rows = dataframe.dropna()

#### Menampilkan dataset yang bersih

In [None]:
df_clean_rows

#### Install matplotlib

In [None]:
%pip install matplotlib

#### Mengimpor matplotlib

In [None]:
import matplotlib.pyplot as plt

#### Install seaborn

In [None]:
%pip install seaborn

#### Mengimpor seaborn

In [None]:
import seaborn as sns

#### Menampilkan data kolom `age_group`

In [None]:
# Wide, short figure so every age bucket gets its own readable bar.
age_column = 'age_group'
plt.figure(figsize=(30, 4))
# One bar per distinct value of the column, with height equal to its row count.
sns.countplot(data=df_clean_rows, x=age_column)
plt.show()

#### Menampilkan data kolom `Race and ethnicity (combined)`

In [None]:
# Wide, short figure so the long category labels do not overlap.
race_column = 'Race and ethnicity (combined)'
plt.figure(figsize=(30, 4))
# One bar per distinct value of the column, with height equal to its row count.
sns.countplot(data=df_clean_rows, x=race_column)
plt.show()

#### Install sklearn

In [None]:
%pip install sklearn

#### Mengimpor sklearn

In [None]:
from sklearn.preprocessing import LabelEncoder

#### Meng-encode label string ke integer

In [None]:
# String-valued columns that are converted to integer codes.
columns_to_encode = [
    'cdc_report_dt', 'pos_spec_dt', 'onset_dt', 'current_status', 'sex',
    'age_group', 'Race and ethnicity (combined)', 'hosp_yn', 'icu_yn',
    'medcond_yn', 'death_yn'
]

# Work on an explicit copy: df_clean_rows comes from dropna() on another
# frame, and writing into it directly can raise SettingWithCopyWarning.
df_clean_rows = df_clean_rows.copy()

# One fitted encoder per column, so the integer codes can later be mapped
# back to the original labels with encoders[column].inverse_transform(...).
encoders = {}

for column in columns_to_encode:
    le = LabelEncoder()
    # Plain column assignment replaces the column, giving it an integer dtype.
    # Assigning through `.loc[:, column]` writes into the existing column in
    # place on pandas 2.x, which can leave it as `object` dtype.
    df_clean_rows[column] = le.fit_transform(df_clean_rows[column])
    encoders[column] = le

#### Menampilkan dataset yang sudah di-encode

In [None]:
df_clean_rows

## Machine learning model

#### Memilih target

In [None]:
y = df_clean_rows['cdc_report_dt']
y

#### Memilih fitur

In [None]:
# Input columns for the model; the target column 'cdc_report_dt' is left out.
features = [
    'pos_spec_dt',
    'onset_dt',
    'sex',
    'age_group',
    'Race and ethnicity (combined)',
    'hosp_yn',
    'icu_yn',
    'medcond_yn',
    'death_yn',
]
X = df_clean_rows[features]
# Last expression of the cell: display the feature matrix.
X

#### Menampilkan fitur

In [None]:
X.describe()

#### Mengimpor `DecisionTreeRegressor`

In [None]:
from sklearn.tree import DecisionTreeRegressor

#### Konfigurasi model

In [None]:
covid_model = DecisionTreeRegressor(random_state=1)

#### Training model

In [None]:
covid_model.fit(X, y)

#### Melakukan prediksi

In [None]:
covid_model.predict(X.head())

In [None]:
y.head()

## Evaluasi Model

#### Mengimpor evaluation metric

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Predict on the same rows the model was fitted on, so the value below is an
# in-sample (training) error, not an estimate of performance on unseen data.
y_hat = covid_model.predict(X)
# Last expression of the cell: mean absolute difference between y and y_hat.
mean_absolute_error(y, y_hat)

#### Training dan testing

In [None]:
from sklearn.model_selection import train_test_split

#### Membagi dataset menjadi dua bagian

In [None]:
# Split features and target into training and test parts. No test_size is
# passed, so scikit-learn's default proportion applies; random_state=1 makes
# the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

#### Konfigurasi dan training model

In [None]:
# Rebind covid_model to a fresh tree (replacing the one fitted on all rows)
# and fit it on the training split only.
covid_model = DecisionTreeRegressor(random_state=1)
covid_model.fit(X_train, y_train)

#### Evaluasi model

In [None]:
# Score the model on the test split, which was not passed to fit().
y_hat = covid_model.predict(X_test)
# Last expression of the cell: mean absolute error on the test split.
mean_absolute_error(y_test, y_hat)

#### Optimasi model

In [None]:
def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
    """Return the test-split MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A new ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``X_train``/``y_train``; its predictions for ``X_test`` are compared
    against ``y_test`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    predictions = tree.fit(X_train, y_train).predict(X_test)
    return mean_absolute_error(y_test, predictions)

In [None]:
# Compare the test-split error of trees with increasingly large leaf budgets.
for max_leaf_nodes in [5, 50, 500, 5000]:
    leaf_mae = get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test)
    # Report two decimals: truncating with int() can make different settings
    # look identical when their errors differ by less than one unit.
    print(f'Max leaf nodes: {max_leaf_nodes} \t Mean Absolute Error: {leaf_mae:.2f}')

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a 100-tree random forest on the training split and score it on the
# test split, for comparison with the single decision tree.
rf_model = RandomForestRegressor(n_estimators=100, random_state=1)
rf_model.fit(X_train, y_train)
y_hat = rf_model.predict(X_test)
# Report two decimals: truncating with int() discards the fractional part of
# the error and can hide small differences between models.
print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_hat):.2f}')