# Data Analytics Competition Find IT UGM - H1N1 and Seasonal Vaccine

## Tim Oh Data Euy : 
- Gerend Christopher 
- Felix Fernando 
- Jeremy

# Setup

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from IPython.display import display

# Model Library
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
import optuna

from catboost import CatBoostClassifier, Pool, cv

import lightgbm as lgb

import xgboost as xgb

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from category_encoders import OrdinalEncoder as oe

import warnings

# Suppress every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')
warnings.simplefilter(action='ignore')

# Plot defaults: 8x6 figures, seaborn "ticks" style without the top and right
# axes spines, and the tab10 colour palette. rcParams is set before set_theme,
# in the same order as before.
plt.rcParams.update({"figure.figsize": (8, 6)})
custom_params = {"axes.spines.top": False, "axes.spines.right": False}
sns.set_theme(style="ticks", palette='tab10', rc=custom_params)

# Seed NumPy's global random number generator.
np.random.seed(seed=10)
%matplotlib inline

# Data Preparation

### Loading Data

In [3]:
# Read the training features, the training labels and the test features from CSV.
df_features, df_labels, df_test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'new_data/train_features.csv',
        'new_data/train_label.csv',
        'new_data/test_feature.csv',
    )
)

In [10]:
# Put features and labels side by side; join matches rows on the index and
# keeps every row of df_features (left join, the default).
df_train = df_features.join(df_labels, how='left')
df_train

Unnamed: 0,facilities,rating,location,Price
0,RestaurantBARSwimmingPools,7.8 Very GoodFrom 10 reviews,Stokol,"13,500avg/night"
1,intrnetRestaurantgym,5.6 GoodFrom 4 reviews,Machlessvile,"13,000avg/night"
2,restaurantgympoolBar,7.2 Very GoodFrom 38 reviews,Wanderland,"19,000avg/night"
3,BARRestaurant,7.3 Very GoodFrom 6 reviews,Uberlandia,"6,000avg/night"
4,InternetRestaurant,7.2 Very GoodFrom 30 reviews,Stokol,"20,000avg/night"
...,...,...,...,...
3061,barInternet,,Andeman,"31,625avg/night"
3062,restaurantBarInternet,8.1 ExcellentFrom 4 reviews,Uberlandia,"30,500avg/night"
3063,Barrestaurantswimmingpools,6.7 Very GoodFrom 10 reviews,Willsmian,"14,000avg/night"
3064,Restaurant,,Hallerson,"8,500avg/night"


In [12]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066 entries, 0 to 3065
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   facilities  2765 non-null   object
 1   rating      2429 non-null   object
 2   location    3066 non-null   object
 3   Price       3066 non-null   object
dtypes: object(4)
memory usage: 95.9+ KB


### Handling Null

READ THIS: Kalau dua-duanya (facilities dan rating) null, di-drop saja ya. (Ger)

In [23]:
# Show the rows where facilities and rating are both missing.

df_train[df_train[['facilities', 'rating']].isna().all(axis=1)]

Unnamed: 0,facilities,rating,location,Price
16,,,Machlessvile,"3,200avg/night"
44,,,Uberlandia,"17,000avg/night"
58,,,Stokol,"1,800avg/night"
73,,,Stokol,"23,050avg/night"
79,,,Stokol,"1,800avg/night"
...,...,...,...,...
2998,,,Machlessvile,"3,500avg/night"
3015,,,Stokol,"8,000avg/night"
3021,,,Uberlandia,"8,000avg/night"
3030,,,Willsmian,"3,700avg/night"


In [27]:
# Drop the rows where facilities and rating are both missing (how='all').
# The result is reassigned instead of using inplace=True, so the step reads as
# an explicit input -> output transformation and stays chainable.

df_train = df_train.dropna(subset=['facilities', 'rating'], how='all')

In [32]:
# Renumber the rows 0..n-1 after the drop; drop=True discards the old index
# instead of keeping it as a new column.
df_train = df_train.reset_index(drop=True)

READ THIS: Kalau fasilitasnya tidak ada, sulit untuk mengisi nilai NaN-nya, jadi menurut Ger sebaiknya di-drop saja.

In [33]:
# Inspect the remaining rows whose facilities value is missing
# (display only; nothing is dropped in this cell).

df_train.loc[df_train['facilities'].isna()]

Unnamed: 0,facilities,rating,location,Price
28,,6.0 Very GoodFrom 43 reviews,Wanderland,"15,000avg/night"
57,,10.0 ExcellentFrom 1 review,Wanderland,"20,000avg/night"
102,,6.4 Very GoodFrom 1 review,Andeman,"10,000avg/night"
133,,6.0 Very GoodFrom 43 reviews,Andeman,"15,000avg/night"
145,,6.0 Very GoodFrom 43 reviews,Hallerson,"15,000avg/night"
...,...,...,...,...
2706,,6.4 Very GoodFrom 1 review,Stokol,"10,000avg/night"
2728,,7.6 Very GoodFrom 1 review,Stokol,"11,000avg/night"
2746,,8.3 ExcellentFrom 4 reviews,Ubisville,"35,000avg/night"
2767,,6.0 Very GoodFrom 43 reviews,Wanderland,"15,000avg/night"


In [36]:
# Keep only the rows that have a facilities value, then renumber the index from 0.

df_train = (
    df_train
    .loc[df_train['facilities'].notna()]
    .reset_index(drop=True)
)
df_train

Unnamed: 0,facilities,rating,location,Price
0,RestaurantBARSwimmingPools,7.8 Very GoodFrom 10 reviews,Stokol,"13,500avg/night"
1,intrnetRestaurantgym,5.6 GoodFrom 4 reviews,Machlessvile,"13,000avg/night"
2,restaurantgympoolBar,7.2 Very GoodFrom 38 reviews,Wanderland,"19,000avg/night"
3,BARRestaurant,7.3 Very GoodFrom 6 reviews,Uberlandia,"6,000avg/night"
4,InternetRestaurant,7.2 Very GoodFrom 30 reviews,Stokol,"20,000avg/night"
...,...,...,...,...
2760,barInternet,,Andeman,"31,625avg/night"
2761,restaurantBarInternet,8.1 ExcellentFrom 4 reviews,Uberlandia,"30,500avg/night"
2762,Barrestaurantswimmingpools,6.7 Very GoodFrom 10 reviews,Willsmian,"14,000avg/night"
2763,Restaurant,,Hallerson,"8,500avg/night"


READ THIS: Sekarang bagaimana dengan rating? Bisa di-drop, atau bisa diisi, tetapi paling hanya angka rating atau tipenya (seperti Excellent, dll.); jumlah review-nya tidak bisa diisi.

### Feature Engineering

READ THIS: Split rating menjadi angka, tipe (seperti Excellent, Good, dst.), dan jumlah review-nya; hapus "avg/night" di Price; split facilities (hati-hati dengan masalah case sensitivity); lalu menurut Ger sebaiknya kita encoding fasilitas apa saja yang tersedia.

READ THIS: yg price posisinya terakhir aja

In [55]:
# Convert Price from text such as "13,500avg/night" to int64.
# - The unit is removed as a literal suffix. str.rstrip('avg/night') treats its
#   argument as a set of characters to strip, not as a suffix, so it only
#   worked because digits are not in that set.
# - The dtype guard makes the cell safe to re-run: once Price is numeric the
#   .str accessor is no longer available, so the conversion is skipped.

if not pd.api.types.is_numeric_dtype(df_train['Price']):
    df_train['Price'] = (
        df_train['Price']
        .str.replace(r'avg/night$', '', regex=True)  # drop the trailing unit
        .str.replace(',', '', regex=False)           # drop thousands separators
        .astype('int64')
    )
df_train['Price']

0       13500
1       13000
2       19000
3        6000
4       20000
        ...  
2760    31625
2761    30500
2762    14000
2763     8500
2764    19000
Name: Price, Length: 2765, dtype: int64

# EDA

# Modeling

# Submission