In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, recall_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
import pickle

In [2]:
# Load the TV catalogue; the file is semicolon-delimited, hence the explicit sep.
tv = pd.read_csv('tvfix.csv', sep=";")
# Preview the first rows to confirm the columns parsed as expected.
tv.head()

Unnamed: 0,produk,merek,tipe,ukuran,harga
0,Polytron PLD32T7511,Polytron,LED,32,1648000
1,Sharp AQUOS LC-24LE170i,Sharp,LED,24,1091200
2,Sharp AQUOS LC-24LE175i,Sharp,LED,24,400000
3,Sharp AQUOS LC-32SA4100i,Sharp,LED,32,1945000
4,Polytron PLD32T710,Polytron,LED,32,1890000


In [3]:
# Inspect column dtypes before any encoding is applied.
tv.dtypes

produk    object
merek     object
tipe      object
ukuran     int64
harga      int64
dtype: object

In [7]:
# Frequency of each product name; values above 1 indicate repeated listings.
tv["produk"].value_counts()

TCL D2700                      6
Samsung UA55JS7200             1
Panasonic VIERA TH-43C305G     1
Samsung QA75Q9F                1
Sharp AQUOS LC-40LE265M        1
AKARI SC-52V40                 1
Changhong Led29A6500           1
LG 60LF630T                    1
TOSHIBA REGZA 49L3750          1
Changhong 22C2600              1
Sharp AQUOS LC-24LE170i        1
Sharp AQUOS LC-24LE175i        1
LG 42UB700T                    1
LG 43UF640T                    1
TOSHIBA REGZA 24L3650          1
LG 79UF770T                    1
Sharp AQUOS LC-40LE185i        1
Sharp AQUOS LC-58UE1M          1
Coocaa 40D3A                   1
Samsung UA32K4100              1
Panasonic VIERA TH-43F305G     1
Samsung PS43D450A2M            1
LG 49LF630T                    1
LG 32LH500D                    1
Samsung UA32F4000              1
LG OLed65B6T                   1
Panasonic VIERA TH-55EX600G    1
AKARI LE-50D88ID               1
LG 49UH610T                    1
Sharp AQUOS LC-40LE380X        1
          

In [8]:
def _get_category_mapping(column):
    """ Return the mapping of a category """
    return dict([(cat, code) for code, cat in enumerate(column.cat.categories)])

In [9]:
# Replace product names with integer category codes and keep the name -> code lookup.
# Not idempotent: once this cell has run the column holds codes, so running it again
# would rebuild produk_mapping from those integers instead of the original names.
produk_cat = tv['produk'].astype('category')
produk_mapping = _get_category_mapping(produk_cat)
tv['produk'] = produk_cat.cat.codes

In [10]:
# Display the product name -> code lookup built in the previous cell.
produk_mapping

{'AKARI LE-20K88': 0,
 'AKARI LE-20V89': 1,
 'AKARI LE-24K88ID': 2,
 'AKARI LE-25B88': 3,
 'AKARI LE-29P57ID': 4,
 'AKARI LE-29V89': 5,
 'AKARI LE-3289T2': 6,
 'AKARI LE-32AK30': 7,
 'AKARI LE-32D88': 8,
 'AKARI LE-32K88': 9,
 'AKARI LE-32M88': 10,
 'AKARI LE-32P88': 11,
 'AKARI LE-32V90': 12,
 'AKARI LE-32V99SM': 13,
 'AKARI LE-32V99T2': 14,
 'AKARI LE-4099T2': 15,
 'AKARI LE-40D88': 16,
 'AKARI LE-40P88': 17,
 'AKARI LE-4399T2SB': 18,
 'AKARI LE-43D99SBS': 19,
 'AKARI LE-5099T2SB': 20,
 'AKARI LE-50D88ID': 21,
 'AKARI LE-50D99SBS': 22,
 'AKARI LE-55D88S': 23,
 'AKARI LE-65D88': 24,
 'AKARI SC-52V32': 25,
 'AKARI SC-52V40': 26,
 'AKARI SC-52V43': 27,
 'AQUA LE24AQT6500T': 28,
 'AQUA LE24AQT8300': 29,
 'AQUA LE32AQT6500': 30,
 'AQUA LE32AQT7000T': 31,
 'AQUA LE32AQT9000': 32,
 'AQUA LE40AQT8300': 33,
 'Changhong 22C2600': 34,
 'Changhong 32E6000i': 35,
 'Changhong 40E6000HFT': 36,
 'Changhong 50E6000HFT': 37,
 'Changhong L24G3': 38,
 'Changhong L32H4': 39,
 'Changhong L32H7': 40,
 'Cha

In [11]:
# Replace brand names with integer category codes and keep the name -> code lookup.
# Not idempotent: once this cell has run the column holds codes, so running it again
# would rebuild merek_mapping from those integers instead of the original names.
merek_cat = tv['merek'].astype('category')
merek_mapping = _get_category_mapping(merek_cat)
tv['merek'] = merek_cat.cat.codes

In [12]:
# Display the brand name -> code lookup built in the previous cell.
merek_mapping

{'AKARI': 0,
 'AQUA': 1,
 'Changhong': 2,
 'Coocaa': 3,
 'LG': 4,
 'Mito': 5,
 'Panasonic': 6,
 'Philips': 7,
 'Polytron': 8,
 'SANKEN': 9,
 'SANYO': 10,
 'Samsung': 11,
 'Sharp': 12,
 'Sony': 13,
 'TCL': 14,
 'TOSHIBA': 15,
 'Xiaomi': 16}

In [13]:
# Replace panel-type labels with integer category codes and keep the label -> code lookup.
# Not idempotent: once this cell has run the column holds codes, so running it again
# would rebuild tipe_mapping from those integers instead of the original labels.
tipe_cat = tv['tipe'].astype('category')
tipe_mapping = _get_category_mapping(tipe_cat)
tv['tipe'] = tipe_cat.cat.codes

In [14]:
# Display the type label -> code lookup built in the previous cell.
tipe_mapping

{'CRT': 0,
 'LCD': 1,
 'LCD, Smart TV': 2,
 'LED': 3,
 'LED, Smart TV': 4,
 'OLED, Smart TV': 5,
 'Plasma': 6,
 'QLED, Smart TV': 7}

In [15]:
# ukuran is already numeric (int64) but is encoded like the text columns: each distinct
# size is replaced by its category code, and the size -> code lookup is kept.
# Not idempotent: once this cell has run the column holds codes, so running it again
# would rebuild ukuran_mapping from those codes instead of the original sizes.
ukuran_cat = tv['ukuran'].astype('category')
ukuran_mapping = _get_category_mapping(ukuran_cat)
tv['ukuran'] = ukuran_cat.cat.codes

In [16]:
# Display the size -> code lookup built in the previous cell.
ukuran_mapping

{19: 0,
 20: 1,
 21: 2,
 22: 3,
 23: 4,
 24: 5,
 25: 6,
 28: 7,
 29: 8,
 32: 9,
 39: 10,
 40: 11,
 42: 12,
 43: 13,
 45: 14,
 46: 15,
 47: 16,
 48: 17,
 49: 18,
 50: 19,
 55: 20,
 58: 21,
 60: 22,
 65: 23,
 70: 24,
 75: 25,
 79: 26,
 80: 27,
 84: 28,
 86: 29,
 98: 30}

In [17]:
# Confirm the four feature columns now hold integer codes; harga is unchanged.
tv.head()

Unnamed: 0,produk,merek,tipe,ukuran,harga
0,241,8,3,9,1648000
1,338,12,3,5,1091200
2,339,12,3,5,400000
3,346,12,3,9,1945000
4,240,8,3,9,1890000


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [19]:
# Separate the target column (harga) from the encoded feature columns.
y = tv['harga']
x = tv.drop(columns=['harga'])
x.head()

Unnamed: 0,produk,merek,tipe,ukuran
0,241,8,3,9
1,338,12,3,5
2,339,12,3,5
3,346,12,3,9
4,240,8,3,9


In [20]:
# Hold out 20% of the rows as a test split; random_state=0 fixes the shuffle.
# NOTE(review): x_test / y_test are not used by any later cell visible here, so the
# model is never evaluated — confirm whether an evaluation step is missing.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.20, random_state=0)

In [21]:
# Random forest with 100 trees; max_features=4 lets every split consider all four
# feature columns (produk, merek, tipe, ukuran), and random_state=0 pins the randomness.
# NOTE(review): the target `harga` is an int64 column that appears to hold prices, so a
# classifier treats every distinct price as its own class — confirm a regressor
# (e.g. RandomForestRegressor) is not what was intended.
classifier= RandomForestClassifier(n_estimators=100, max_features=4, random_state=0)

In [22]:
# Fit the forest on the training split only.
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [24]:
# Persist the fitted model to model.pkl. The context manager closes the file handle
# (flushing buffered bytes) even if pickling raises, instead of leaving it open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [25]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def _save_variable(variable, filename):
    """Serialize ``variable`` to ``filename`` with ``joblib.dump``.

    Parameters
    ----------
    variable : object
        Any picklable Python object (here: the label -> code dictionaries).
    filename : str
        Destination path; an existing file at that path is overwritten.
    """
    joblib.dump(variable, filename)

In [26]:
# Write each label -> code lookup to its own pickle file.
for _mapping, _path in (
    (produk_mapping, 'produk.pkl'),
    (merek_mapping, 'merek.pkl'),
    (tipe_mapping, 'tipe.pkl'),
    (ukuran_mapping, 'ukuran.pkl'),
):
    _save_variable(_mapping, _path)