In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import os,sys
from scipy import stats
from sklearn.svm import SVC

dataDrink=pd.read_csv("drinks_new.csv")
dataDrink.head()

Unnamed: 0,Beverage,Calories,Cholesterol,Carbohydrates,Sugars,Protein,Caffeine
0,Brewed Coffee,3,0,5,0,0.3,175
1,Brewed Coffee,4,0,10,0,0.5,260
2,Brewed Coffee,5,0,10,0,1.0,330
3,Brewed Coffee,5,0,10,0,1.0,410
4,Caffè Latte,70,10,75,9,6.0,75


# Data Cleansing & Check

In [2]:
#cek ada data null
dataDrink.isnull().values.any()

False

In [3]:
#cek jumlah data yg null di tiap atribut
dataDrink.isnull().sum()

Beverage         0
Calories         0
Cholesterol      0
Carbohydrates    0
 Sugars          0
 Protein         0
Caffeine         0
dtype: int64

In [4]:
#isi data null dengan nol (0)
dataDrink=dataDrink.fillna(value=0)
dataDrink.head()

Unnamed: 0,Beverage,Calories,Cholesterol,Carbohydrates,Sugars,Protein,Caffeine
0,Brewed Coffee,3,0,5,0,0.3,175
1,Brewed Coffee,4,0,10,0,0.5,260
2,Brewed Coffee,5,0,10,0,1.0,330
3,Brewed Coffee,5,0,10,0,1.0,410
4,Caffè Latte,70,10,75,9,6.0,75


# Menentukan Atribut dan Label

In [5]:
y = dataDrink.iloc[:, 0]
X = dataDrink.drop('Beverage', axis=1)

# Data Transform (Normalisasi RobustScaler)

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Split TRAIN dan TEST

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.2,random_state=123, stratify=y)

# Algoritma SVM Biasa, Tanpa Preproses dan Optimasi (Skor Rendah)

In [41]:
tempSVM = SVC(C=100.0, gamma=0.1, kernel='rbf').fit(X_train,y_train)
print("Accuracy on training set: {:.3f}".format(tempSVM.score(X_train,y_train)))
print("Accuracy on test set : {:.3f}".format(tempSVM.score(X_test, y_test)))
pred_svm = tempSVM.predict(X_test)

#Akurasi,presisi,recall,f1
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_svm))

#Cross Validation 
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
svm_eval = cross_val_score(estimator = tempSVM, X = X, y = y, cv = skf)
print("->Rate of Cross Validation Score [SVM] : {:.2f}".format(svm_eval.mean()))

Accuracy on training set: 0.876
Accuracy on test set : 0.592
                                                     precision    recall  f1-score   support

                          Banana Chocolate Smoothie       1.00      1.00      1.00         1
                                      Brewed Coffee       1.00      1.00      1.00         1
                                    Caffè Americano       1.00      1.00      1.00         1
                                        Caffè Latte       0.33      0.50      0.40         2
                Caffè Mocha (Without Whipped Cream)       0.67      1.00      0.80         2
                                         Cappuccino       0.00      0.00      0.00         2
                                            Caramel       0.00      0.00      0.00         1
                    Caramel (Without Whipped Cream)       0.50      1.00      0.67         2
        Caramel Apple Spice (Without Whipped Cream)       1.00      1.00      1.00         1
        

  'precision', 'predicted', average, warn_for)


# Feature-selection / Cek Atribut Terbaik dan Atribut yg 'perlu' dibuang

In [12]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.metrics import accuracy_score

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123, stratify=y)

# find best scored 3 features
select_feature = SelectKBest(chi2, k='all').fit(X_train, y_train)
print('Score list:', select_feature.scores_)
print('Feature list:', X_train.columns)

X_train = select_feature.transform(X_train)
X_test = select_feature.transform(X_test)
tempSVM = SVC(C=700, gamma= 0.001, kernel= 'linear', verbose=True)   #pakai best parameter dari tuning ya... !!
tempSVM = tempSVM.fit(X_train,y_train)
pred_svm = tempSVM.predict(X_test)
print("\n\n3 atribut terbaik: Carbohydrates, Caffeine, Calories\n\n")


#Akurasi,presisi,recall,f1
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_svm))
#Cross Validation 
#from sklearn.model_selection import KFold, cross_val_score
#from sklearn.model_selection import StratifiedKFold, ShuffleSplit
#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
#ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
#svm_eval = cross_val_score(estimator = tempSVM, X = X, y = y) #, cv = skf)
#print("->Rate of Cross Validation Score [SVM] : {:.2f}".format(svm_eval.mean()))

Score list: [ 6203.40752373  1573.8653638   6972.12161031  1547.6713801    458.45522108
  6779.81069417]
Feature list: Index(['Calories', 'Cholesterol', 'Carbohydrates', ' Sugars', ' Protein',
       'Caffeine'],
      dtype='object')
[LibSVM]

3 atribut terbaik: Carbohydrates, Caffeine, Calories


                                                     precision    recall  f1-score   support

                          Banana Chocolate Smoothie       1.00      1.00      1.00         1
                                      Brewed Coffee       1.00      1.00      1.00         1
                                    Caffè Americano       1.00      1.00      1.00         1
                                        Caffè Latte       0.33      0.50      0.40         2
                Caffè Mocha (Without Whipped Cream)       0.50      0.50      0.50         2
                                         Cappuccino       1.00      0.50      0.67         2
                                            Cara

  'precision', 'predicted', average, warn_for)


# Tuning / Cari Best-parameter SVM - [ WARNING!! NYARINYA LAMA !! ]

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

scaler = RobustScaler().fit(X)
X_scaled = scaler.transform(X)
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.2,random_state=123, stratify=y)

#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
#C_range = 10. ** np.arange(-3, 8)
#gamma_range = 10. ** np.arange(-5, 4)

svm_params = {    'C': [1, 10, 100, 700],
                  'gamma': [0.001], 
                  'kernel':('linear', 'rbf')
             } 

svm_grid = GridSearchCV(SVC(), svm_params, verbose=True, n_jobs=-1) #, cv=skf)
#svm_rbf_grid = GridSearchCV(SVC(), svm_params, verbose=True, n_jobs=-1)
svm_grid.fit(X_scaled, y)

print('Best SVM params:', svm_grid.best_params_)
print('Best SVM cross validation score:', svm_grid.best_score_)

#HASIL SEARCH NYA GINI KALAU LAMA PROSESNYA...
#Robust--> Best SVM params: {'C': 700, 'gamma': 0.001, 'kernel': 'linear'}  Best SVM cross validation score: 0.665289256198
#StandardScaler--> Best SVM params: {'C': 243, 'gamma': 0.001, 'kernel': 'linear'} Best SVM cross validation score: 0.648760330579
#MinMaxScaler--> Best SVM params: {'C': 1500, 'gamma': 0.001, 'kernel': 'linear'} Best SVM cross validation score: 0.640495867769

Fitting 3 folds for each of 8 candidates, totalling 24 fits




Best SVM params: {'C': 700, 'gamma': 0.001, 'kernel': 'linear'}
Best SVM cross validation score: 0.648760330579


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    9.2s finished


# Masukkan Best-parameter dan Lihat Hasil Tuning-nya

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123, stratify=y)

tempSVM = SVC(C=700, gamma=0.001, kernel='linear', verbose=True).fit(X_train,y_train)
print("Accuracy on training set: ", tempSVM.score(X_train,y_train))
print("Accuracy on test set : ", tempSVM.score(X_test, y_test))
pred_svm = tempSVM.predict(X_test)

[LibSVM]Accuracy on training set:  0.943005181347
Accuracy on test set :  0.612244897959


# Akurasi, Presisi, Recall, F1, Support, Cross Validation (Hasil Tuning)

In [298]:
#Akurasi,presisi,recall,f1
from sklearn.metrics import classification_report
pred_svm = tempSVM.predict(X_test)
print(classification_report(y_test, pred_svm))

print("Accuracy Score : " + str(tempSVM.score(X_test, y_test)))
#Cross Validation 
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold, ShuffleSplit
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
#ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
svm_eval = cross_val_score(estimator = tempSVM, X = X, y = y, cv = skf)
print("->Rate of Cross Validation Score [SVM] : ", svm_eval.mean())

  'precision', 'predicted', average, warn_for)


                                                     precision    recall  f1-score   support

                          Banana Chocolate Smoothie       1.00      1.00      1.00         1
                                      Brewed Coffee       1.00      1.00      1.00         1
                                    Caffè Americano       1.00      1.00      1.00         1
                                        Caffè Latte       1.00      0.50      0.67         2
                Caffè Mocha (Without Whipped Cream)       0.50      0.50      0.50         2
                                         Cappuccino       0.67      1.00      0.80         2
                                            Caramel       1.00      1.00      1.00         1
                    Caramel (Without Whipped Cream)       0.67      1.00      0.80         2
        Caramel Apple Spice (Without Whipped Cream)       1.00      1.00      1.00         1
                                  Caramel Macchiato       0.50      0

# Data Extraction (PCA) --> Akurasi-nya RENDAH jadinya TDK dipakai

In [34]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler().fit(X)
#X_scaled = scaler.transform(X)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123, stratify=y)

temp_pca = PCA(0.9).fit(X)
print('We need %d components to explain 90%% of variance' % temp_pca.n_components_)

pca = PCA(n_components=2, whiten=True)
pca.fit(X_train)
X_train = pca.transform(X_train)

pca = PCA(n_components=2, whiten=True)
pca.fit(X_test)
X_test = pca.transform(X_test)

tempSVM = SVC(C=700, gamma=0.001, kernel='linear', verbose=True).fit(X_train,y_train)
print("Accuracy on training set: ", tempSVM.score(X_train,y_train))
print("Accuracy on test set : ", tempSVM.score(X_test, y_test))
print("\nAccuracy with PCA-SVM is NOT good!!! ")

We need 2 components to explain 90% of variance
[LibSVM]Accuracy on training set:  0.476683937824
Accuracy on test set :  0.265306122449

Accuracy with PCA-SVM is NOT good!!! 


In [19]:
################################################################################################################################

In [20]:
################################################################################################################################

In [21]:
################################################################################################################################

# HASIL AKHIR EXPERIMEN TADI \\  PIPE LEBIH SIMPLE DARIPADA MANUAL!

# BUAT PIPELINE (BEST PARAM ROBUSTSCALER C=700, gamma=0.001, kernel='linear', verbose=True)

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123, stratify=y)
newPipe = make_pipeline(StandardScaler(), SVC(C=243, gamma=0.001, kernel='linear', verbose=True))
newPipe.fit(X_train,y_train)
pred_svm = newPipe.predict(X_test)

#Akurasi,presisi,recall,f1
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_svm))
#Cross Validation 
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import StratifiedKFold, ShuffleSplit
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
#ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
svm_eval = cross_val_score(estimator = newPipe, X = X, y = y, cv = skf)
print("Accuracy Score : " + str(newPipe.score(X_test, y_test)))
print("->Rate of Cross Validation Score [SVM] : ", svm_eval.mean())

[LibSVM]                                                     precision    recall  f1-score   support

                          Banana Chocolate Smoothie       1.00      1.00      1.00         1
                                      Brewed Coffee       1.00      1.00      1.00         1
                                    Caffè Americano       1.00      1.00      1.00         1
                                        Caffè Latte       0.33      0.50      0.40         2
                Caffè Mocha (Without Whipped Cream)       0.67      1.00      0.80         2
                                         Cappuccino       1.00      0.50      0.67         2
                                            Caramel       0.00      0.00      0.00         1
                    Caramel (Without Whipped Cream)       0.67      1.00      0.80         2
        Caramel Apple Spice (Without Whipped Cream)       1.00      1.00      1.00         1
                                  Caramel Macchiato       1.0

  'precision', 'predicted', average, warn_for)


[LibSVM][LibSVM]Accuracy Score : 0.714285714286
->Rate of Cross Validation Score [SVM] :  0.806505538541


# DUMP MODEL

In [9]:
tempData = [newPipe,dataDrink]

from sklearn.externals import joblib
joblib.dump(tempData,'drinks_pycham.pkl')

['drinks_pycham.pkl']

# LIHAT DUMP

In [11]:
newData = joblib.load('drinks_pycham.pkl')
print(newData)

[Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=243, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True))]),                                          Beverage  Calories  Cholesterol  \
0                                   Brewed Coffee         3            0   
1                                   Brewed Coffee         4            0   
2                                   Brewed Coffee         5            0   
3                                   Brewed Coffee         5            0   
4                                     Caffè Latte        70           10   
5                                     Caffè Latte       100           10   
6                                     Caffè Latte        70            6   
7                                     Caf