In [1]:
import pandas as pd

# Load the avocado data set; the 'Date' column is parsed as datetimes and
# used as the row index.
df = pd.read_csv(
    'avocado.csv',
    parse_dates=['Date'],
    index_col='Date',
)

In [2]:
# Inspect the target variable: number of rows per class label.
obj_col = 'type'
df.loc[:, obj_col].value_counts()

conventional    9126
organic         9123
Name: type, dtype: int64

In [3]:
# Encode the target labels as integers: 'conventional' -> 0, 'organic' -> 1.
label_codes = {'conventional': 0, 'organic': 1}

# Replace the whole column instead of writing through `df.loc[:, obj_col]`:
# on recent pandas versions a `.loc[:, col] = ...` assignment writes into the
# existing (object) column in place, so the integer cast would be lost.
# Values that are not keys of `label_codes` (e.g. already-encoded 0/1 after a
# re-run of this cell) are passed through unchanged before the cast.
df[obj_col] = df[obj_col].map(lambda label: label_codes.get(label, label)).astype(int)

In [4]:
# Explanatory (feature) columns used as model inputs.
exp_col = [
    'AveragePrice',
    'Total Volume',
    '4046', '4225', '4770',
    'Total Bags', 'Small Bags', 'Large Bags',
]

In [5]:
# Show the feature columns as a NumPy array.
df.loc[:, exp_col].values

array([[  1.33000000e+00,   6.42366200e+04,   1.03674000e+03, ...,
          8.69687000e+03,   8.60362000e+03,   9.32500000e+01],
       [  1.35000000e+00,   5.48769800e+04,   6.74280000e+02, ...,
          9.50556000e+03,   9.40807000e+03,   9.74900000e+01],
       [  9.30000000e-01,   1.18220220e+05,   7.94700000e+02, ...,
          8.14535000e+03,   8.04221000e+03,   1.03140000e+02],
       ..., 
       [  1.87000000e+00,   1.37667600e+04,   1.19192000e+03, ...,
          9.39411000e+03,   9.35180000e+03,   4.23100000e+01],
       [  1.93000000e+00,   1.62052200e+04,   1.52763000e+03, ...,
          1.09695400e+04,   1.09195400e+04,   5.00000000e+01],
       [  1.62000000e+00,   1.74895800e+04,   2.89477000e+03, ...,
          1.20141500e+04,   1.19881400e+04,   2.60100000e+01]])

In [6]:
# Show the encoded target column as a NumPy array.
df.loc[:, obj_col].values

array([0, 0, 0, ..., 1, 1, 1])

In [7]:
# Time-based hold-out split: rows dated up to the end of 2017 form the
# training set, rows dated 2018 or later form the test set.
# Boolean masks on the index year are used instead of label slicing
# (`df.loc[:'2017']`): label slicing on a DatetimeIndex is only reliable when
# the index is sorted, and nothing in this notebook sorts `df`.
index_year = df.index.year
traindf = df.loc[index_year <= 2017, :]
testdf  = df.loc[index_year >= 2018, :]

In [8]:
# traindf / testdf are already restricted to their date ranges, so only the
# columns are selected here (no second date slice on the index).
# The target is cast to int so the label arrays are numeric regardless of the
# dtype the column is stored with.
X_train = traindf[exp_col].values
y_train = traindf[obj_col].astype(int).values

X_test  = testdf[exp_col].values
y_test  = testdf[obj_col].astype(int).values

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training data only;
# the same fitted scaler is then applied to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Hyper-parameter search for logistic regression using 5-fold cross validation
# (cv=5 below; no group information is passed to the search).
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# NOTE(review): SVC and GroupKFold are not used in this cell; the imports are
# kept in case other cells rely on them.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

# Pipeline: standardization followed by the classifier.
# solver='liblinear' is set explicitly because the grid below tries both the
# 'l1' and 'l2' penalties, and the default solver of current scikit-learn
# releases does not support 'l1'.
pipe = Pipeline([('scaler',StandardScaler()),("LR",LogisticRegression(random_state=0, solver='liblinear'))])

# Candidate values; C is varied by powers of ten.
params = {'LR__C':[10000,100000,1000000,10000000],'LR__penalty':['l1', 'l2'],'LR__class_weight':[None,'balanced']}

# Grid search. `grid.fit` fits the pipeline itself, so no separate
# `pipe.fit` call is needed beforehand.
grid = GridSearchCV(pipe, param_grid = params, cv = 5, scoring='accuracy',n_jobs=10)
grid.fit(X_train,y_train)

print('Best cross-validation accuracy: {:.2f}'.format(grid.best_score_))
print('Train set score: {:.2f}'.format(grid.score(X_train,y_train)))
print('Best parameters : {}'.format(grid.best_params_))

In [12]:
from sklearn.linear_model import LogisticRegression

# Final classifier: L2-penalized logistic regression with balanced class
# weights and a fixed random seed.
clf = LogisticRegression(
    C=100000,
    penalty='l2',
    class_weight='balanced',
    random_state=0,
)

# Fit on the training set, then predict labels for the test set.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [13]:
from sklearn.metrics import classification_report

# Score of the classifier on the test set, printed with two decimals.
test_accuracy = clf.score(X_test, y_test)
print('平均予測精度 : {:.2f}\n'.format(test_accuracy))

# Per-class report for the test predictions, labelled with the class names.
class_names = ['conventional','organic']
print(classification_report(y_test, y_pred, target_names=class_names))

平均予測精度 : 0.9591049382716049

              precision    recall  f1-score   support

conventional       0.99      0.92      0.96       648
     organic       0.93      0.99      0.96       648

 avg / total       0.96      0.96      0.96      1296

