In [40]:
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

In [4]:
from sklearn.datasets import load_digits

In [10]:
X = load_digits().data
y = load_digits().target
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [26]:
svc_model = SVC(probability=True)
tree_model = DecisionTreeClassifier()

In [27]:
svc_model.fit(X_train, y_train)
svc_model.score(X_test,y_test)

0.9944444444444445

In [28]:
tree_model.fit(X_train, y_train)
tree_model.score(X_test,y_test)

0.8888888888888888

In [30]:
svc_model.predict_proba(X_test[[0],:])
# tree_model.predict_proba(X_test[[0],:])

array([[0.00149488, 0.00240909, 0.00221351, 0.03243052, 0.00297932,
        0.00644   , 0.00153366, 0.01010625, 0.85912337, 0.0812694 ]])

In [42]:
svc_model.get_params()
## C 0~1.0 10개
## kernel linear, rbf

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': True,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [43]:
model = SVC()
params = {
    'C': np.linspace(0,1,10),
    'kernel': ['rbf','linear'],
}
gcv = GridSearchCV(model, param_grid=params, cv=5)

In [44]:
gcv.fit(X,y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ]),
                         'kernel': ['rbf', 'linear']})

In [45]:
gcv.best_estimator_.score(X,y)

0.996661101836394

In [47]:
dt_model = DecisionTreeClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(2,10),
}
dt_grid = GridSearchCV(dt_model, param_grid=param_grid, cv=5)

In [48]:
dt_grid.fit(X,y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9])})

In [50]:
dt_grid.best_params_

{'criterion': 'entropy', 'max_depth': 9}

In [51]:
dt_grid.best_estimator_.score(X,y)

0.9944351697273233

In [75]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [54]:
data = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}
]

In [56]:
df = pd.DataFrame(data)

In [57]:
df

Unnamed: 0,price,rooms,neighborhood
0,850000,4,Queen Anne
1,700000,3,Fremont
2,650000,3,Wallingford
3,600000,2,Fremont


In [81]:
ohe = OneHotEncoder()
ohe.fit(df[['neighborhood']])
ohe.transform(df[['neighborhood']]).toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [68]:
en_neighbor = LabelEncoder()
en_neighbor.fit(df.neighborhood)
new_neighbor = en_neighbor.transform(df.neighborhood)

In [71]:
df.drop('neighborhood', axis=1, inplace=True)
df['neighborhood'] = new_neighbor
X = df.values
X

array([[850000,      4,      1],
       [700000,      3,      0],
       [650000,      3,      2],
       [600000,      2,      0]])

In [72]:
X.dtype

dtype('int64')

In [61]:
y = np.array([0,1,0,1])

In [62]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
dt = DecisionTreeClassifier()
dt.fit(X,y)

DecisionTreeClassifier()

In [74]:
en_neighbor.transform([['Fremont']])

array([0])

In [83]:
en_neighbor.classes_

array(['Fremont', 'Queen Anne', 'Wallingford'], dtype=object)

In [98]:
sample = ['problem of evil',
          'evil queen',
          'horizon problem']

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [103]:
vec = CountVectorizer()
tfidfvec = TfidfVectorizer()

In [104]:
X_vec = vec.fit_transform(sample)
X_tfidf = tfidfvec.fit_transform(sample)

In [106]:
print(pd.DataFrame(X_vec.toarray(),columns=vec.get_feature_names()))
print(pd.DataFrame(X_tfidf.toarray(),columns=vec.get_feature_names()))

   evil  horizon  of  problem  queen
0     1        0   1        1      0
1     1        0   0        0      1
2     0        1   0        1      0
       evil   horizon        of   problem     queen
0  0.517856  0.000000  0.680919  0.517856  0.000000
1  0.605349  0.000000  0.000000  0.000000  0.795961
2  0.000000  0.795961  0.000000  0.605349  0.000000


In [128]:
X = np.array([[ np.nan, 0,   3  ],
              [ 3,   7,   9  ],
              [ 3,   5,   2  ],
              [ 4,   np.nan, 6  ],
              [ 8,   8,   1  ]])
y = np.array([14, 16, -1,  8, -5])

In [125]:
from sklearn.linear_model import LinearRegression

In [134]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='most_frequent')
X2 = imp.fit_transform(X)
X2

array([[3., 0., 3.],
       [3., 7., 9.],
       [3., 5., 2.],
       [4., 0., 6.],
       [8., 8., 1.]])

In [127]:
X = pd.DataFrame(np.c_[X,y]).dropna().iloc[:,:-1]
y = pd.DataFrame(np.c_[X,y]).dropna().iloc[:,-1]

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 3 and the array at index 1 has size 5