<span style="color:gray">Интеллектуальный анализ данных в R и Python. Семинар 5</span>

### Анализ данных в Python. Scikit-Learn

---

Ниже приведено описание базовых моделей анализа данных с помощью Scikit-Learn.

<img src="scikit.PDF" width="800">

---

<font color='green'>Задание для самостоятельной работы:</font>
    
1. Создать ячейку в данном Jupyter Notebook и повторить операции, приведенные выше.  
1. Загрузить результирующий .ipynb файл в папку /assignments/. Файлы должны именоваться как:<br>
   *'Seminar_5_' + surname*.

## Scikit-learn

In [19]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
knn = neighbors.KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

(150,)

## Loading The Data

In [36]:
import numpy as np
X = np.random.random((5,11))
y = np.array(['M','A','A','A','M','F','M','M','F','F','F'])
X[X < 0.7] = 0
X = X.transpose()
X.shape

(11, 5)

In [37]:
y.shape

(11,)

## Training And Test Data

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Preprocessing The Data

In [39]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)

In [40]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)

In [41]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X)
binary_X = binarizer.transform(X)

In [42]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(y)
y

array([2, 0, 0, 0, 2, 1, 2, 2, 1, 1, 1])

In [43]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=0, strategy='mean')
imp.fit_transform(X_train)

array([[0.85291342, 0.96071515, 0.83175183, 0.83671652, 0.75918052],
       [0.85291342, 0.96071515, 0.83175183, 0.83671652, 0.85109863],
       [0.85291342, 0.96190138, 0.76989323, 0.83671652, 0.85109863],
       [0.85291342, 0.96071515, 0.97286302, 0.83671652, 0.94301673],
       [0.85291342, 0.96071515, 0.83175183, 0.83671652, 0.85109863],
       [0.87589689, 0.95952891, 0.83175183, 0.83671652, 0.85109863],
       [0.82992995, 0.96071515, 0.84033329, 0.83671652, 0.85109863],
       [0.85291342, 0.96071515, 0.74391778, 0.83671652, 0.85109863]])

In [45]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
poly.fit_transform(X)

array([[1.        , 0.82992995, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.96190138, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.78923903, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.2521885 ]])

## Create Your Model

In [46]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)

In [47]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')

In [48]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [49]:
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

In [50]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)

In [51]:
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)

## Model Fitting

In [52]:
lr.fit(X, y)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [53]:
k_means.fit(X_train)
pca_model = pca.fit_transform(X_train)

## Prediction

In [91]:
#y_pred = svc.predict(X_test)

In [86]:
#y_pred = knn.predict_proba(X_test)

dtype('float64')

In [92]:
y_pred = k_means.predict(X_test)
#y_pred.dtype

## Evaluate Your Model’s Performance

In [108]:
knn.score(X_test, y_test)

0.3333333333333333

In [109]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

  score = y_true == y_pred


0.0

In [110]:
from sklearn.metrics import classification_report
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           A       0.00      0.00      0.00         1
           F       0.33      1.00      0.50         1
           M       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.11      0.33      0.17         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, msg_start, len(result))


In [111]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true, y_pred))

[[0 1 0]
 [0 1 0]
 [0 1 0]]


In [112]:
from sklearn.metrics import mean_squared_error
y_true = [3, -0.5, 2]
#y_pred = k_means.predict(X_test)
#y_pred.dtype
#y_test = y_test.astype(int32');
mean_squared_error(y_test, y_pred)

  y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)


TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U1') dtype('<U1') dtype('<U1')

In [113]:
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred)

0.0

In [114]:
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred)

0.0

In [115]:
from sklearn.metrics import v_measure_score
v_measure_score(y_true, y_pred)

0.0

In [130]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=3))
print(cross_val_score(lr, X, y, cv=2))

[0.33333333 0.33333333 0.5       ]
[-1.01907926 -1.24558842]




## Tune Your Model

In [147]:
from sklearn.model_selection import GridSearchCV
params = {"n_neighbors": np.arange(1,3),
"metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn,
param_grid=params, cv=3)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)

0.611111111111111
2




In [146]:
from sklearn.model_selection import RandomizedSearchCV
params = {"n_neighbors": range(1,5),
"weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn,
param_distributions=params,
cv=3,
n_iter=8,
random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)

0.611111111111111


