In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [5]:
from google.colab import drive
drive.mount('/content/drive')
way = "/content/drive/MyDrive/Colab_Notebooks/aaa_ml/datasets/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
RANDOM_STATE = 22
TEST_SIZE = 0.2

## Работа с данными

In [6]:
data = pd.read_csv(way + 'data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4744 entries, 0 to 4743
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   brand                  4744 non-null   object 
 1   model                  4744 non-null   object 
 2   description            4744 non-null   object 
 3   price                  4744 non-null   float64
 4   exposition_days_count  4744 non-null   int64  
 5   images_count           4744 non-null   int64  
 6   label                  4744 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 259.6+ KB


In [7]:
data['label'].value_counts(normalize=True)

good            0.348440
bad             0.316610
excellent       0.187184
no_data         0.101602
satisfactory    0.028246
new             0.017917
Name: label, dtype: float64

In [8]:
X, y = data.drop(columns='label'), data['label']

### Разбиение датасета. Модуль model_selection.train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

In [10]:
print(f'Размер данных для обучения: признаки - {X_train.shape}, таргеты - {y_train.shape}')
print(f'Размер данных для теста: признаки - {X_test.shape}, таргеты - {y_test.shape}')

Размер данных для обучения: признаки - (3795, 6), таргеты - (3795,)
Размер данных для теста: признаки - (949, 6), таргеты - (949,)


In [11]:
y_train.value_counts(normalize=True)

good            0.348353
bad             0.316733
excellent       0.187088
no_data         0.101713
satisfactory    0.028195
new             0.017918
Name: label, dtype: float64

### Работа с числовыми признаками. Модуль preprocessing

In [None]:
numeric_features = ['price', 'exposition_days_count', 'images_count']

scaler = StandardScaler()
X_train_numeric = scaler.fit_transform(X_train[numeric_features])
X_test_numeric = scaler.transform(X_test[numeric_features])

X_train_numeric

array([[-0.54013021,  0.54603495, -0.32757234],
       [-0.39147941,  1.31035846,  1.50191557],
       [-0.29237887,  0.05964726, -1.1279733 ],
       ...,
       [-0.08709919,  1.27561648,  1.27322958],
       [-0.52597299, -0.32251449,  0.01545664],
       [-0.37732219, -0.25303054, -0.32757234]])

In [None]:
scaler.mean_ # среднее значение для каждого признака

array([8130.45612648,  102.56627141,   15.86482213])

In [None]:
scaler.var_  # стандратное отклоение для каждого признака

array([1.99574046e+08, 3.31398790e+03, 7.64858113e+01])

### Работа с категориальными признаками. Модуль preprocessing

In [None]:
cat_features = ['brand', 'model']

enc = OneHotEncoder(handle_unknown='ignore')
X_train_cat = enc.fit_transform(X_train[cat_features])
X_test_cat = enc.transform(X_test[cat_features])

X_test_cat

<949x2640 sparse matrix of type '<class 'numpy.float64'>'
	with 1444 stored elements in Compressed Sparse Row format>

In [None]:
X_test_cat.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
enc.categories_

[array(['AHB', 'AIWA', 'AKAI', 'AMCV', 'AOC', 'AVQ', 'Acer', 'Akira',
        'Aquatelevision', 'Asano', 'Avest', 'BBK', 'BORK', 'BQ', 'BRAVIS',
        'Bang & Olufsen', 'Beko', 'Blauberg', 'Blaupunkt', 'Bose', 'CASIO',
        'CENTEK', 'Cameron', 'Candy', 'Changhong', 'Conrac', 'DENN',
        'DEXP', 'DIGMA', 'DNS', 'Daewoo Electronics', 'Doffler', 'ECON',
        'Elekta', 'Elenberg', 'Erisson', 'Evgo', 'Finlux', 'Funai',
        'Fusion', 'General', 'GoldStar', 'Grundig', 'HAMBER', 'HARPER',
        'HARTENS', 'HEC', 'HOLLEBERG', 'HUAWEI', 'Haier', 'Helix', 'Hi',
        'Hisense', 'Hitachi', 'Horizont', 'Hyundai', 'Irbis', 'Izumi',
        'JVC', 'KIVI', 'LEBEN', 'LG', 'Leff', 'Lentel', 'Loewe', 'MYSTERU',
        'Marine', 'Mystery', 'NATIONAL', 'NEKO', 'Novex', 'OK.', 'OKARI',
        'Olto', 'Orion', 'Panasonic', 'Philips', 'Pioneer', 'Polar',
        'Polarline', 'Premiera', 'Prestigio', 'Prology', 'Rolsen', 'Rotex',
        'Ruimatech', 'Runco', 'STARWIND', 'SUPRA', 'SUZUKI

### Работа с текстовыми признаками. Модуль feature_extraction

In [None]:
text_feature = 'description'

vectorizer = TfidfVectorizer()
X_train_text = vectorizer.fit_transform(X_train[text_feature])
X_test_text = vectorizer.transform(X_test[text_feature])

In [None]:
vectorizer.vocabulary_

{'рабочем': 5540,
 'состоянии': 6172,
 'разбит': 5563,
 'экран': 7113,
 'на': 3918,
 'запчасти': 2947,
 'разбита': 5564,
 'матрица': 3722,
 'телевизор': 6400,
 'funai': 1099,
 'рабочий': 5543,
 'все': 2263,
 'работает': 5519,
 'продам': 5390,
 'отличном': 4506,
 'тв': 6384,
 '55': 642,
 'диагональ': 2605,
 'состояние': 6171,
 'хорошее': 6869,
 'продаю': 5394,
 'потроха': 5133,
 'для': 2654,
 'телека': 6414,
 'lg': 1225,
 '43lj515v': 550,
 'матрицу': 3726,
 'разбили': 5555,
 'остальное': 4445,
 'майн': 3679,
 'ld76h': 1202,
 'eax67041505': 1047,
 '2200р': 303,
 'con': 968,
 '6870c': 719,
 '0532c': 22,
 '400р': 499,
 'динамики': 2625,
 '500р': 617,
 'шлейфы': 7086,
 'довесок': 2671,
 'приложу': 5283,
 'если': 2800,
 'надо': 3933,
 'какие': 3221,
 'sony': 1485,
 'модель': 3849,
 'kdl': 1180,
 '46ex720': 571,
 'жк': 2838,
 'подсветка': 4878,
 'led': 1216,
 '46': 567,
 '1920x1080': 217,
 '1080p': 59,
 'full': 1097,
 'hd': 1117,
 '3d': 484,
 'есть': 2808,
 'pip': 1360,
 'dvr': 1026,
 'звук':

In [None]:
X_test_text

<949x7214 sparse matrix of type '<class 'numpy.float64'>'
	with 11410 stored elements in Compressed Sparse Row format>

### Объединение преобразований над признками. Модуль compose

In [12]:
text_feature = 'description'
cat_features = ['brand', 'model']
numeric_features = ['price', 'exposition_days_count', 'images_count']

In [13]:
preprocessor = ColumnTransformer(
     transformers=[
         ('scaler', StandardScaler(), numeric_features),
         ('text', TfidfVectorizer(), text_feature),
         ('category', OneHotEncoder(handle_unknown='ignore'), cat_features),
     ],
)

X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

In [14]:
X_train_prepared

<3795x10012 sparse matrix of type '<class 'numpy.float64'>'
	with 65876 stored elements in Compressed Sparse Row format>

## Модели для задачи классификации

### Строим baseline. Модуль dummy

In [None]:
dummy_clf = DummyClassifier(strategy="most_frequent")  # будем предсказывать самый частотный таргет
dummy_clf.fit(X_train_prepared, y_train)

In [None]:
y_pred = dummy_clf.predict(X_test_prepared)
y_probas = dummy_clf.predict_proba(X_test_prepared)

print(y_pred[:10])
print(y_probas[:10])

['Хорошее' 'Хорошее' 'Хорошее' 'Хорошее' 'Хорошее' 'Хорошее' 'Хорошее'
 'Хорошее' 'Хорошее' 'Хорошее']
[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]


In [None]:
dummy_clf.score(X_test_prepared, y_test)

0.3487881981032666

### Классификаторы из модуля linear_model

In [None]:
logreg_clf = LogisticRegression(
       random_state=RANDOM_STATE, multi_class='multinomial', class_weight='balanced'
    ).fit(X_train_prepared, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
logreg_clf.coef_

array([[ 0.65419904,  0.14853335, -0.01741205, ..., -0.01733967,
        -0.09752671, -0.02086249],
       [-1.41694715, -0.02203866,  0.03621683, ..., -0.07442307,
        -0.29095128, -0.03707555],
       [ 0.25319723,  0.02839848, -0.07926913, ..., -0.10041411,
        -0.20502751, -0.03004266],
       [ 0.56351026,  0.0128011 , -0.00504001, ..., -0.0304418 ,
         0.19288048, -0.09367521],
       [-0.0417095 , -0.14657875,  0.07674927, ..., -0.04279012,
         0.66861503, -0.08310413],
       [-0.01224988, -0.02111552, -0.01124491, ...,  0.26540877,
        -0.26799001,  0.26476004]])

In [None]:
logreg_clf.score(X_test_prepared, y_test)

0.8609062170706007

In [16]:
sgd_clf = SGDClassifier(random_state=RANDOM_STATE).fit(X_train_prepared, y_train)
sgd_clf.score(X_test_prepared, y_test)

0.8956796628029505

In [15]:
def get_feature_names(preprocessor: ColumnTransformer):
    """
    Отдает названия признаков после метода fit / fit_transform в ColumnTransformer
    """
    features = []
    for _, transformer, transformer_features, _ in preprocessor._iter(fitted=True):

        if isinstance(transformer, str):
            continue

        if hasattr(transformer, 'get_feature_names_out'):
            transformer_features = transformer.get_feature_names_out()

        for feature in transformer_features:
            features.append(feature)

    return features

In [19]:
features = get_feature_names(preprocessor)


In [28]:
sgd_clf.classes_

array(['bad', 'excellent', 'good', 'new', 'no_data', 'satisfactory'],
      dtype='<U12')

In [31]:
feature_weights = pd.concat([pd.Series(features), pd.Series(sgd_clf.coef_[1])], axis = 1)
feature_weights

Unnamed: 0,0,1
0,price,-0.002227
1,exposition_days_count,0.095680
2,images_count,-0.096517
3,00,-0.067806
4,000,-0.095853
...,...,...
10007,model_WS-32Z30HPQ,-0.130041
10008,model_WS-32Z31SSQ,0.000000
10009,model_WS-32Z40HTQ,0.000000
10010,model_Waterproof 65,-0.130041


In [34]:
feature_weights.sort_values(1).tail(10)

Unnamed: 0,0,1
5310,прекрасном,2.047404
3293,картинка,2.051815
6299,состояние,2.569607
4569,отлично,3.206329
3075,идеальное,3.363771
4575,отличный,3.729323
6300,состоянии,4.017566
4571,отличное,5.54735
3076,идеальном,5.639342
4573,отличном,10.093407


## Метрики. Модуль metric

In [None]:
y_pred = sgd_clf.predict(X_test_prepared)
# >>> print(f1_score(y_test, y_pred))

In [None]:
print(f1_score(y_test, y_pred, average='weighted'))

0.8650427402648317


In [None]:
>>> from sklearn.metrics import classification_report

>>> print(classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

         Как новое       0.64      0.41      0.50        17
       На запчасти       0.94      0.95      0.94       300
        Нет данных       0.73      0.73      0.73        96
          Отличное       0.86      0.88      0.87       178
Удовлетворительное       0.40      0.15      0.22        27
           Хорошее       0.88      0.92      0.90       331

          accuracy                           0.87       949
         macro avg       0.74      0.67      0.69       949
      weighted avg       0.86      0.87      0.87       949



## Поиск оптимальных гиперпараметров. Модуль model_selection

In [None]:
parameters = {
    'loss': [
        'hinge',
        'modified_huber',
        'squared_hinge',
        'perceptron',
        'huber',
    ],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'max_iter': [100, 500, 1000],
    'learning_rate': ['constant', 'optimal', 'adaptive'],
    'eta0': [0.1, 0.5]
}

In [None]:
grid_search = GridSearchCV(sgd_clf, parameters)
grid_search.fit(X_train_prepared, y_train)



In [None]:
grid_search.best_params_

{'eta0': 0.1,
 'learning_rate': 'adaptive',
 'loss': 'perceptron',
 'max_iter': 100,
 'penalty': 'l1'}

In [None]:
y_pred = grid_search.best_estimator_.predict(X_test_prepared)
f1_score(y_pred, y_test, average='weighted')

0.8991991281947176

In [35]:
parameters = {
    'eta0': [0.1, 0.2, 0.3, 0.4, 0.5],
    'learning_rate': ['constant', 'optimal', 'adaptive'],
    'loss': ['hinge', 'modified_huber', 'squared_hinge','perceptron', 'huber'],
    'max_iter': [50, 100, 200, 500, 1000],
    'penalty': ['l2', 'l1', 'elasticnet']
}

In [36]:
grid_search = GridSearchCV(sgd_clf, parameters)
grid_search.fit(X_train_prepared, y_train)



In [39]:
grid_search.best_params_.values()

dict_values([0.3, 'adaptive', 'perceptron', 50, 'l1'])

## Объединение преобразований над признаками с обучением модели. Модуль pipeline

In [None]:
pipeline = Pipeline(
     steps=[
         (
             'preprocessor', ColumnTransformer(
                 transformers=[
                     ('text', TfidfVectorizer(), text_feature),
                     ('category', OneHotEncoder(handle_unknown='ignore'), cat_features),
                 ],
             )
         ),
         ('classifier', SGDClassifier(
             eta0=0.5,
             learning_rate='adaptive',
             loss='hinge',
             max_iter=100,
             penalty='l1',
             random_state=RANDOM_STATE,
         )
         ),
     ],
 )
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [None]:
f1_score(y_pred, y_test, average='weighted')

0.8969529850491467

### Приложение к задаче

In [None]:
>>> def get_feature_names(preprocessor: ColumnTransformer):
    """
    Отдает названия признаков после метода fit / fit_transform в ColumnTransformer
    """
    features = []
    for _, transformer, transformer_features, _ in preprocessor._iter(fitted=True):

        if isinstance(transformer, str):
            continue

        if hasattr(transformer, 'get_feature_names_out'):
            transformer_features = transformer.get_feature_names_out()

        if hasattr(transformer, 'get_feature_names'):
            transformer_features = transformer.get_feature_names()

        for feature in transformer_features:
            features.append(feature)

    return features

In [None]:
# get_feature_names(preprocessor)