In [None]:
### Оценка категориальных признаков

In [None]:
# Estimate how the categorical features affect the target variable.

# Continuous columns; everything else is kept for the categorical analysis.
cont_features = [
    'Usage_kWh',
    'Lagging_Power',
    'Leading_Power',
    'CO2(tCO2)',
    'Lagging_Factor',
    'Leading_Factor',
    'NSM',
]

# Build a separate table without the continuous columns
# (drop returns a new DataFrame, so df_balanced itself is left untouched).
data102 = pd.DataFrame(df_balanced).drop(columns=cont_features)
data102.head()

In [None]:
# Encode the target variable as numbers:
# Light_Load = 0, Medium_Load = 1, Maximum_Load = 2

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# List the distinct labels present in the target column before encoding.
data102.loc[:, 'Load_Type'].unique()

In [None]:
# Ordered list of target labels: the position of a label in this list becomes
# its numeric code (Light_Load = 0, Medium_Load = 1, Maximum_Load = 2).
loads = ['Light_Load', 'Medium_Load', 'Maximum_Load']
encoder102 = OrdinalEncoder(categories=[loads])

# Fit and transform exactly once. fit_transform returns a 2-D array with a
# single column, so it is flattened before being stored as a DataFrame column.
data102['Load_Type_№'] = encoder102.fit_transform(data102[['Load_Type']]).ravel()
data102.head()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Effect of the categorical feature 'WeekStatus' on the target variable.

sns.set(font_scale=1)

# Expand any list-like entries of the encoded target into separate rows and
# store the column as float so it can serve as the numeric axis of the plot.
data102 = data102.explode('Load_Type_№').astype({'Load_Type_№': 'float'})
sns.violinplot(x='WeekStatus', y='Load_Type_№', data=data102)

# Row counts for every (load type, week status) combination.
data102.groupby('Load_Type_№')['WeekStatus'].value_counts()

In [None]:
# Effect of the categorical feature 'Day_Of_Week' on the target variable.
# data102 already carries the float-typed 'Load_Type_№' column prepared for
# the 'WeekStatus' plot, so it is reused here as is.
sns.violinplot(x='Day_Of_Week', y='Load_Type_№', data=data102)

# Row counts for every (load type, day of week) combination.
data102.groupby('Load_Type_№')['Day_Of_Week'].value_counts()

In [None]:
Как видно из графиков, категориальный признак 'WeekStatus' оказывает определённое влияние на тип производственной нагрузки, в то время как признак 'Day_Of_Week' распределён достаточно равномерно.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Split the DataFrame into features and labels.
# The rows at positions 57-60 are removed first, so they end up in neither
# the training nor the test set.
# NOTE(review): drop() works on index labels, so exactly four rows are removed
# only if the index of data333 is unique — confirm.
data333_dropped = data333.drop(index=data333.index[57:61])

X50 = data333_dropped.drop(columns='Load_Type')
y50 = data333_dropped['Load_Type'].copy()

# Train/test split: 20% of the rows go to the test set, fixed seed.
X50_train, X50_test, y50_train, y50_test = train_test_split(
    X50,
    y50,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Импорт моделей
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, keyed by the display name used when reporting results.
# The tree-based models are randomised; their seed is fixed (same value as the
# train/test split) so that the reported accuracies are repeatable between runs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

In [None]:
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve, roc_auc_score

# Train every model inside its own pipeline and keep the fitted pipelines.
trained_pipelines = {}

for name, model in models.items():
    # Fit an unfitted copy of the estimator. make_pipeline does not clone its
    # steps, so passing `model` itself would let any later fit of the same
    # object from `models` silently replace the classifier stored in this
    # pipeline.
    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=0.95),  # keep enough components for 95% of the variance
        clone(model),
    )
    pipeline.fit(X50_train, y50_train)
    y50_pred = pipeline.predict(X50_test)
    accuracy = round(accuracy_score(y50_test, y50_pred), 5)
    trained_pipelines[name] = pipeline  # keep the fitted pipeline for later predictions
    print(f"{name} accuracy: {accuracy}")
    

In [None]:
# Display floats in fixed-point notation with five decimals instead of
# scientific notation (e.g. 1.256e+2 is shown as 125.60000).
pd.set_option('display.float_format', '{:.5f}'.format)

# Hard-coded accuracy of every model, one row per dataset version.
df_result = pd.DataFrame(
    {
        'Версия Dataset': [0, 1],
        'Logistic Regression': [0.66583, 0.67583],
        'SVM': [0.7818, 0.7918],
        'Random Forest': [0.85515, 0.86515],
        'KNN': [0.84437, 0.85437],
        'Naive Bayes': [0.67912, 0.68912],
        'Decision Tree': [0.82489, 0.83489],
    }
)

# Only these two models are drawn in this figure; each entry is
# (column to plot, extra line options).
for model_column, line_options in (
    ('Logistic Regression', {'color': 'green'}),
    ('SVM', {'color': 'steelblue', 'linewidth': 4}),
):
    plt.plot(df_result[model_column], label=model_column, **line_options)

plt.legend()

plt.ylabel('Accuracy', fontsize=12)
plt.xlabel('Версия Dataset', fontsize=12)
plt.title('Изменение точности моделей машинного обучения', fontsize=14)

# Render the figure.
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Rows at positions 57-60 of data333: features for a manual check and the
# true labels they are compared against.
X_check = data333.iloc[57:61].drop(columns='Load_Type')
real_load_type = data333['Load_Type'].iloc[57:61].copy()

display(X_check, real_load_type)

In [None]:
# Predict the check rows with every trained pipeline, keyed by model name.
predictions = {
    name: pipeline.predict(X_check)
    for name, pipeline in trained_pipelines.items()
}

# Print each model's predictions, one label per line, then the true labels.
for model_name, predicted_labels in predictions.items():
    print(f'{model_name}:')
    for label in predicted_labels:
        print(f'    {label}')

print('\nReal_Load_Type:')
for label in real_load_type:
    print(f'    {label}')

In [None]:
# Display floats in fixed-point notation with five decimals instead of
# scientific notation (e.g. 1.256e+2 is shown as 125.60000).
pd.options.display.float_format = '{:.5f}'.format

# Hard-coded accuracy of every model, one row per dataset version.
df_result = pd.DataFrame({'Версия Dataset': [0, 1],
                   'Logistic Regression': [0.62549, 0.62549],
                   'SVM': [0.74857, 0.74857],
                   'Random Forest': [0.82993, 0.82993],
                   'KNN': [0.83108, 0.83108],
                   'Naive Bayes': [0.686, 0.686],
                   'Decision Tree': [0.82902, 0.82902]
                 })

# Line options per model. Every model gets its own colour: SVM and KNN used to
# share 'steelblue', which made the two lines and their legend entries differ
# only by line width.
line_styles = {
    'Logistic Regression': {'color': 'green'},
    'SVM': {'color': 'steelblue', 'linewidth': 4},
    'Random Forest': {'color': 'purple', 'linestyle': 'dashed'},
    'KNN': {'color': 'red', 'linewidth': 2},
    'Naive Bayes': {'color': 'yellow'},
    'Decision Tree': {'color': 'pink', 'linewidth': 4},
}
for model_column, line_options in line_styles.items():
    plt.plot(df_result[model_column], label=model_column, **line_options)

plt.legend()

plt.ylabel('Accuracy', fontsize=12)
plt.xlabel('Версия Dataset', fontsize=12)
plt.title('Изменение точности моделей машинного обучения', fontsize=14)

# Render the figure.
plt.show()

In [None]:
# Train the models on the original values of the balanced dataset,
# using only the numeric features (plus the target column).

cols_1 = [
    'Usage_kWh',
    'Lagging_Power',
    'Leading_Power',
    'CO2(tCO2)',
    'Lagging_Factor',
    'Leading_Factor',
    'NSM',
    'Load_Type',
]

data_1 = df_balanced.loc[:, cols_1]

data_1.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Split the DataFrame into features and labels.
# The rows at positions 57-60 are removed first, so they end up in neither
# the training nor the test set.
# NOTE(review): drop() works on index labels, so exactly four rows are removed
# only if the index of data_1 is unique — confirm.
data_1_dropped = data_1.drop(index=data_1.index[57:61])

X1 = data_1_dropped.drop(columns='Load_Type')
y1 = data_1_dropped['Load_Type'].copy()

# Train/test split: 20% of the rows go to the test set, fixed seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the models with PCA as the only preprocessing step (no scaling).

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Train every model inside its own pipeline and keep the fitted pipelines.
trained_pipelines_1 = {}

for name, model in models.items():
    # Fit an unfitted copy of the estimator. make_pipeline does not clone its
    # steps, so fitting `model` itself would refit the very object that other
    # pipelines built from `models` may already hold, silently replacing
    # their trained classifier.
    pipeline_1 = make_pipeline(PCA(n_components=0.95), clone(model))
    pipeline_1.fit(X1_train, y1_train)
    y1_pred = pipeline_1.predict(X1_test)
    accuracy = round(accuracy_score(y1_test, y1_pred), 5)
    trained_pipelines_1[name] = pipeline_1  # keep the fitted pipeline for later predictions
    print(f"{name} accuracy: {accuracy}")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Rows at positions 57-60 of data_1: features for a manual check and the
# true labels they are compared against.
X_check = data_1.iloc[57:61].drop(columns='Load_Type')
real_load_type = data_1['Load_Type'].iloc[57:61].copy()

display(X_check, real_load_type)

In [None]:
# Predict the check rows with every pipeline trained on the numeric-only data.
predictions = {}
for name, pipeline in trained_pipelines_1.items():
    # Use the pipeline of the current iteration. `pipeline_1` is whatever the
    # training loop assigned last, so calling it here would report that single
    # model's predictions under every model name.
    pred = pipeline.predict(X_check)
    predictions[name] = pred

# Print each model's predictions, one label per line, then the true labels.
for model_name, predicted_labels in predictions.items():
    print(f'{model_name}:')
    for label in predicted_labels:
        print(f'    {label}')

print('\nReal_Load_Type:')
for label in real_load_type:
    print(f'    {label}')

In [None]:
# data3 = pd.DataFrame(steel_dataset).copy()

# data3['Day_Of_Week'].unique()

In [None]:
# days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# encoder = OrdinalEncoder(categories = [days])

# encoder.fit_transform(data3[['Day_Of_Week']])

In [None]:
# data3['Day_Of_Week'] = encoder.fit_transform(data3[['Day_Of_Week']])
# data3.tail(5)

In [None]:
# data3['Day_Of_Week'].value_counts()