# Установка библиотек и импорты

In [None]:
pip install openpyxl pandas numpy sklearn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.neighbors import LocalOutlierFactor
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate

# Загрузка датасета

### Загрузка x_bp

In [None]:
# Load the X_bp workbook and store its 'Unnamed: 0' column as integers.
x_bp = pd.read_excel("./hw_data_composite/X_bp.xlsx").astype({'Unnamed: 0': 'int'})
x_bp

### Загрузка x_nup

In [None]:
# Load the X_nup workbook; 'Unnamed: 0' and the angle column are stored as integers.
x_nup = pd.read_excel("./hw_data_composite/X_nup.xlsx").astype(
    {'Unnamed: 0': 'int', 'Угол нашивки, град': 'int'})
x_nup

### Объединение таблиц и удаление столбца-индекса

In [None]:
# Inner-join both tables on the shared 'Unnamed: 0' key, then discard that key column.
dataset = x_bp.merge(x_nup, how='inner', on='Unnamed: 0').drop(columns='Unnamed: 0')
dataset

# Разведочный анализ

### Медианы по столбцам

In [None]:
# Median of every column of the merged table.
dataset.median()

### Средние значения по столбцам

In [None]:
# Arithmetic mean of every column of the merged table.
dataset.mean()

### Убираю угол нашивки для отрисовки графиков (неинформативный параметр, т.к. принимает всего 2 значения)

In [None]:
# Copy of the table without the angle column; used for the plots below.
ds_without_angles = dataset.drop(columns='Угол нашивки, град')

### Гистограммы распределения переменных

In [None]:
# One 20-bin histogram per remaining column, drawn on a 20x20 figure.
ds_without_angles.hist(figsize = (20, 20), bins = 20, color = 'green')

### Диаграммы размаха (ящики с усами) по каждой переменной

In [None]:
# Draw one box plot per column, each as its own figure.
# The series is read from ds_without_angles, the same frame whose columns are
# being iterated, so the loop never depends on a second table staying in sync.
for col in ds_without_angles.columns:
    sns.boxplot(x = ds_without_angles[col], color = 'green')
    plt.show()

### Попарные диаграммы рассеяния

In [None]:
# Pairwise scatter plots of every column except the angle.
pair_diagram = scatter_matrix(
    ds_without_angles,
    figsize = (16, 16),
    alpha = 0.25,
    color = 'green',
    diagonal = None,
)
# Re-apply each axis label with font size 10 and a 45-degree rotation.
for subplot in pair_diagram.flat:
    x_title, y_title = subplot.get_xlabel(), subplot.get_ylabel()
    subplot.set_xlabel(x_title, fontsize = 10, rotation = 45)
    subplot.set_ylabel(y_title, fontsize = 10, rotation = 45)
pair_diagram

# Предобработка данных

### Проверка наличия пропусков

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

### Поиск выбросов с помощью многомерного LocalOutlierFactor

In [None]:
# Flag multivariate outliers with LocalOutlierFactor on the table without the
# angle column; rows predicted as -1 are treated as outliers.
lof = LocalOutlierFactor(n_neighbors = 300)
lof_pred = lof.fit_predict(ds_without_angles)
# fit_predict returns one label per row *position*, whereas .loc and .drop
# address rows by index *label*. Translate positions to labels so the right
# rows are picked even when the index is not a plain 0..n-1 range (for
# example when this cell is re-run after rows were already dropped).
outlier_indexes = ds_without_angles.index[lof_pred == -1].tolist()
print('Кол-во выбросов: ' + str(len(outlier_indexes)))
dataset.loc[outlier_indexes]

### Удаление выбросов из датасета

In [None]:
# Remove the flagged rows in place from both tables; labels that are already
# absent are skipped (errors='ignore').
for frame in (dataset, ds_without_angles):
    frame.drop(index = outlier_indexes, inplace = True, errors='ignore')
dataset

### Разделение датасета на входные и выходные параметры

In [None]:
# Inputs: every column except the three that are treated as outputs.
dataset_inputs = dataset.drop(columns = ['Соотношение матрица-наполнитель',
                                         'Модуль упругости при растяжении, ГПа',
                                         'Прочность при растяжении, МПа'])
# Each output is kept as its own single-column DataFrame.
dataset_output_mod_up = dataset['Модуль упругости при растяжении, ГПа'].to_frame()
dataset_output_mat_nap = dataset['Соотношение матрица-наполнитель'].to_frame()
dataset_output_str = dataset['Прочность при растяжении, МПа'].to_frame()

### Метод для scaling'а

In [None]:
def scale(scaler, df):
    """Fit ``scaler`` on ``df`` and return it together with the scaled data.

    Parameters
    ----------
    scaler : object exposing ``fit(X)`` and ``transform(X)``
        Fitted in place on ``df``.
    df : pandas.DataFrame
        Data to scale.

    Returns
    -------
    tuple
        ``(scaler, df_scaled)``; ``df_scaled`` is a new DataFrame with the
        column names of ``df`` and a fresh default ``0..n-1`` index.
    """
    scaler.fit(df)
    # Transform the same object that was used for fitting. Fitting on a
    # DataFrame and transforming its bare ``.values`` makes scikit-learn warn
    # about missing feature names.
    scaled_values = np.asarray(scaler.transform(df))
    df_scaled = pd.DataFrame(scaled_values, columns=df.columns)
    return scaler, df_scaled

### Нормализация

In [None]:
# Min-max scaling of the input columns; the fitted scaler is kept alongside the result.
min_max_scaler_inputs, dataset_inputs_normalized = scale(
    preprocessing.MinMaxScaler(), dataset_inputs)
dataset_inputs_normalized

In [None]:
# Min-max scaling of each output column, with a dedicated scaler per output so
# predictions can later be mapped back with inverse_transform.
min_max_scaler_output_mod_up, dataset_output_mod_up_normalized = scale(
    preprocessing.MinMaxScaler(), dataset_output_mod_up)
min_max_scaler_output_mat_nap, dataset_output_mat_nap_normalized = scale(
    preprocessing.MinMaxScaler(), dataset_output_mat_nap)
min_max_scaler_output_str, dataset_output_str_normalized = scale(
    preprocessing.MinMaxScaler(), dataset_output_str)

### Стандартизация

In [None]:
# Standard scaling of the input columns; the fitted scaler is kept alongside the result.
standard_scaler_inputs, dataset_inputs_standartized = scale(
    preprocessing.StandardScaler(), dataset_inputs)
dataset_inputs_standartized

In [None]:
# Standard scaling of each output column, with a dedicated scaler per output so
# predictions can later be mapped back with inverse_transform.
standard_scaler_output_mod_up, dataset_output_mod_up_standartized = scale(
    preprocessing.StandardScaler(), dataset_output_mod_up)
standard_scaler_output_mat_nap, dataset_output_mat_nap_standartized = scale(
    preprocessing.StandardScaler(), dataset_output_mat_nap)
standard_scaler_output_str, dataset_output_str_standartized = scale(
    preprocessing.StandardScaler(), dataset_output_str)

# Построение моделей

### Подбор гиперпараметров по сетке с кросс-валидацией по метрике R2 и обучение на лучших параметрах

### Ридж-регрессия 

### Вспомогательные функции для подбора alpha и обучения ридж-регрессии

In [None]:
def calculate_ridge(x, y, alphas = np.logspace(-10, -0, 11),
                    n_splits = 10, scoring = None):
    """Cross-validate Ridge regression for every candidate ``alpha``.

    Parameters
    ----------
    x, y : array-like
        Features and target passed to ``cross_val_score``.
    alphas : iterable of float
        Regularisation strengths to evaluate.
    n_splits : int
        Number of shuffled K-fold splits.
    scoring : str, callable or None
        Forwarded to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'alpha'`` and ``'score'`` (mean score over the folds), one
        row per candidate, in the order of ``alphas``.
    """
    # Draw the shuffled folds once and reuse them for every alpha. Creating a
    # new shuffled KFold per candidate scores each alpha on different splits,
    # so differences between scores would partly reflect the split, not alpha.
    folds = list(KFold(n_splits, shuffle = True).split(x))
    ridge_results = []
    for alpha in alphas:
        fold_scores = cross_val_score(
            Ridge(alpha), x, y, cv = folds, n_jobs = -1, scoring = scoring)
        ridge_results.append([alpha, fold_scores.mean()])
    return pd.DataFrame(ridge_results, columns = ['alpha', 'score'])

In [None]:
def make_ridge_model(ridge_cross_val_res, dataset_inputs, dataset_output,
                     output_scaler, test_size=0.3):
    """Train Ridge with the best cross-validated alpha and report the test error.

    Parameters
    ----------
    ridge_cross_val_res : pandas.DataFrame
        Table with ``'alpha'`` and ``'score'`` columns; the alpha with the
        highest score is used.
    dataset_inputs, dataset_output : array-like
        Scaled features and scaled target to split and train on.
    output_scaler : fitted scaler
        Its ``inverse_transform`` maps predictions and test targets back to
        the original units; it must be the scaler that produced
        ``dataset_output``.
    test_size : float
        Share of the rows held out for testing.

    Returns
    -------
    sklearn.linear_model.Ridge
        The fitted model. The comparison table and the mean absolute error
        are printed.
    """
    alpha_best = ridge_cross_val_res.sort_values(by = 'score', ascending = False)['alpha'].iloc[0]
    print('Best alpha: ' + str(alpha_best))

    ridge_model = Ridge(alpha_best)
    # Split the data that was passed in. The previous version ignored both
    # arguments and always trained on the global normalized modulus data.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset_inputs, dataset_output, test_size=test_size)
    ridge_model.fit(X_train, y_train)

    # inverse_transform needs 2-D input; promote 1-D arrays to one column.
    predicted = np.asarray(ridge_model.predict(X_test))
    if predicted.ndim == 1:
        predicted = predicted.reshape(-1, 1)
    actual = np.asarray(y_test)
    if actual.ndim == 1:
        actual = actual.reshape(-1, 1)

    comparison_df = pd.DataFrame(
        np.column_stack([output_scaler.inverse_transform(predicted),
                         output_scaler.inverse_transform(actual)]),
        columns=['Предсказанные данные', 'Тестовые данные'])

    print(comparison_df)
    print('Средняя абсолютная ошибка: ' + str(mean_absolute_error(comparison_df['Тестовые данные'], comparison_df['Предсказанные данные'])))

    return ridge_model

### Прогноз модуля упругости при растяжении на нормализованном датасете

In [None]:
# Alpha sweep for the tensile modulus on the min-max scaled data.
ridge_mod_up_cross_val_res_normalized = calculate_ridge(
    x = dataset_inputs_normalized,
    y = dataset_output_mod_up_normalized)
ridge_mod_up_cross_val_res_normalized

In [None]:
# Train the tensile-modulus Ridge model with the best alpha from the sweep above.
ridge_mod_up_best_normalized = make_ridge_model(
    ridge_cross_val_res = ridge_mod_up_cross_val_res_normalized,
    dataset_inputs = dataset_inputs_normalized,
    dataset_output = dataset_output_mod_up_normalized,
    output_scaler = min_max_scaler_output_mod_up)

### Прогноз прочности при растяжении на нормализованном датасете

In [None]:
# Alpha sweep for the tensile strength on the min-max scaled data.
ridge_str_cross_val_res_normalized = calculate_ridge(
    x = dataset_inputs_normalized,
    y = dataset_output_str_normalized)
ridge_str_cross_val_res_normalized

In [None]:
# Train the tensile-strength Ridge model with the best alpha from the sweep above.
ridge_str_best_normalized = make_ridge_model(
    ridge_cross_val_res = ridge_str_cross_val_res_normalized,
    dataset_inputs = dataset_inputs_normalized,
    dataset_output = dataset_output_str_normalized,
    output_scaler = min_max_scaler_output_str)

### Прогноз модуля упругости при растяжении на стандартизированном датасете

In [None]:
# Alpha sweep for the tensile modulus on the standard-scaled data.
ridge_mod_up_cross_val_res_standartized = calculate_ridge(
    x = dataset_inputs_standartized,
    y = dataset_output_mod_up_standartized)
ridge_mod_up_cross_val_res_standartized

In [None]:
# Train the tensile-modulus Ridge model on the standard-scaled data.
# The target passed here was produced by standard_scaler_output_mod_up, so the
# same scaler must be used to map values back; the min-max scaler that was
# passed before belongs to a different transformation of the target.
ridge_mod_up_best_standartized = make_ridge_model(ridge_mod_up_cross_val_res_standartized, dataset_inputs_standartized,
                                                  dataset_output_mod_up_standartized, standard_scaler_output_mod_up)

### Прогноз прочности при растяжении на стандартизированном датасете

In [None]:
# Alpha sweep for the tensile strength on the standard-scaled data.
ridge_str_cross_val_res_standartized = calculate_ridge(
    x = dataset_inputs_standartized,
    y = dataset_output_str_standartized)
ridge_str_cross_val_res_standartized

In [None]:
# Train the tensile-strength Ridge model on the standard-scaled data.
# The target passed here was produced by standard_scaler_output_str, so the
# same scaler must be used to map values back; the min-max scaler that was
# passed before belongs to a different transformation of the target.
ridge_str_best_standartized = make_ridge_model(ridge_str_cross_val_res_standartized, dataset_inputs_standartized,
                                               dataset_output_str_standartized, standard_scaler_output_str)

### ElasticNet регрессия с полиномиальными параметрами

In [None]:
def calculate_elastic(x, y, alphas = np.logspace(-10, -0, 11), ratios = np.arange(11)/10,
                      n_splits = 10, scoring = None):
    """Cross-validate ElasticNet on polynomial features over an alpha/ratio grid.

    Parameters
    ----------
    x, y : array-like
        Features (expanded with ``PolynomialFeatures()`` before fitting) and
        target.
    alphas : iterable of float
        Regularisation strengths to evaluate.
    ratios : iterable of float
        ``l1_ratio`` values to evaluate.
    n_splits : int
        Number of shuffled K-fold splits.
    scoring : str, callable or None
        Forwarded to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'alpha'``, ``'ratio'`` and ``'score'`` (mean score over the
        folds), one row per grid point.
    """
    poly_x = PolynomialFeatures().fit_transform(x)
    # Draw the shuffled folds once so every grid point is scored on the same
    # splits and the scores are directly comparable.
    folds = list(KFold(n_splits, shuffle = True).split(poly_x))
    poly_elastic_results = []
    for alpha in alphas:
        for ratio in ratios:
            # scoring used to be accepted by this function but never passed on.
            fold_scores = cross_val_score(
                ElasticNet(alpha, l1_ratio = ratio),
                poly_x, y, n_jobs = -1,
                cv = folds, scoring = scoring)
            poly_elastic_results.append([alpha, ratio, fold_scores.mean()])
    return pd.DataFrame(poly_elastic_results, columns = ['alpha', 'ratio', 'score'])

In [None]:
def make_elastic_model(elastic_cross_val_res, dataset_inputs, dataset_output,
                       output_scaler, test_size=0.3):
    """Train polynomial ElasticNet with the best grid point and report the test error.

    Parameters
    ----------
    elastic_cross_val_res : pandas.DataFrame
        Table with ``'alpha'``, ``'ratio'`` and ``'score'`` columns; the row
        with the highest score is used.
    dataset_inputs, dataset_output : array-like
        Scaled features and scaled target to split and train on.
    output_scaler : fitted scaler
        Its ``inverse_transform`` maps predictions and test targets back to
        the original units; it must be the scaler that produced
        ``dataset_output``.
    test_size : float
        Share of the rows held out for testing.

    Returns
    -------
    sklearn.linear_model.ElasticNet
        The fitted model; it expects polynomial-expanded features. The
        comparison table and the mean absolute error are printed.
    """
    # Keep the whole best row. Selecting only the 'alpha' column first (as the
    # previous version did) leaves a Series with no 'alpha'/'ratio' keys.
    best_row = elastic_cross_val_res.sort_values(by = 'score', ascending = False).iloc[0]
    alpha_best = best_row['alpha']
    print('Best alpha: ' + str(alpha_best))
    ratio_best = best_row['ratio']
    print('Best ratio: ' + str(ratio_best))

    # The ElasticNet keyword is l1_ratio; 'ratio' is not an accepted argument.
    elastic_model = ElasticNet(alpha = alpha_best, l1_ratio = ratio_best)
    # Split the data that was passed in rather than the global normalized
    # modulus tables.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset_inputs, dataset_output, test_size=test_size)
    # Fit the feature expansion on the training part and reuse it for the test part.
    poly = PolynomialFeatures()
    poly_x_train = poly.fit_transform(X_train)
    poly_x_test = poly.transform(X_test)
    elastic_model.fit(poly_x_train, y_train)

    # inverse_transform needs 2-D input; promote 1-D arrays to one column.
    predicted = np.asarray(elastic_model.predict(poly_x_test))
    if predicted.ndim == 1:
        predicted = predicted.reshape(-1, 1)
    actual = np.asarray(y_test)
    if actual.ndim == 1:
        actual = actual.reshape(-1, 1)

    comparison_df = pd.DataFrame(
        np.column_stack([output_scaler.inverse_transform(predicted),
                         output_scaler.inverse_transform(actual)]),
        columns=['Предсказанные данные', 'Тестовые данные'])

    print(comparison_df)
    print('Средняя абсолютная ошибка: ' + str(mean_absolute_error(comparison_df['Тестовые данные'], comparison_df['Предсказанные данные'])))

    return elastic_model

In [None]:
# Alpha / l1_ratio grid for the tensile modulus on the min-max scaled data.
elastic_mod_up_cross_val_res_normalized = calculate_elastic(
    x = dataset_inputs_normalized,
    y = dataset_output_mod_up_normalized)
elastic_mod_up_cross_val_res_normalized

In [None]:
# Train the tensile-modulus ElasticNet model with the best grid point from above.
elastic_mod_up_best_normalized = make_elastic_model(
    elastic_cross_val_res = elastic_mod_up_cross_val_res_normalized,
    dataset_inputs = dataset_inputs_normalized,
    dataset_output = dataset_output_mod_up_normalized,
    output_scaler = min_max_scaler_output_mod_up)

In [None]:
# Ridge alpha sweep, best score first. This is the same sweep that
# calculate_ridge() performs with its default arguments, so the helper is
# reused instead of repeating the loop.
# NOTE(review): input_data and output_data are not defined in the visible part
# of this notebook — confirm where they come from.
ridge_df = calculate_ridge(input_data, output_data).sort_values(by = 'score', ascending = False)
ridge_df

In [None]:
# Lasso alpha sweep: mean 10-fold cross-validation score per alpha, best first.
# NOTE(review): input_data and output_data are not defined in the visible part
# of this notebook — confirm where they come from.
lasso_results = [
    [alpha,
     cross_val_score(Lasso(alpha), input_data, output_data,
                     cv=KFold(n_splits = 10, shuffle = True), n_jobs=-1).mean()]
    for alpha in np.logspace(-10, -0, 11)
]
lasso_df = pd.DataFrame(lasso_results, columns = ['alpha','score'])
lasso_df = lasso_df.sort_values(by = 'score', ascending = False)
lasso_df

In [None]:
# Polynomial ElasticNet grid over alpha and l1_ratio, best score first.
# calculate_elastic() runs the same grid with its default arguments and
# expands the features once, instead of recomputing the polynomial expansion
# for every grid point.
# NOTE(review): input_data and output_data are not defined in the visible part
# of this notebook — confirm where they come from.
pedf = calculate_elastic(input_data, output_data).sort_values(by = 'score', ascending = False)
pedf

In [None]:
# Shuffled hold-out split: 30% of the rows go to the test part.
# NOTE(review): input_data and output_data are not defined in the visible part
# of this notebook — confirm where they come from.
input_train, input_test, out_train, out_test = train_test_split(
    input_data,
    output_data,
    test_size = 0.3,
    shuffle = True,
)

In [None]:
# Plain linear regression baseline, scored on the held-out part.
linr = LinearRegression(n_jobs = -1)
linr.fit(input_train, out_train)
# Score the model that was just fitted; 'lr' was never defined.
print(linr.score(input_test, out_test))

In [None]:
# Ridge with the alpha from the first row of ridge_df, scored on the held-out part.
ridge_alpha = ridge_df['alpha'].iloc[0]
ridr = Ridge(alpha = ridge_alpha).fit(input_train, out_train)
print(ridr.score(input_test, out_test))

In [None]:
# Lasso with the alpha from the first row of lasso_df, scored on the held-out part.
lasso_alpha = lasso_df['alpha'].iloc[0]
lasr = Lasso(alpha = lasso_alpha).fit(input_train, out_train)
print(lasr.score(input_test, out_test))

In [None]:
# ElasticNet on degree-2 polynomial features, using the alpha and l1_ratio
# from the first row of pedf, scored on the held-out part.
per = ElasticNet(alpha = pedf['alpha'].iloc[0], l1_ratio = pedf['ratio'].iloc[0])
# Fit the feature expansion once on the training part and reuse it for the
# test part, instead of fitting a second transformer on the test data.
poly_features = PolynomialFeatures(2)
per.fit(poly_features.fit_transform(input_train), out_train)
print(per.score(poly_features.transform(input_test), out_test))