### Import bibliotek

In [1]:
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from seaborn import load_dataset
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler


pio.templates.default = "plotly_white"
default_font_size = 20
pio.templates["plotly_white"]["layout"].update(
    {
        "title_font": {"size": default_font_size},
        "xaxis_title_font": {"size": default_font_size},
        "yaxis_title_font": {"size": default_font_size},
        "legend_font": {"size": default_font_size},
    }
)
np.set_printoptions(precision=12, suppress=True, linewidth=150)
pd.options.display.float_format = '{:.3f}'.format

### Załadowanie danych i wstępna eksploracja

#### Wczytanie danych z pliku excel i przetwarzanie ich jako ramka danych pakietu pandas. 

In [2]:
!pip install openpyxl




[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
raw_dataset = pd.read_excel('dataset.xlsx')
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103073 entries, 0 to 103072
Data columns (total 25 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     103073 non-null  int64  
 1   Raingage       103073 non-null  int64  
 2   Outlet         103073 non-null  object 
 3   Area           103073 non-null  int64  
 4   PercImperv     103073 non-null  int64  
 5   Width          103073 non-null  int64  
 6   PercSlope      103073 non-null  int64  
 7   CurbLength     103073 non-null  int64  
 8   N-Imperv       103073 non-null  float64
 9   N-Perv         103073 non-null  float64
 10  S-Imperv       103073 non-null  float64
 11  S-Perv         103073 non-null  float64
 12  PctZero        103073 non-null  int64  
 13  RouteTo        103073 non-null  object 
 14  TotalPrecip    103073 non-null  int64  
 15  TotalRunon     103073 non-null  int64  
 16  TotalEvap      103073 non-null  int64  
 17  TotalInfil     103073 non-nul

#### Przygotowanie danych do eksploracji i manipulacji oraz wyświetlenie pierwszych pięciu wierszy.

In [4]:
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0.1,Unnamed: 0,Raingage,Outlet,Area,PercImperv,Width,PercSlope,CurbLength,N-Imperv,N-Perv,...,TotalRunon,TotalEvap,TotalInfil,ImpervRunoff,PervRunoff,TotalRunoffIn,TotalRunoffMG,PeakRunoff,RunoffCoeff,coords
0,0,1,J1,5,25,100,10,0,0.01,1.0,...,0,0,8.13,24.12,43.08,67.2,3.36,0.19,0.693,"[(777181.812, 592589.633), (777181.932, 592580..."
1,1,1,J1,5,1,1,1,0,0.015,0.015,...,0,0,12.04,0.93,20.2,21.13,1.06,0.07,0.218,"[(777181.812, 592589.633), (777181.932, 592580..."
2,2,1,J1,5,1,1,1,0,0.015,0.015,...,0,0,12.04,0.93,20.2,21.13,1.06,0.07,0.218,"[(777181.812, 592589.633), (777181.932, 592580..."
3,3,1,J1,5,1,1,1,0,0.015,0.015,...,0,0,12.04,0.93,20.2,21.13,1.06,0.07,0.218,"[(777181.812, 592589.633), (777181.932, 592580..."
4,4,1,J1,5,1,1,1,0,0.015,0.015,...,0,0,12.04,0.93,20.2,21.13,1.06,0.07,0.218,"[(777181.812, 592589.633), (777181.932, 592580..."


#### Usunięcie niepotrzebnych kolumn.

In [5]:
dataset.drop(["Unnamed: 0", 'Area', 'RouteTo', 'CurbLength', 'ImpervRunoff', 'PervRunoff', 'TotalInfil', 'PeakRunoff', 'coords', 'RunoffCoeff', 'TotalRunoffIn', 'TotalRunon', 'TotalEvap', "TotalPrecip", "Raingage", "Outlet", ], axis=1, inplace=True)
dataset.rename(columns={"TotalRunoffMG": "Runoff"}, inplace=True)
dataset.head()

Unnamed: 0,PercImperv,Width,PercSlope,N-Imperv,N-Perv,S-Imperv,S-Perv,PctZero,Runoff
0,25,100,10,0.01,1.0,0.05,0.05,25,3.36
1,1,1,1,0.015,0.015,0.05,0.05,1,1.06
2,1,1,1,0.015,0.015,0.05,0.05,10,1.06
3,1,1,1,0.015,0.015,0.05,0.05,20,1.06
4,1,1,1,0.015,0.015,0.05,0.05,40,1.06


#### Procentowa ilość pustych wartości dla kolumny

In [6]:
dataset.isnull().sum() / len(dataset)

PercImperv   0.000
Width        0.000
PercSlope    0.000
N-Imperv     0.000
N-Perv       0.000
S-Imperv     0.000
S-Perv       0.000
PctZero      0.000
Runoff       0.000
dtype: float64

##### W zbiorze nie występują próbki z pustymi danymi

#### Podstawowe informacje o zbiorze danych. Zbiór zawiera 8 cech. Szukaną jest TotalRunoff.

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103073 entries, 0 to 103072
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   PercImperv  103073 non-null  int64  
 1   Width       103073 non-null  int64  
 2   PercSlope   103073 non-null  int64  
 3   N-Imperv    103073 non-null  float64
 4   N-Perv      103073 non-null  float64
 5   S-Imperv    103073 non-null  float64
 6   S-Perv      103073 non-null  float64
 7   PctZero     103073 non-null  int64  
 8   Runoff      103073 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 7.1 MB


## Chande datatypes


In [8]:
## Chande datatypes
def change_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    dataset["PercImperv"] = dataset["PercImperv"].astype("int8")
    dataset["Width"] = dataset["Width"].astype("int16")
    dataset["PercSlope"] = dataset["PercSlope"].astype("int8")
    dataset["N-Imperv"] = dataset["N-Imperv"].astype("float32")
    dataset["N-Perv"] = dataset["N-Perv"].astype("float32")
    dataset["S-Imperv"] = dataset["S-Imperv"].astype("float32")
    dataset["S-Perv"] = dataset["S-Perv"].astype("float32")
    dataset["PctZero"] = dataset["PctZero"].astype("int8")
    return df

In [9]:
dataset = change_dtypes(dataset)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103073 entries, 0 to 103072
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   PercImperv  103073 non-null  int8   
 1   Width       103073 non-null  int16  
 2   PercSlope   103073 non-null  int8   
 3   N-Imperv    103073 non-null  float32
 4   N-Perv      103073 non-null  float32
 5   S-Imperv    103073 non-null  float32
 6   S-Perv      103073 non-null  float32
 7   PctZero     103073 non-null  int8   
 8   Runoff      103073 non-null  float64
dtypes: float32(4), float64(1), int16(1), int8(3)
memory usage: 2.9 MB


#### Pozyskanie podstawowych statystych ze zbioru danych.

In [10]:
stats = dataset.describe()
# stats = stats.transpose()
stats
stats.to_excel("stats.xlsx")

Podstawowe statystyki opisowe dla każdej kolumny zawierającej dane liczbowe statystyk, jak:

    count: liczba niepustych (nie-NA/null) wartości
    mean: średnia wartość
    std: odchylenie standardowe
    min: wartość minimalna
    25%: pierwszy kwartyl (kwantyl rzędu 0.25)
    50%: mediana (drugiego kwartyl, kwantyl rzędu 0.5)
    75%: trzeci kwartyl (kwantyl rzędu 0.75)
    max: wartość maksymalna

#### Rozkład zmienej docelowej (Runoff)

In [11]:
fig = px.histogram(dataset, x='Runoff', width=1000, height=500)
fig.update_layout(
    title={'text': "Rozkład danych zmiennej Runoff", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top', 'font': {'size': 36}},
    xaxis_title="Runoff [m3/s]",
    yaxis_title="Liczba próbek",
    xaxis=dict(title=dict(font=dict(size=36))),
    yaxis=dict(title=dict(font=dict(size=36))),
    font=dict(size=36)
)

### Liczba próbek zerowych.

In [12]:
dataset.Runoff.value_counts()

4.460    2247
4.490    2034
4.500    2016
4.530    1904
4.470    1864
         ... 
3.390       2
1.720       2
2.270       2
3.010       1
0.410       1
Name: Runoff, Length: 400, dtype: int64

### Pozyskanie podstawowych statystych ze zbioru danych po przetworzeniu danych.

In [13]:
dataset.describe()

Unnamed: 0,PercImperv,Width,PercSlope,N-Imperv,N-Perv,S-Imperv,S-Perv,PctZero,Runoff
count,103073.0,103073.0,103073.0,103073.0,103073.0,103073.0,103073.0,103073.0,103073.0
mean,42.902,481.564,42.807,0.399,0.403,0.183,0.183,49.981,3.67
std,34.639,359.301,34.742,0.32,0.32,0.103,0.103,35.066,1.512
min,1.0,1.0,1.0,0.01,0.015,0.05,0.05,1.0,0.03
25%,10.0,250.0,10.0,0.015,0.015,0.05,0.05,25.0,3.71
50%,40.0,500.0,40.0,0.4,0.4,0.2,0.2,50.0,4.4
75%,80.0,750.0,80.0,0.8,0.8,0.3,0.3,75.0,4.56
max,100.0,1000.0,100.0,0.8,1.0,0.3,0.3,100.0,4.85


Podstawowe statystyki opisowe dla każdej kolumny zawierającej dane liczbowe statystyk, jak:

    count: liczba niepustych (nie-NA/null) wartości
    mean: średnia wartość
    std: odchylenie standardowe
    min: wartość minimalna
    25%: pierwszy kwartyl (kwantyl rzędu 0.25)
    50%: mediana (drugiego kwartyl, kwantyl rzędu 0.5)
    75%: trzeci kwartyl (kwantyl rzędu 0.75)
    max: wartość maksymalna

In [14]:
fig = px.box(dataset, y='Runoff', width=500, height=1000, title='Rozkład danych zmiennej Runoff')
fig.update_layout(
    title={'text': "Rozkład danych zmiennej Runoff", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top', 'font': {'size': 36}},
    xaxis=dict(title=dict(font=dict(size=36))),
    yaxis=dict(title=dict(font=dict(size=36))),
    font=dict(size=36)
)
fig.show()

## Podział na zbiór treningowy i testowy

In [15]:
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

print(f'train_dataset length: {len(train_dataset)}')
print(f'test_dataset length: {len(test_dataset)}')

train_dataset length: 82458
test_dataset length: 20615


#### Zbiór treningowy zawiera 34560 próbek.
Poniżej wyświetlono 5 przykładowych próbek zbioru treningowego.

In [16]:
train_dataset.head()

Unnamed: 0,PercImperv,Width,PercSlope,N-Imperv,N-Perv,S-Imperv,S-Perv,PctZero,Runoff
40961,1,250,100,0.4,0.8,0.3,0.05,75,4.08
41295,10,250,100,0.4,0.4,0.05,0.2,50,4.29
51653,100,500,20,0.015,0.015,0.2,0.05,1,4.82
71434,80,750,20,0.8,0.4,0.3,0.3,25,4.56
43577,1,500,1,0.015,0.015,0.3,0.2,100,4.39


#### Zbiór testowy zawiera 8640 próbek.
Poniżej wyświetlono 5 przykładowych próbek zbioru testowego.

In [17]:
test_dataset.head()

Unnamed: 0,PercImperv,Width,PercSlope,N-Imperv,N-Perv,S-Imperv,S-Perv,PctZero,Runoff
2,1,1,1,0.015,0.015,0.05,0.05,10,1.06
10,1,1,1,0.015,0.015,0.05,0.1,20,1.06
13,1,1,1,0.015,0.015,0.05,0.1,80,1.06
20,1,1,1,0.015,0.015,0.05,0.2,80,1.05
21,1,1,1,0.015,0.015,0.05,0.2,100,1.05


#### Statystyki zbioru treningowego przed normalizacją danych.

In [18]:
train_stats = train_dataset.describe()
train_stats.pop('Runoff')
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PercImperv,82458.0,42.852,34.614,1.0,10.0,40.0,80.0,100.0
Width,82458.0,481.576,359.094,1.0,250.0,500.0,750.0,1000.0
PercSlope,82458.0,42.761,34.753,1.0,10.0,40.0,80.0,100.0
N-Imperv,82458.0,0.4,0.32,0.01,0.015,0.4,0.8,0.8
N-Perv,82458.0,0.402,0.319,0.015,0.015,0.4,0.8,1.0
S-Imperv,82458.0,0.182,0.103,0.05,0.05,0.2,0.3,0.3
S-Perv,82458.0,0.183,0.102,0.05,0.05,0.2,0.3,0.3
PctZero,82458.0,50.015,35.046,1.0,25.0,50.0,75.0,100.0


## Split data and target

In [19]:
y_train = train_dataset['Runoff']
y_test = test_dataset['Runoff']
X_train = train_dataset.drop('Runoff', axis=1)
X_test = test_dataset.drop('Runoff', axis=1)


## Standaryzacja danych metodą Z-score.

In [20]:
# Skalowanie danych
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Budowa modelu regresji ElasticNet

In [21]:
# from sklearn.model_selection import GridSearchCV

# # Definiowanie zakresu parametrów do sprawdzenia
# param_grid = {'alpha': [0.0001, 0.01, 0.1, 1, 10, 100], 'l1_ratio': [0.001, 0.1, 0.2, 0.4, 0.6, 0.8, 1]}

# # Inicjalizacja modelu i GridSearchCV
# elastic_net = ElasticNet(random_state=0)
# grid_search = GridSearchCV(elastic_net, param_grid, cv=5)

# # Dopasowanie GridSearchCV
# grid_search.fit(X_train_scaled, y_train)

# # Znalezienie najlepszych parametrów
# best_params = grid_search.best_params_
# print("Best parameters: ", best_params)

model = ElasticNet(random_state=0, alpha=0.01, l1_ratio=0.2)
model.fit(X_train_scaled, y_train)
# # predictions = model.predict(X_test)
# # predictions





# # Użycie najlepszego modelu
# best_model = grid_search.best_estimator_
# test_predictions = best_model.predict(X_test_scaled).flatten()

### test_predictions zawiera przewidywane wartości na podstawie zbioru testowego w postaci jednowymiarowej tablicy.

In [22]:
test_predictions = model.predict(X_test_scaled).flatten()
test_predictions

array([2.1272962, 2.1335216, 2.148409 , ..., 5.4537425, 5.458772 , 5.4649754], dtype=float32)

### Tabela predykcji 

Wynikowy dataframe pred przedstawia dwie kolumny:

* Pierwsza kolumna to wartości rzeczywiste (test_labels) z zestawu danych testowych. Zawiera one prawdziwe wartości etykiet, które model próbuje przewidzieć.
* Druga kolumna o nazwie 'predictions' zawiera wartości przewidziane przez model na podstawie zbioru testowego (test_predictions). Są to wartości, które model wygenerował po przetworzeniu zbioru testowego.

Dataframe pred służy do porównania wartości rzeczywistych z przewidywaniami modelu, co pozwala ocenić, jak dobrze model radzi sobie z przewidywaniem na nowych, nieznanych danych. Wizualizując lub analizując te dane, można ocenić skuteczność i dokładność modelu.

In [23]:
ann_pred = pd.DataFrame(y_test)
ann_pred['predictions'] = test_predictions

### Wykres danych przewidywanych

In [24]:
fig = px.scatter(ann_pred, 'Runoff', 'predictions')
fig.add_trace(go.Scatter(x=[0, 5], y=[0, 5], mode='lines', name='Regression Line'))
fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers', name='Actual vs Predicted', marker=dict(color='blue')))
fig.update_layout(width=800, height=500)
fig.update_layout(
    title={'text': "Model Fit Plot", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top', 'font': {'size': 18}},
    xaxis_title="Runoff",
    yaxis_title="Predictions",
    xaxis=dict(title=dict(font=dict(size=18))),
    yaxis=dict(title=dict(font=dict(size=18))),
    font=dict(size=18)
)
fig.update_layout(
    legend=dict(
        x=0.7,
        y=-0.1,
        traceorder='normal',
        orientation='h'
    )
)
fig.show()

In [25]:
ann_pred['error'] = ann_pred['Runoff'] - ann_pred['predictions']
pred.head()

NameError: name 'pred' is not defined

### Histogram rozkładu błędu.
Histogram przedstawia rozkład wartości błędów prognozowania, które zostały obliczone jako różnica między rzeczywistymi wartościami spływu powierzchniowego (Runoff) a wartościami przewidzianymi przez model (predictions).

In [None]:
fig = px.histogram(pred, 'error', marginal='rug')
fig.update_layout(
    title={'text': "Histogram of the error distribution", 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top', 'font': {'size': 18}},
    xaxis_title="Error",
    yaxis_title="Count of samples",
    xaxis=dict(title=dict(font=dict(size=18))),
    yaxis=dict(title=dict(font=dict(size=18))),
    font=dict(size=18)
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Ewaluacja na zestawie testowym
en_pred_test = model.predict(X_test_scaled)

# Obliczenie metryk
en_mse_test = mean_squared_error(y_test, en_pred_test)
en_r2_test = r2_score(y_test, en_pred_test)

# Analiza błędów
# errors_diff = np.abs(en_pred_test - ann_pred_test)
# cases_of_high_diff = np.where(errors_diff > threshold)  # Zdefiniuj własny próg

# Czas wnioskowania
import time
start_time = time.time()
model.predict(X_test)
en_inference_time = time.time() - start_time

print(f"ElasticNet Test R2: {en_r2_test}.")
print(f"ElasticNet Test MSE: {en_mse_test}.")
print(f"ElasticNet Inference Time: {en_inference_time}.")

ElasticNet Test R2: 0.5500946128180213.
ElasticNet Test MSE: 1.034948806107919.
ElasticNet Inference Time: 0.001988649368286133.



X has feature names, but ElasticNet was fitted without feature names

