* Strona biblioteki: https://scikit-learn.org
* Dokumentacja/User Guide: https://scikit-learn.org/stable/user_guide.html¶

In [1]:
import numpy as np
import pandas as pd
import sklearn

sklearn.__version__

'1.7.1'

In [2]:
def fetch_financial_data(company='PX'):
    """
    This function fetches stock market quotations.
    https://stooq.pl/
    """
    import pandas_datareader.data as web
    return web.DataReader(name=company, data_source='stooq')

df_raw = fetch_financial_data()
df_raw.head() # wyświetlenie tylko pięciu pierwszych wierszy

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2026-01-02,9.91,9.91,9.76,9.89,311630.0
2025-12-31,9.8,9.84,9.705,9.81,343844.0
2025-12-30,10.05,10.1,9.85,9.88,300699.0
2025-12-29,10.08,10.15,10.05,10.09,236972.0
2025-12-26,10.045,10.08,9.97,10.08,206949.0


In [3]:
df = df_raw
df = df[:5]
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2026-01-02 to 2025-12-26
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      float64
dtypes: float64(5)
memory usage: 240.0 bytes


In [4]:
df.index

DatetimeIndex(['2026-01-02', '2025-12-31', '2025-12-30', '2025-12-29',
               '2025-12-26'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [5]:
df.index.day

Index([2, 31, 30, 29, 26], dtype='int32', name='Date')

In [6]:
df.index.month

Index([1, 12, 12, 12, 12], dtype='int32', name='Date')

In [7]:
df.index.year

Index([2026, 2025, 2025, 2025, 2025], dtype='int32', name='Date')

In [8]:
df['day'] = df.index.day
df['month'] = df.index.month
df['year'] = df.index.year
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day'] = df.index.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df.index.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df.index.year


Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2026-01-02,9.91,9.91,9.76,9.89,311630.0,2,1,2026
2025-12-31,9.8,9.84,9.705,9.81,343844.0,31,12,2025
2025-12-30,10.05,10.1,9.85,9.88,300699.0,30,12,2025
2025-12-29,10.08,10.15,10.05,10.09,236972.0,29,12,2025
2025-12-26,10.045,10.08,9.97,10.08,206949.0,26,12,2025


### Dyskretyzacja zmiennej ciągłej (zmiana na zmienną kategoryczną)

In [10]:
df = pd.DataFrame(data={'height': [175., 178.5, 185., 191., 184.5, 183., 168.]})
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [13]:
df['height_cat'] = pd.cut(x=df.height, bins=3) # podziel na 3 kategorie(bins=3), z kolumny height
df

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,178.5,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [14]:
df['height_cat'] = pd.cut(x=df.height, bins=(160, 175, 180, 195)) # podziel na 3 konkretne przedziały (160-175, 175-180, 180-195)
df

Unnamed: 0,height,height_cat
0,175.0,"(160, 175]"
1,178.5,"(175, 180]"
2,185.0,"(180, 195]"
3,191.0,"(180, 195]"
4,184.5,"(180, 195]"
5,183.0,"(180, 195]"
6,168.0,"(160, 175]"


In [15]:
df['height_cat'] = pd.cut(x=df.height, bins=(160, 175, 180, 195), labels=['small', 'medium', 'hight']) # jw. tylko z etykietami
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.5,medium
2,185.0,hight
3,191.0,hight
4,184.5,hight
5,183.0,hight
6,168.0,small


In [17]:
pd.get_dummies(df, drop_first=True, prefix='height', dtype=int)

Unnamed: 0,height,height_medium,height_hight
0,175.0,0,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


### Ekstrakcja cech

In [18]:
df = pd.DataFrame(data={'lang': [['PL', 'ENG'], ['GER', 'ENG', 'PL', 'FRA'], ['RUS']]})
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]
