# Podstawowe narzędzia pracy w Data Science

## Pakiety do obliczeń numerycznych

### Operacje macierzowe

In [2]:
import numpy as np

In [3]:
# A is 3x4: three identical rows [1, 2, 3, 4].
A = np.array([[1, 2, 3, 4]] * 3)
# B is 4x3: row i is filled with the value i + 1 (B is the transpose of A).
B = np.array([[value] * 3 for value in (1, 2, 3, 4)])
# C is a length-3 vector.
C = np.array([1, 2, 3])

In [4]:
# Dot product of the first row of A with the first column of B,
# i.e. entry [0, 0] of the matrix product A @ B.
np.dot(A[0], B[:, 0])

30

In [None]:
# Matrix product: (3x4) @ (4x3) gives a 3x3 result.
A @ B

In [None]:
# C has 3 elements, so it is broadcast over the 3x3 product and added to every row.
A @ B + C

In [None]:
# 6x5 array of uniform samples from [0, 1). A seeded Generator is used so that
# Restart & Run All reproduces exactly the same numbers.
RANDOM_ARRAY = np.random.default_rng(42).random((6, 5))

In [None]:
# Show the generated array.
RANDOM_ARRAY

In [None]:
# Per-row (axis=1) mean, variance and standard deviation, one line of output each.
print(
    np.mean(RANDOM_ARRAY, axis=1),
    np.var(RANDOM_ARRAY, axis=1),
    np.std(RANDOM_ARRAY, axis=1),
    sep="\n",
)

### Obliczenia naukowe

In [None]:
import matplotlib.pyplot as plt

# Wide default canvas for the figures that follow.
plt.rcParams["figure.figsize"] = (21, 7)

# Sample sin(x) on [0, 2.5*pi) with a step of 0.01.
xs = np.arange(0.0, 2.5 * np.pi, 0.01)
ys = np.sin(xs)

plt.plot(xs, ys)
plt.show()

In [None]:
import scipy.integrate

In [None]:
# Integrate the sampled sine over its grid with the trapezoidal rule.
# `trapezoid` replaces the deprecated `trapz` alias, which recent SciPy releases
# no longer provide. `dx` is dropped because it is ignored when the sample
# points `x` are passed explicitly.
scipy.integrate.trapezoid(ys, x=xs)

In [None]:
# Number of samples.
N = 600
# Sample spacing (800 samples per unit of xs).
T = 1.0 / 800.0
xs = np.linspace(0.0, N*T, N, endpoint=False)
# Sum of sinusoids at frequencies 1, 0.5, 3.5, 42, 64 and 223.
# Every term has the form amplitude * fn(frequency * 2*pi * xs); the 0.5 term
# previously used `np.pi**xs` (a power) instead of `np.pi*xs`.
signal = (
    np.sin(2.0*np.pi*xs)
    - 3*np.cos(0.5*2.0*np.pi*xs)
    + 2.2*np.sin(3.5*2.0*np.pi*xs)
    + 1.7*np.cos(42*2.0*np.pi*xs)
    - np.sin(64*2.0*np.pi*xs)
    + 3*np.sin(223*2.0*np.pi*xs)
)

In [None]:
# Plot the synthetic signal against its sample points.
plt.plot(xs, signal)
plt.show()

In [None]:
from scipy.fft import fft, fftfreq

In [None]:
# Discrete Fourier transform of the sampled signal.
yf = fft(signal)
# Frequency bins for N samples spaced T apart; keep only the first N//2 bins
# (the non-negative frequencies).
xf = fftfreq(N, d=T)[: N // 2]

In [None]:
# One-sided amplitude spectrum: magnitudes of the first N//2 FFT coefficients,
# scaled by 2/N.
plt.plot(xf, np.abs(yf[: N // 2]) * (2.0 / N))
plt.grid()
plt.show()

## Reprezentowanie i przetwarzanie danych

Dane pochodzą z https://datacatalog.worldbank.org/dataset/ibrd-statement-loans-historical-data

In [None]:
import pandas as pd

In [None]:
# Download the World Bank loans CSV to financial_data.csv.
# The URL is quoted so the shell does not treat the `?` in the query string as a glob character.
!wget -O financial_data.csv "https://finances.worldbank.org/api/views/sfv5-tf7p/rows.csv?accessType=DOWNLOAD"

In [None]:
# List the working directory (presumably to confirm that financial_data.csv was written).
!ls

In [None]:
# Load the downloaded CSV into a DataFrame.
DATA = pd.read_csv("./financial_data.csv")

In [None]:
# Preview the loaded table.
DATA

In [None]:
# Summary statistics of the table's columns (describe() with default settings).
DATA.describe()

In [None]:
# Rows whose "Country" column equals "Denmark".
DATA.loc[DATA["Country"] == "Denmark"]

In [None]:
# Mean, median, minimum and maximum of the "Interest Rate" column.
DATA["Interest Rate"].agg(["mean", "median", "min", "max"])

In [None]:
# Distinct values found in the "Region" column.
DATA["Region"].unique()

In [None]:
# Average "Interest Rate" within each "Region".
DATA.groupby("Region")["Interest Rate"].mean()

## Wizualizacja danych

In [None]:
# Mean interest rate per region (a Series indexed by region name).
interest_rate_by_region = DATA.groupby("Region")["Interest Rate"].mean()

# One bar per region at integer positions 0..n-1, labelled with the region names.
plt.bar(np.arange(len(interest_rate_by_region)), interest_rate_by_region)
plt.xticks(
    np.arange(len(interest_rate_by_region)),
    labels=interest_rate_by_region.index,
    rotation=70,
)
plt.xlabel("Region")
plt.ylabel("Interest rate, %")
plt.title("Interest rate by region")
plt.show()

In [None]:
from sklearn.datasets import load_digits

# NOTE(review): load_digits() returns scikit-learn's small bundled digits
# dataset (8x8 images), not the original MNIST; the name MNIST is kept because
# later cells refer to it.
MNIST = load_digits()

In [None]:
# Feature matrix (one flattened image per row) and the matching digit labels.
X = MNIST["data"]
Y = MNIST["target"]

In [None]:
# List the fields available in the loaded dataset object.
MNIST.keys()

In [None]:
# Small square canvas for a single digit image.
plt.rcParams["figure.figsize"] = (3, 3)
# Sample 120 is stored as a flat vector; reshape it to 8x8 to display it as an image.
plt.imshow(X[120].reshape(8, 8))
plt.show()
# Print the label of the same sample for comparison with the picture.
print(Y[120])

In [None]:
# NOTE(review): PCA is imported but not used in the visible cells.
from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

# Project the digit feature vectors onto 2 dimensions with t-SNE.
# random_state is fixed so that re-running the notebook yields the same embedding.
mnist_embedded = TSNE(
    n_components=2,
    verbose=3,
    perplexity=100,
    learning_rate=1000,
    random_state=42,
).fit_transform(X)

In [None]:
plt.rcParams["figure.figsize"] = (8, 8)
# One scatter series per digit class, so each class gets its own colour and legend entry.
for digit in np.unique(Y):
  # A boolean mask selects the embedded points whose label equals this digit.
  digit_embedding = mnist_embedded[Y == digit]
  plt.scatter(x=digit_embedding[:, 0], y=digit_embedding[:, 1], label=str(digit))
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.title("t-SNE embedding of the digits dataset")
plt.legend(title="Digit")
plt.show()