In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Root mean squared error (RMSE)
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of y_pred against y_true."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
# Root mean squared logarithmic error (RMSLE)
def rmsle(y_true: np.ndarray, y_pred: np.ndarray):
    """Return the RMSE computed on log1p-transformed targets and predictions.

    Both inputs go through ``np.log1p`` (log(1 + x)) before the mean squared
    error is taken, so the score depends on the ratio between prediction and
    target rather than on their absolute difference.
    """
    log_true = np.log1p(y_true)
    log_pred = np.log1p(y_pred)
    return np.sqrt(mean_squared_error(log_true, log_pred))


In [None]:
# Sample data: one ground truth, with an under-estimate and an over-estimate
# that are both off by the same absolute amount (400)
y_true = np.array([1000, 1000])       # ground truth
y_pred_low = np.array([600, 600])     # predictions below the ground truth
y_pred_high = np.array([1400, 1400])  # predictions above the ground truth

# RMSE: over- and under-estimates with equal absolute error score the same
print('RMSE', rmse(y_true, y_pred_high), rmse(y_true, y_pred_low), sep='\n')

print('--------------------')

# RMSLE: the under-estimate receives the larger penalty
print('RMSLE', rmsle(y_true, y_pred_high), rmsle(y_true, y_pred_low), sep='\n')


In [None]:
# Same absolute error (500) at two different scales: RMSLE reacts to the
# relative error, so the second case scores far lower than the first.
for y_true, y_pred in (
    (np.array([1000, 1000]), np.array([1500, 1500])),
    (np.array([100000, 100000]), np.array([100500, 100500])),
):
    print(f'RMSLE: {rmsle(y_true, y_pred)}')


In [None]:
import numpy as np
import math

def logloss(true_label, predicted, eps = 1e-15):
    """Return the log loss of a single binary prediction.

    Args:
        true_label: Ground-truth label; 1 selects the positive branch, any
            other value is treated as the negative class.
        predicted: Predicted probability of the positive class (a scalar).
        eps: Clipping margin; the probability is limited to [eps, 1 - eps]
            so the logarithm never receives 0.

    Returns:
        ``-log(p)`` when ``true_label == 1``, otherwise ``-log(1 - p)``,
        where ``p`` is the clipped probability.
    """
    lower, upper = eps, 1 - eps
    clipped = np.clip(predicted, lower, upper)
    # Probability the model assigned to the class that actually occurred
    likelihood = clipped if true_label == 1 else 1 - clipped
    return -math.log(likelihood)


In [None]:
# True label 1 with predicted probability 0.9
logloss(1,0.9)

In [None]:
# True label 1 with predicted probability 0.5
logloss(1,0.5)

In [None]:
# True label 0 with predicted probability 0.2 for the positive class
logloss(0,0.2)

In [None]:
import numpy as np
from sklearn.metrics import log_loss

# True class index of each of the three samples
y_true = np.array([0, 1, 2])
# Predicted probabilities: one row per sample, columns are classes 0, 1, 2
y_pred = np.array([[0.55, 0.45, 0.00],
                   [0.85, 0.00, 0.15],
                   [0.25, 0.75, 0.00]])

# Samples 2 and 3 assign probability 0.00 to their true class, so they dominate the loss
log_loss(y_true, y_pred)


In [None]:
import numpy as np
from sklearn.metrics import f1_score

# True label sets, one inner list per sample (labels come from classes 1-3).
# Kept as a plain nested list: the rows have different lengths, and
# np.array() raises ValueError on such ragged input in NumPy >= 1.24.
y_true = [[1, 2], [1], [1, 2, 3]]
# Multi-hot encoding of the true labels: row = sample, column = class 1..3
y_true = np.array([[1, 1, 0],
                   [1, 0, 0],
                   [1, 1, 1]])

# Predicted label sets, one inner list per sample (also ragged, so a plain list)
y_pred = [[1, 3], [2], [1, 3]]
# Multi-hot encoding of the predicted labels, same layout as y_true
y_pred = np.array([[1, 0, 1],
                   [0, 1, 0],
                   [1, 0, 1]])

# 'samples': F1 per sample, then averaged; 'macro': F1 per class, then averaged;
# 'micro': F1 from the counts pooled over all samples and classes
print('Mean-F1 :', f1_score(y_true, y_pred, average='samples'))
print('Macro-F1:', f1_score(y_true, y_pred, average='macro'))
print('Micro-F1:', f1_score(y_true, y_pred, average='micro'))


In [None]:
from sklearn.metrics import cohen_kappa_score

# Integer labels for six samples: ground truth and prediction
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
# weights='quadratic' requests the quadratically weighted variant of Cohen's kappa
cohen_kappa_score(y_true, y_pred, weights='quadratic')


In [None]:
import pandas as pd
import pandas_profiling

# Load the Titanic training set; the path is relative to the notebook's working directory
train = pd.read_csv('../input/titanic/train.csv')
# profile_report() is not a built-in pandas method; presumably it is registered on
# DataFrame by the `import pandas_profiling` in this cell, so that import must stay.
# NOTE(review): pandas_profiling appears to have been renamed ydata-profiling
# upstream — confirm which package the target environment provides.
train.profile_report()


In [None]:
# Import the MNIST dataset loader
from tensorflow.keras.datasets import mnist
# Load MNIST into NumPy arrays (training and test images with their labels)
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Print row 6 (index 5) of the first training image
print(x_train[0][5])


In [None]:
# Divide every pixel by 255 and show row 6 (index 5) of the first image
# (assumes pixel values are in 0-255, which maps them to [0, 1] — TODO confirm)
(x_train/255.0)[0][5]

In [None]:
import numpy as np
xmean = x_train.mean()       # mean over every pixel of every training image
xstd  = np.std(x_train)      # standard deviation over the same values
# Standardize the training data (subtract the mean, divide by the standard
# deviation) and show row 6 (index 5) of the first image
((x_train-xmean)/xstd)[0][5]


In [None]:
import numpy as np
# Sample values spanning several orders of magnitude
x = [1.0, 10.0, 100.0, 1000.0, 10000.0]
# log1p computes log(1 + x) element-wise, compressing the range
np.log1p(x)


In [None]:
from sklearn.preprocessing import LabelEncoder

data = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'A1', 'A2', 'A3']
le = LabelEncoder()  # create the LabelEncoder
le.fit(data)         # fit the encoder on the values in data
print(le.classes_)   # show the classes the encoder learned


In [None]:
# Encode every value in data with the fitted LabelEncoder
print(le.transform(data))

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Ten integer category values, 0 through 9
df = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# Use the encoder's default sparse output and densify explicitly: the
# `sparse=False` keyword was removed in scikit-learn 1.4, and its replacement
# `sparse_output` does not exist before 1.2, so .toarray() is the portable form.
ohe = OneHotEncoder()
# The encoder expects 2-D input, hence the reshape to a single column
print(ohe.fit_transform(df.reshape(-1, 1)).toarray())


In [None]:
# String categories; repeated values map to the same one-hot column
data = np.array(['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'A1', 'A2', 'A3'])
# Default (sparse) output densified with .toarray(): `sparse=False` was removed
# in scikit-learn 1.4, so this form runs on both old and new versions.
ohe = OneHotEncoder()
# The encoder expects 2-D input, hence the reshape to a single column
print(ohe.fit_transform(data.reshape(-1, 1)).toarray())


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Four short documents used as the toy corpus
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer()
# Run bag-of-words on the corpus and get the transformed matrix
X = vectorizer.fit_transform(corpus)
# The return value is a scipy.sparse matrix,
# so convert it to a dense NumPy array for display
X.toarray()


In [None]:
# Show the vocabulary learned by the fitted vectorizer
vectorizer.vocabulary_

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Four short documents used as the toy corpus
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer(
    analyzer='word',    # build the n-grams from words
    ngram_range=(2, 2)) # use 2-grams only
# Get the transformed matrix
X = vectorizer.fit_transform(corpus)
# The return value is a scipy.sparse matrix,
# so convert it to a dense NumPy array for display
X.toarray()


In [None]:
# Show the vocabulary learned by the fitted vectorizer
vectorizer.vocabulary_

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Four short documents used as the toy corpus
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# Get the transformed matrices: term counts first, then TF-IDF weights computed from them
tf = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(tf)
# The return value is a scipy.sparse matrix,
# so convert it to a dense NumPy array for display
tfidf.toarray()


In [None]:
from gensim.models import word2vec
# Four short documents used as the toy corpus
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]
# Split each document on whitespace into a token list
# (punctuation stays attached and case is preserved, e.g. 'document.' and 'Is')
sentence = [d.split() for d in corpus]
# Train the model
# NOTE(review): no seed or worker count is passed here; if the vectors must be
# reproducible across runs, confirm which Word2Vec settings that requires.
model = word2vec.Word2Vec(sentence,
                          vector_size=10,     # dimensionality of the word vectors
                          min_count=1,        # drop words that occur fewer than n times
                          window=2)           # number of surrounding words used as context


In [None]:
# Look up the learned vector for the token 'This' (lookup is case-sensitive)
model.wv['This']

In [None]:
# Look up the learned vector for the token 'is'
model.wv['is']

In [None]:
# List the tokens whose vectors are most similar to the one for 'document'
model.wv.most_similar('document')