# ML Pipeline 
按照如下的指导要求，搭建你的机器学习管道。
### 1. 导入与加载
- 导入 Python 库
- 使用 [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html) 从数据库中加载数据集
- 定义特征变量X 和目标变量 Y

In [1]:
import numpy as np
import pandas as pd
import pickle
import re

from matplotlib import pyplot as plt

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion

from sqlalchemy import create_engine

from xgboost import XGBClassifier

In [2]:
# load data from database
engine = create_engine('sqlite:///data/disaster_sn_msg.db')
data = pd.read_sql_table("data", engine)
data.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
X = data['message']
Y = data.iloc[:, 4:]

### 2. 编写分词函数，开始处理文本

In [4]:
def tokenize(text):
    # Normalize text
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    stop_words = stopwords.words("english")

    #tokenize
    words = word_tokenize (text)

    #stemming
    stemmed = [PorterStemmer().stem(w) for w in words]

    #lemmatizing
    words_lemmed = [WordNetLemmatizer().lemmatize(w) for w in stemmed if w not in stop_words]

    return words_lemmed

### 3. 创建机器学习管道 
这个机器学习管道应该接收 `message` 列作输入，输出分类结果，分类结果属于该数据集中的 36 个类。你会发现 [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) 在预测多目标变量时很有用。

In [5]:
def build_ppl():
    ppl = pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier (AdaBoostClassifier()))
        ])

    return ppl

In [6]:
pipeline = build_ppl()

### 4. 训练管道
- 将数据分割成训练和测试集
- 训练管道

In [7]:
X_train, X_test, Y_train, y_test = train_test_split(X, Y, test_size=0.33)

In [8]:
pipeline.fit(X_train,Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at 0x7fe5e9276710>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=AdaBoostClassifier(algorithm='SAMME.R'

### 5. 测试模型
报告数据集中每个输出类别的 f1 得分、准确度和召回率。你可以对列进行遍历，并对每个元素调用 sklearn 的 `classification_report`。

In [39]:
def perf_eval(y_test_df, y_pred_np):
    """
    This is a customized evaluation function that measures both the label recall
    for each sample and the precision of each label across all samples.
    :param y_test_df: ground truth
    :param y_pred_np: predictions
    :return: sample_label_recall: the label recall for each sample
             label_precision: the precision of each label across all samples
             f1_score: the f1 score calculated from the mean of the above two
    """
    y_test_np = np.array(y_test_df)
    sample_label_recall = []
    label_precision = []

    for i in range(y_test_df.shape[0]):
        if y_test_np[i].sum() != 0:
            sample_label_recall.append(np.bitwise_and(
                    y_pred_np[i], y_test_np[i]).sum() / y_test_np[i].sum())
        elif y_test_np[i].sum() == 0:
            sample_label_recall.append(1)

    for j in range(y_test_np.shape[1]):
        label_precision.append(np.invert(
                np.logical_xor(y_pred_np[:, j],
                               y_test_np[:, j])).sum() / y_test_np.shape[0])

    slr_mean = np.array(sample_label_recall).mean()
    lp_mean = np.array(label_precision).mean()
    f1_score = 2 * slr_mean * lp_mean / (slr_mean + lp_mean)

    label_precision_df = pd.DataFrame(label_precision).transpose()
    label_precision_df.columns = y_test_df.columns

    report = pd.DataFrame()
    y_pred_df = pd.DataFrame(y_pred_np)
    y_pred_df.columns = y_test_df.columns

    for col in y_test_df.columns:
        class_dict = classification_report(output_dict=True,
                                           y_true=y_test_df.loc[:, col],
                                           y_pred=y_pred_df.loc[:, col])

        eval_df = pd.DataFrame(pd.DataFrame.from_dict(class_dict))

        # dropping unnecessary information
        eval_df.drop(['macro avg', 'weighted avg'], axis=1,
                     inplace=True)
        eval_df.drop(index='support', inplace=True)

        av_eval_df = pd.DataFrame(eval_df.transpose().mean()).transpose()

        report = report.append(av_eval_df, ignore_index=True)

    report.index = y_test_df.columns

    return slr_mean, lp_mean, label_precision_df, f1_score, report

In [34]:
y_pred = pipeline.predict(X_test)

In [41]:
def print_metrics(y_test, y_pred):
    slr_mean, lp_mean, label_precision_df, f1_score, report = \
            perf_eval(y_test, y_pred)

    print(f'Custom F1-Score is {f1_score}.')
    print(f'The mean of each label precision  is {lp_mean}.')
    print(f'The mean of label recall for each sample is {slr_mean}.')

    print('=======================================')
    print('Printing custom each label precision...')
    print(label_precision_df)

    print('=======================================')
    print('Printing classification report...')
    print(report)

In [42]:
print_metrics(y_test, y_pred)

Custom F1-Score is 0.8432344244940265.
The mean of custom precision above is 0.9472729490933376.
The mean of label recall for each sample is 0.7597873513753752.
Printing custom each label precision...
    related   request     offer  aid_related  medical_help  medical_products  \
0  0.777393  0.891123  0.994799     0.765025      0.927184          0.958622   

   search_and_rescue  security  military  child_alone  ...  aid_centers  \
0           0.975035  0.981392  0.971567          1.0  ...     0.986246   

   other_infrastructure  weather_related    floods     storm      fire  \
0              0.951341         0.876791  0.958969  0.938049  0.988673   

   earthquake      cold  other_weather  direct_report  
0    0.969371  0.983934       0.946602       0.851017  

[1 rows x 36 columns]
Printing classification report...
                        precision    recall  f1-score
related                  0.628270  0.499552  0.503328
request                  0.854891  0.797650  0.819288
offer  

### 6. 优化模型
使用网格搜索来找到最优的参数组合。 

In [43]:
def gs_eval(y_test, y_pred):
    ytest = np.array(y_test)
    sample_label_recall = []
    label_precision = []

    for i in range(y_test.shape[0]):
        if ytest[i].sum() != 0:
            sample_label_recall.append(np.bitwise_and(
                    y_pred[i], ytest[i]).sum() / ytest[i].sum())
        elif ytest[i].sum() == 0:
            sample_label_recall.append(1)

    for j in range(y_test.shape[1]):
        label_precision.append(np.invert(
                np.logical_xor(y_pred[:, j],
                               ytest[:, j])).sum() \
                               / ytest.shape[0])

    slr_mean = np.array(sample_label_recall).mean()
    lp_mean = np.array(label_precision).mean()
    f1_score = 2 * slr_mean * lp_mean / (slr_mean + lp_mean)

    return f1_score

In [44]:
scorer = make_scorer(gs_eval)

In [45]:
parameters = {'vect__max_df': (0.3, 0.4, 0.5),
              'clf__estimator__n_estimators': range(50, 54, 1),
              'clf__estimator__learning_rate': np.arange(1.2, 2.0, 0.2)
              }

In [1]:
gs = GridSearchCV(pipeline,
                  param_grid=parameters,
                  scoring=scorer,
                  n_jobs=16,
                  verbose=7
)

NameError: name 'GridSearchCV' is not defined

In [None]:
gs.fit(X_train, Y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.


In [16]:
gs.best_params_

{'clf__estimator__learning_rate': 1.2,
 'clf__estimator__n_estimators': 51,
 'vect__max_df': 0.4}

### 7. 测试模型
打印微调后的模型的精确度、准确率和召回率。  

因为本项目主要关注代码质量、开发流程和管道技术，所有没有模型性能指标的最低要求。但是，微调模型提高精确度、准确率和召回率可以让你的项目脱颖而出——特别是让你的简历更出彩。

In [None]:
y_pred_gs = gs.predict(X_test)

print_metrics(y_test, y_pred_gs)

### 8. 继续优化模型，比如：
* 尝试其他的机器学习算法
* 尝试除 TF-IDF 外其他的特征

Add start verb feature

In [None]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    @staticmethod
    def starting_verb(text):
        sentence_list = sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = pos_tag(tokenize(sentence))
            try:
                first_word, first_tag = pos_tags[0]
                if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                    return True
            except IndexError:
                pass
        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

Add XGBoost as classifier.

In [None]:
def build_ppl(clf_type):
    """
    Grid Search Results:
        AdaBoost:
            {'clf__estimator__learning_rate': 1.2,
             'clf__estimator__n_estimators': 51,
             'vect__max_df': 0.4}


        XGBoost:
            {'clf__estimator__colsample_bytree': 1.0,
             'clf__estimator__gamma': 5.0,
             'clf__estimator__learning_rate': 0.5,
             'clf__estimator__min_child_weight': 1,
             'clf__estimator__subsample': 1.0,
             'vect__max_df': 0.75}
    """

    if clf_type == "Ada":

        ada_ppl = Pipeline([
                ('feats', FeatureUnion([
                        ('text_ppl', Pipeline([
                                ('vect', CountVectorizer(
                                        tokenizer=tokenize,
                                        max_df=0.4)),
                                ('tfidf', TfidfTransformer())
                        ])),
                        ('start_verb', StartingVerbExtractor())])),
                ('clf', MultiOutputClassifier(
                        AdaBoostClassifier(
                                n_estimators=51,
                                learning_rate=1.2
                        ))
                 )])

        return ada_ppl

    elif clf_type == "XG":

        # XGBoost models are much more computationally expensive than AdaBoost
        xgb_ppl = Pipeline([
                ('feats', FeatureUnion([
                        ('text_ppl', Pipeline([
                                ('vect', CountVectorizer(
                                        tokenizer=tokenize,
                                        max_df=0.75)),
                                ('tfidf', TfidfTransformer())
                        ])),
                        ('start_verb', StartingVerbExtractor())])),
                ('clf', MultiOutputClassifier(
                        XGBClassifier(
                                colsample_bytree=1.0,
                                gamma=5.0,
                                learning_rate=0.5,
                                min_child_weight=1,
                                subsample=1.0
                        ))
                 )])

        return xgb_ppl

Optimize updated AdaBoost pipeline.

In [None]:
ada_ppl = build_ppl('Ada')

In [None]:
ada_params = {'feats__text_ppl__vect__max_df': (0.3, 0.4, 0.5),
              'clf__estimator__n_estimators': range(50, 54, 1),
              'clf__estimator__learning_rate': np.arange(1.2, 2.0, 0.2)
              }

In [None]:
ada_gs = GridSearchCV(
        ada_ppl,
        param_grid=ada_params,
        scoring=scorer,
        n_jobs=12,
        verbose=7
)

In [None]:
ada_gs.best_params_

Optimize updated XGBoost pipeline.

In [None]:
xgb_ppl = build_ppl('XG')

In [None]:
xgb_params = {'feats__text_ppl__vect__max_df': (0.3, 0.4, 0.5),
              'clf__estimator__max_depth': range(5, 10, 2),
              'clf__estimator__learning_rate': [0.5, 0.75, 1.0],
              'clf__estimator__min_child_weight':np.arange(1, 4, 1),
              'clf__estimator__gamma':np.arange(5, 7.5, 0.5)
              }

In [None]:
xgb_gs = GridSearchCV(
        xgb_ppl,
        param_grid=xgb_params,
        scoring=scorer,
        n_jobs=12,
        verbose=7
)

In [None]:
xgb_gs.fit(X_train, Y_train)

In [None]:
xgb_gs.best_params_

### 9. 导出模型为 pickle file

In [None]:
def model_save(model, path):
    pickle.dump(model, open(path, 'wb'))
    pass

In [None]:
model_save(pipeline, "model.pkl")

### 10. Use this notebook to complete `train.py`
使用资源 (Resources)文件里附带的模板文件编写脚本，运行上述步骤，创建一个数据库，并基于用户指定的新数据集输出一个模型。

