# ML Pipeline Preparation

### 1. Import libraries and load data from database.
- Import Python libraries


In [3]:
import sys,pickle
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.corpus import stopwords
from sqlalchemy import create_engine
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Donia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Donia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Donia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Donia\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### 2. load Data 

In [4]:
def load_data(database_filepath):
    engine = create_engine(f'sqlite:///'+database_filepath)
    df = pd.read_sql_table("last", engine)
    X = df['message']
    y = df.drop(['message','original',"id",'genre'],axis =1)
    category_names = y.columns
    return X,y,category_names

### 2. Write a tokenization function to process your text data

In [5]:
def tokenize(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lem = []
    # stop_words = set(stopwords.words('english'))
    # words = [w for w in words if w not in stop_words]
    for w in words:
        word = lemmatizer.lemmatize(w).lower().strip() 
        lem.append(word)
    return lem


### 3. Build a machine learning pipeline


In [6]:

def build_model():
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer()),
                   ('clf', MultiOutputClassifier(KNeighborsClassifier()))])
    
    parameters = {'clf__estimator__n_neighbors':[50,100]
             }
    cv =  GridSearchCV(estimator = pipeline,param_grid = parameters)
    return cv

def evaluate_model(model, X_test, Y_test, category_names):
    y_pred = model.predict(X_test)
    # print(classification_report(y_pred, Y_test.values, target_names=category_names))
    print('Accuracy Score: {}'.format(np.mean(Y_test.values == y_pred)))

def save_model(model, model_filepath):
    pickle.dump(model, open(model_filepath, 'wb'))


### 4. Train pipeline


In [7]:
X, Y, category_names = load_data('../data/dis.db')

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


In [9]:
print('Building model...')
model = build_model()  
print('Training model...')
model.fit(X_train, Y_train)

Building model...
Training model...


GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x000002495A7E5670>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=KNeighborsClassifier()))]),
             param_grid={'clf__estimator__n_neighbors': [50, 100]})

### 5. Check accuracy

In [10]:
# model = pickle.read("../models/mod.pkl")
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)


Evaluating model...
Accuracy Score: 0.9382574794474108


### 6. Save model as pickle file

In [13]:
# print('Saving model...\n  =================  MODEL: {}'.format(model_filepath))
save_model(model,'../models/mod.pkl')