# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [297]:
# import libraries
import numpy as np
import pandas as pd
import sklearn
import nltk
from sqlalchemy import create_engine

In [298]:
from sqlalchemy import create_engine

# load data from database
engine =  create_engine('sqlite:///DisasterTweets.db')

In [299]:
def load_data(table_name, conn_engine):
    """
    Args:
       table_name("String"):
       conn_engine"
    """
    # Load data from a specific table into a DataFrame
    table_name = 'cleandata'
    df = pd.read_sql_table(table_name, con=conn_engine)

    X = df["message"]
    Y = df.iloc[:,4:]
    
    return X, Y, df

text_inputs, response_labels, df = load_data(table_name="cleandata", conn_engine=engine)
X, Y = text_inputs.values, response_labels.values

In [300]:
def display_dataset(X_train, y_train, X_test=None, y_test=None):
    """
    """
    print("unique Y values: ", np.unique(Y))
    print("training set, X: ", X_train.shape)
    if X_test is not None:
        print("test set, X: ",X_test.shape)
    print("training set, Y: ",y_train.shape)
    if y_test is not None:
        print("test set, Y: ",y_test.shape)
        
def data_type_check(X1, X2):
    """
    """
    # check data types of 
    print("X1 shape: ", X1.shape)
    print("X2 shape: ", X2.shape)
    print("X1 Type: ", type(X1))
    print("X2 Type: ",type(X2))          

In [301]:
# View the first few rows of the DataFrame
display(df.head(3))
display_dataset(X, Y)
classes = response_labels.columns
print(f"\nlabels: {list(classes)}")

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


unique Y values:  [0 1 2]
training set, X:  (26386,)
training set, Y:  (26386, 36)

labels: ['related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


### 2. Write a tokenization function to process your text data

In [302]:
# Download the stopwords and all nltk relevant packages.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /Users/emmanuele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [303]:
import re
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [304]:
def tokenize(text, stop_words=None):
    """
    Tokenize a text by normalizing, lemmatizing and removing stop words.
    
    Args:
        text (list): list of strings
        stop_words (set): a set of word strings for stop words.

    Returns:
        tokens(list): list of token strings.
    """
    # Import stopwords if not imported.
    if stop_words is None:
        stop_words = set(stopwords.words("english"))
    
    lemmatizer = WordNetLemmatizer()    
    
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    # Replace URLs with a placeholder and normalize case.
    normalized_text = re.sub(url_regex, ' ', text.lower())

    # Replace non-alphanumeric characters with spaces.
    normalized_text = re.sub(r'[^a-zA-Z0-9]', ' ', normalized_text)
    
    tokens = word_tokenize(normalized_text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return tokens

# Consider extending the tokenize function to be able to perform sentence tokenization.

  url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


In [305]:
#Testing Tokenize Function

text1 = "Barclaysjbki CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  https://www.google.com"
print(f'input text: "{text1}"\n')
print(f"text tokens: {tokenize(text1)} \n")
text2 = "The No. 8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (September 15)."
print(f'input text: "{text2}" \n')
print(f"text tokens: {tokenize(text2)} \n")
sentence_list = sent_tokenize(text2)
print(f"sentences: {sentence_list} \n")
print("testing sentence tokenization...")
for text in sentence_list:
    print(f'\ntext: "{text}"')
    print(f"\ntext tokens: {tokenize(text)}")

input text: "Barclaysjbki CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  https://www.google.com"

text tokens: ['barclaysjbki', 'ceo', 'stress', 'importance', 'regulatory', 'cultural', 'reform', 'financial', 'service', 'brussels', 'conference'] 

input text: "The No. 8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (September 15)." 

text tokens: ['8', 'northeast', 'gale', 'storm', 'signal', 'issued', '5', '55pm', 'yesterday', 'september', '14', 'replaced', 'southeast', 'gale', 'storm', 'signal', '12', '35am', 'today', 'september', '15'] 

sentences: ['The No.', '8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (September 15).'] 

testing sentence tokenization...

text: "The No."

text tokens: []

text: "8 Northeast Gale or storm s

In [306]:
text = 'What can I do?'
tokens = tokenize(text)
print(tokens)
for token in word_tokenize(text.lower()):
    print(WordNetLemmatizer().lemmatize(token))
    print(f'{token}, {token in set(stopwords.words("english"))}')

[]
what
what, True
can
can, True
i
i, True
do
do, True
?
?, False


### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [307]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

#help(sklearn.multioutput)

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=41)

In [129]:
display_dataset(X_train=X_train, y_train=y_train, X_test = X_test, y_test=y_test)
print("\n")
print(data_type_check(X_train, y_train))

unique Y values:  [0 1 2]
training set, X:  (21108,)
test set, X:  (5278,)
training set, Y:  (21108, 36)
test set, Y:  (5278, 36)


X1 shape:  (21108,)
X2 shape:  (21108, 36)
X1 Type:  <class 'numpy.ndarray'>
X2 Type:  <class 'numpy.ndarray'>
None


In [130]:
model = Pipeline([
    ("vectorize", CountVectorizer(tokenizer=tokenize)),
    ("tfidf", TfidfTransformer()),
    ("clf", RandomForestClassifier())
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [131]:
trained_model = model.fit(X_train, y_train)



### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [136]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#help(sklearn.metrics)

In [137]:
y_pred = trained_model.predict(X_test)

In [138]:
def accuracy(y_actual, y_pred):
    """_summary_

    Args:
        y_actual (_type_): 1D or ND arrary for labelled y/ output examples
        y_pred (_type_): 1D or ND arrary for predicted/inference output examples
    """
    accuracy = (y_pred == y_actual).mean()
    print("Accuracy: ", accuracy)
    return accuracy




def evaluate_multilabel_model(y_true, y_pred, class_names):
    """
    Comprehensive evaluation of a multi-label classification model.
    
    Args:
        y_true: Ground truth (correct) target values
        y_pred: Estimated targets as returned by classifier
        class_names: List of class names for each column
    """
    # Overall accuracy
    sample_accuracy = accuracy_score(y_true, y_pred)
    print(f"Overall Sample-wise Accuracy: {sample_accuracy:.3f}")
    
    # Per-class metrics
    results = []
    for i in range(y_true.shape[1]):
        precision = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
        recall = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
        f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)
        
        results.append({
            'Class': class_names[i],
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1
        })
        
        # Detailed classification report per class
        print(f"\nDetailed metrics for {class_names[i]}:")
        print(classification_report(y_true[:, i], y_pred[:, i], zero_division=0))
    
    # Create a DataFrame with all metrics
    metrics_df = pd.DataFrame(results)
    print("\nSummary of all metrics:")
    print(metrics_df.round(3))
    
    # Visualize metrics
    plt.figure(figsize=(10, 6))
    metrics_melted = pd.melt(metrics_df, id_vars=['Class'], 
                           value_vars=['Precision', 'Recall', 'F1-Score'])
    sns.barplot(x='Class', y='value', hue='variable', data=metrics_melted)
    plt.xticks(rotation=45)
    plt.title('Model Performance Metrics by Class')
    plt.tight_layout()
    plt.show()
    
    return metrics_df

# Example usage:
def evaluate_model(model, X_test, y_test, class_names):
    """
    Evaluate the model and print all relevant metrics
    """
    # Get predictions
    y_pred = model.predict(X_test)
    
    # Get best parameters if using GridSearchCV
    if hasattr(model, 'best_params_'):
        print("Best parameters found:")
        print(model.best_params_)
        print("\nBest cross-validation score:", model.best_score_)
    
    # Evaluate the model
    metrics_df = evaluate_multilabel_model(y_test, y_pred, class_names)
    
    return y_pred, metrics_df

Accuracy:  0.9450181045008631
0.9450181045008631


In [337]:
evaluate_model(new_model, X_test, y_test, classes)

print(accuracy(y_test, y_pred))

# Now you can generate the classification report
for col_index in range(0,y_test.shape[1]):
    report = classification_report(y_test[:,col_index], y_pred[:, col_index], zero_division=0)
    print(classes[col_index])
    print(report)
    


NameError: name 'evaluate_model' is not defined

### 6. Improve your model
Use grid search to find better parameters. 

In [140]:
# help(sklearn)
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion,Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [315]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        """_summary_
        """
        self.run_count = 1
        print("Start Verb Extractor running...")

    def starting_verb(self, text):
        try:
            
            # print("\n\nText:", text)
            sentence_list = nltk.sent_tokenize(text)
            for sentence in sentence_list:
                # Tokenize the sentence
                text_tokens = tokenize(sentence)
                
                if text_tokens:
                    # Get the POS (parts of speech) of the words in the text.
                    first_word, first_tag = nltk.pos_tag(text_tokens)[0]
                    
                    # Check if the first word is a Verb or 'RT' (retweet). 
                    if first_tag in ['VB', 'VBP', 'UH'] or first_word == 'RT':
                        # print("\nVerb, Tag: ",first_tag, ", First Word: ",first_word)
                        return 1 
                    else:
                        # print(f"\nNon-verb, Tag: {first_tag}, First Word: ,{first_word}")
                        return 0
                else:
                    # print(f'Empty Text Tokens, {text_tokens} in "{sentence}"')
                    pass
            
            # If no sentences were found in the entire text.
            self.run_count += 1
            print(f"Empty Text ({self.run_count}): ", sentence_list)
            return 0
        
        except IndexError as e:
            print(f"IndexError: {e}")
            print(f"Text causing issue: {text}")
            return 0
    
        except Exception as e:
            print(f"Unexpected error: {e}")
            print(f"Text causing issue: {text}")
            return 0
    

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        df = pd.DataFrame(X_tagged).values # converts to a 2D numpy array.
        # df = X_tagged.values # removing this because the hstack fails.
        
        # Log information about the transformation
        print("\n\nFeature Extraction and Text Transformation Complete:")
        print("Extracted/New feature shape:", df.shape)
        print("Input feature shape: ", X.shape)
        
        return df

In [334]:
def create_model_pipeline():
    """
    Constructs an ML pipeline using FeatureUnion to add a new binary feature.
    The feature checks if the first word in each text is a verb (1 if a verb, 0 otherwise),
    and combines this feature with the text data processed through a TF-IDF transformer pipeline.
    The combined feature matrix is used to train the model to enhance its performance.

    Returns:
        model (Pipeline): A scikit-learn pipeline object for training and evaluation, 
        which includes feature extraction, transformation, and a classifier.
    """
    
    text_pipeline = Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer())
    ])
    
    features = FeatureUnion([
        ("text_pipeline",text_pipeline),
        ("starting_verb", StartingVerbExtractor())
    ])
    
    model = Pipeline([
        ("features", features),
        ("clf", MultiOutputClassifier(RandomForestClassifier()))
])
    
    # specify parameters for grid search
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'clf__estimator__n_estimators': [10, 25, 50],
        'clf__estimator__min_samples_split': [2, 3, 4]
    }
    
    # create grid search object
    cv = GridSearchCV(estimator=model, param_grid=parameters, verbose=2, cv=3, error_score='raise')
    
    return cv


In [335]:
display_dataset(X_train=X_train, y_train=y_train, X_test = X_test, y_test=y_test)
print("\n")
data_type_check(X_train, y_train)

unique Y values:  [0 1 2]
training set, X:  (21108,)
test set, X:  (5278,)
training set, Y:  (21108, 36)
test set, Y:  (5278, 36)


X1 shape:  (21108,)
X2 shape:  (21108, 36)
X1 Type:  <class 'numpy.ndarray'>
X2 Type:  <class 'numpy.ndarray'>


In [336]:
new_model = create_model_pipeline()
new_model.fit(X_train, y_train)

# print("Best parameters found: ", new_model.best_params_)
# print(new_model)


Start Verb Extractor running...
Start Verb Extractor running...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  21.2s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  20.6s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  21.2s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  32.3s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  33.2s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  31.9s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  37.5s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  37.9s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  38.6s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.2min
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.1min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.1min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time= 1.1min
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time= 1.1min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time= 1.1min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 2.1min
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 2.2min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 2.2min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  20.4s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  21.3s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  19.4s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  29.9s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  30.2s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  31.4s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  37.1s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  40.8s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  34.4s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time=  58.0s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.0min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time=  57.8s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time= 1.0min
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time=  59.8s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time= 1.0min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.9min
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.9min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=3, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.9min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  19.7s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  18.9s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 1); total time=  19.7s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  28.9s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  30.3s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=10, features__text_pipeline__vect__ngram_range=(1, 2); total time=  29.4s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  33.7s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  35.6s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 1); total time=  38.6s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time=  55.6s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time=  54.5s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=25, features__text_pipeline__vect__ngram_range=(1, 2); total time=  52.8s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time=  56.7s
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time=  56.5s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 1); total time=  56.9s
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.9min
Start Verb Extractor running...




Empty Text (2):  ['          .']
Empty Text (3):  ['//// // @:@']
Empty Text (4):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (5):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (6):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 2.0min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (14072, 1)
Input feature shape:  (14072,)
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (7036, 1)
Input feature shape:  (7036,)
[CV] END clf__estimator__min_samples_split=4, clf__estimator__n_estimators=50, features__text_pipeline__vect__ngram_range=(1, 2); total time= 1.7min
Start Verb Extractor running...




Empty Text (2):  ['(http://www.guardian.co.uk/global-development/2013/jan/16/somali-ngos-mogadishu-street-children)']
Empty Text (3):  []
Empty Text (4):  ['          .']
Empty Text (5):  ['//// // @:@']
Empty Text (6):  ['http://wap.sina.comhttp://wap.sina.com']


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (21108, 1)
Input feature shape:  (21108,)


In [338]:
from pprint import pprint
pprint(new_model.get_params().keys())

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__features', 'estimator__clf', 'estimator__features__n_jobs', 'estimator__features__transformer_list', 'estimator__features__transformer_weights', 'estimator__features__verbose', 'estimator__features__verbose_feature_names_out', 'estimator__features__text_pipeline', 'estimator__features__starting_verb', 'estimator__features__text_pipeline__memory', 'estimator__features__text_pipeline__steps', 'estimator__features__text_pipeline__verbose', 'estimator__features__text_pipeline__vect', 'estimator__features__text_pipeline__tfidf', 'estimator__features__text_pipeline__vect__analyzer', 'estimator__features__text_pipeline__vect__binary', 'estimator__features__text_pipeline__vect__decode_error', 'estimator__features__text_pipeline__vect__dtype', 'estimator__features__text_pipeline__vect__encoding', 'estimator__features__text_pipeline__vect__input', 'estimator__features__text_pipeline__vect__

In [340]:
if isinstance(new_model, Pipeline):
    print("Best parameters found: ", new_model.best_params_ if hasattr(new_model, 'best_params_') else "No best parameters found.")
    print(new_model)
    y_pred = new_model.predict(X_test)
    print("y_pred:", y_pred)

if isinstance(new_model, GridSearchCV):
    print('Checking outputs of Transformers')
    for name, transformer in new_model.best_estimator_.steps:
        output = transformer.fit(X_train, y_train)
        print(f"{name} output shape:", output.shape)

Checking outputs of Transformers


AttributeError: 'FeatureUnion' object has no attribute 'shape'

In [341]:
from sklearn.model_selection import GridSearchCV

def inspect_pipeline_steps(model, X_train):
    """
    Safely inspect each step of a fitted pipeline with proper error handling
    
    Args:
        model: Fitted GridSearchCV object
        X_train: Training data to inspect transformations
    """
    if not isinstance(model, GridSearchCV):
        print("Warning: Model is not a GridSearchCV instance")
        return
    
    print("Best Parameters:", model.best_params_)
    print("\nInspecting Pipeline Steps:")
    
    # Access the fitted pipeline
    pipeline = model.best_estimator_
    current_data = X_train
    
    # Iterate through named steps
    for name, transformer in pipeline.named_steps.items():
        print(f"\nStep: {name}")
        print(f"Input shape: {current_data.shape}")
        
        try:
            # Check if step has transform method
            if hasattr(transformer, 'transform'):
                current_data = transformer.transform(current_data)
                print(f"Output shape: {current_data.shape}")
                
                # Print feature names if available
                if hasattr(transformer, 'get_feature_names_out'):
                    features = transformer.get_feature_names_out()
                    print(f"First few features: {features[:5]}")
                    
            else:
                print(f"Note: {name} doesn't have transform method (might be final estimator)")
                
            # Print additional info for specific transformer types
            if hasattr(transformer, 'n_features_in_'):
                print(f"Number of input features: {transformer.n_features_in_}")
                
        except Exception as e:
            print(f"Error processing {name}: {str(e)}")
            continue
    
    return current_data


In [348]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_multilabel_model(y_true, y_pred, class_names):
    """
    Comprehensive evaluation of a multi-label classification model.
    
    Args:
        y_true: Ground truth (correct) target values
        y_pred: Estimated targets as returned by classifier
        class_names: List of class names for each column
    """
    # Overall accuracy
    # sample_accuracy = accuracy_score(y_true, y_pred)
    # print(f"Overall Sample-wise Accuracy: {sample_accuracy:.3f}")
    
    # Per-class metrics
    results = []
    for i in range(y_true.shape[1]):
        precision = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
        recall = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
        f1 = f1_score(y_true[:, i], y_pred[:, i], average='macro', zero_division=0)
        
        results.append({
            'Class': class_names[i],
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1
        })
        
        # Detailed classification report per class
        print(f"\nDetailed metrics for {class_names[i]}:")
        print(classification_report(y_true[:, i], y_pred[:, i], zero_division=0))
    
    # Create a DataFrame with all metrics
    metrics_df = pd.DataFrame(results)
    print("\nSummary of all metrics:")
    print(metrics_df.round(3))
    
    # Visualize metrics
    plt.figure(figsize=(10, 6))
    metrics_melted = pd.melt(metrics_df, id_vars=['Class'], 
                           value_vars=['Precision', 'Recall', 'F1-Score'])
    sns.barplot(x='Class', y='value', hue='variable', data=metrics_melted)
    plt.xticks(rotation=45)
    plt.title('Model Performance Metrics by Class')
    plt.tight_layout()
    plt.show()
    
    return metrics_df

# Example usage:
def evaluate_model(model, X_test, y_test, class_names):
    """
    Evaluate the model and print all relevant metrics
    """
    # Get predictions
    y_pred = model.predict(X_test)
    
    # Get best parameters if using GridSearchCV
    if hasattr(model, 'best_params_'):
        print("Best parameters found:")
        print(model.best_params_)
        print("\nBest cross-validation score:", model.best_score_)
    
    # Evaluate the model
    metrics_df = evaluate_multilabel_model(y_test, y_pred, class_names)
    
    return y_pred, metrics_df

In [349]:
evaluate_model(new_model, X_test, y_test, classes)

Empty Text (13):  []


Feature Extraction and Text Transformation Complete:
Extracted/New feature shape: (5278, 1)
Input feature shape:  (5278,)
Best parameters found:
{'clf__estimator__min_samples_split': 2, 'clf__estimator__n_estimators': 50, 'features__text_pipeline__vect__ngram_range': (1, 2)}

Best cross-validation score: 0.26658139094182304


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [342]:
for col_index in range(0, y_test.shape[1]):
    print(classes[col_index])
    report = classification_report(y_test[:,col_index], y_pred[:,col_index], zero_division=1)
    print(report)


print(accuracy(y_test, y_pred))    


def model_metrics():
    """
    Custom function to evaluate performance of ml models
    Accuracy, Recall & Precision.
    
    Args
    Returns:
        None, 
    """
    accuracy = accuracy(y_actual, y_pred)
    if labels:
        confusion_matrix = confusion_matrix(y_actual, y_pred, labels=labels)
    else:
        confusion_matrix = confusion_matrix(y_actual, y_pred)
    print(confusion_matrix)
    return accuracy, confusion_matrix


related
              precision    recall  f1-score   support

           0       0.71      0.45      0.55      1183
           1       0.85      0.95      0.90      4054
           2       0.46      0.39      0.42        41

    accuracy                           0.83      5278
   macro avg       0.67      0.59      0.62      5278
weighted avg       0.82      0.83      0.82      5278

request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      4361
           1       0.82      0.50      0.62       917

    accuracy                           0.89      5278
   macro avg       0.86      0.74      0.78      5278
weighted avg       0.89      0.89      0.88      5278

offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5258
           1       1.00      0.00      0.00        20

    accuracy                           1.00      5278
   macro avg       1.00      0.50      0.50      527

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [291]:
def model_pipeline():
    """
    Constructs an ML pipeline using FeatureUnion to add a new binary feature.
    The feature checks if the first word in each text is a verb (1 if a verb, 0 otherwise),
    and combines this feature with the text data processed through a TF-IDF transformer pipeline.
    The combined feature matrix is used to train the model to enhance its performance.

    Returns:
        model (Pipeline): A scikit-learn pipeline object for training and evaluation, 
        which includes feature extraction, transformation, and a classifier.
    """
    
    text_pipeline = Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer())
    ])
    
    features = FeatureUnion([
        ("text_pipeline",text_pipeline),
        ("starting_verb", StartingVerbExtractor())
    ])
    
    model = Pipeline([
        ("features", features),
        ("clf", RandomBoostedTrees())
])
    
    # specify parameters for grid search
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'clf__n_estimators': [1, 25, 125],
        'clf__max_depth': [None, 25, 125]
    } 
    
    # create grid search object
    # cv = GridSearchCV(estimator=pipeline, param_grid=parameters, verbose=2, cv=3, error_score='raise' )
    
    return model

In [None]:
# Experimental Parameters


# parameters = {
#         'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
#         'clf__n_estimators': [50, 100, 200],
#         'clf__min_samples_split': [2, 3, 4]
#     }

### 9. Export your model as a pickle file

In [344]:
import pickle 

# Step 2: Export the model using pickle
with open('train_classifier.pkl', 'wb') as model_file:
    pickle.dump(new_model, model_file)

print("Model exported successfully!")

Model exported successfully!


### 10. Use this notebook to complete `train_classifier.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [67]:
# def build_model():
#     """
#     Model.
    
#     Args
    
#     """
#     pipeline = Pipeline([
#     ('features', FeatureUnion([
#         ('text_pipeline', Pipeline([
#             ('vect', CountVectorizer(tokenizer=tokenize)),
#             ('tfidf', TfidfTransformer())
#         ])),
#         ('starting_verb', StartingVerbExtractor())
#     ])),
#     ('clf', MultiOutputClassifier(RandomForestClassifier()))
#     ])
            
#     # specify parameters for grid search
#     parameters = {
#         'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
#         'clf__estimator__n_estimators': [10, 25, 50],
#         'clf__estimator__min_samples_split': [2, 3, 4]
#     }
    
#     # create grid search object
#     # cv = GridSearchCV(estimator=pipeline, param_grid=parameters, verbose=2, cv=3, error_score='raise' )
    
#     return pipeline



# def build_model():
#     """
#     Model.
    
#     Args
    
#     """
        
#     pipeline = Pipeline([
#     ('features', FeatureUnion([
#         ('text_pipeline', Pipeline([
#             ('vect', CountVectorizer(tokenizer=tokenize)),
#             ('tfidf', TfidfTransformer())
#         ])),
#         ('starting_verb', StartingVerbExtractor())
#     ])),
#     ('clf', MultiOutputClassifier(RandomForestClassifier()))
#     ])
            
#     # specify parameters for grid search
#     parameters = {
#         'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
#         'clf__estimator__n_estimators': [10, 25, 50],
#         'clf__estimator__min_samples_split': [2, 3, 4]
#     }
    
#     # create grid search object
#     # cv = GridSearchCV(estimator=pipeline, param_grid=parameters, verbose=2, cv=3, error_score='raise' )
    
#     return pipeline
