# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [111]:
# import libraries
import numpy as np
import pandas as pd
import sklearn
import nltk
from sqlalchemy import create_engine

In [112]:
from sqlalchemy import create_engine

# load data from database
engine =  create_engine('sqlite:///DisasterTweets.db')

In [113]:
def load_data(table_name, conn_engine):
    """
    Args:
       table_name("String"):
       conn_engine"
    """
    # Load data from a specific table into a DataFrame
    table_name = 'cleandata'
    df = pd.read_sql_table(table_name, con=conn_engine)

    X = df["message"]
    Y = df.iloc[:,4:]
    
    return X, Y, df

text_inputs, response_labels, df = load_data(table_name="cleandata", conn_engine=engine)
X, Y = text_inputs.values, response_labels.values

In [117]:
def display_dataset(X_train, y_train, X_test=None, y_test=None):
    """
    """
    print("unique Y values: ", np.unique(Y))
    print("training set, X: ", X_train.shape)
    if X_test is not None:
        print("test set, X: ",X_test.shape)
    print("training set, Y: ",y_train.shape)
    if y_test is not None:
        print("test set, Y: ",y_test.shape)
        
def data_type_check(X1, X2):
    """
    """
    # check data types of 
    print("X1 shape: ", X1.shape)
    print("X2 shape: ", X2.shape)
    print("X1 Type: ", type(X1))
    print("X2 Type: ",type(X2))          

In [120]:
# View the first few rows of the DataFrame
display(df.head(3))
display_dataset(X, Y)
classes = response_labels.columns
print(f"\nlabels: {list(classes)}")

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


unique Y values:  [0 1 2]
training set, X:  (26386,)
training set, Y:  (26386, 36)

labels: ['related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


### 2. Write a tokenization function to process your text data

In [116]:
# Download the stopwords and all nltk relevant packages.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /Users/emmanuele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/emmanuele/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [92]:
import re
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [93]:
def tokenize(text, stop_words=None):
    """
    Tokenize a text by normalizing, lemmatizing and removing stop words.
    
    Args:
        text (list): list of strings
        stop_words (set): a set of word strings for stop words.

    Returns:
        tokens(list): list of token strings.
    """
    # Import stopwords if not imported.
    if stop_words is None:
        stop_words = set(stopwords.words("english"))
    
    lemmatizer = WordNetLemmatizer()    
    
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    # Replace URLs with a placeholder and normalize case.
    normalized_text = re.sub(url_regex, ' ', text.lower())

    # Replace non-alphanumeric characters with spaces.
    normalized_text = re.sub(r'[^a-zA-Z0-9]', ' ', normalized_text)
    
    tokens = word_tokenize(normalized_text)
    
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    return tokens

# Consider extending the tokenize function to be able to perform sentence tokenization.

  url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


In [94]:
#Testing Tokenize Function

text1 = "Barclaysjbki CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  https://www.google.com"
print(f"input text: {text1}\n")
print(f"text tokens: {tokenize(text1)} \n")
text2 = "The No. 8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (September 15)."
print(f"input text: {text2} \n")
print(f"text tokens: {tokenize(text2)} \n")
sentence_list = sent_tokenize(text2)
print(sentence_list)
for text in sentence_list:
    print(text,"\n")
    print(tokenize(text),"\n")

Barclaysjbki CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference  https://www.google.com 

['barclaysjbki', 'ceo', 'stress', 'importance', 'regulatory', 'cultural', 'reform', 'financial', 'service', 'brussels', 'conference'] 

The No. 8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (September 15). 

['8', 'northeast', 'gale', 'storm', 'signal', 'issued', '5', '55pm', 'yesterday', 'september', '14', 'replaced', 'southeast', 'gale', 'storm', 'signal', '12', '35am', 'today', 'september', '15'] 

['The No.', '8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (September 15).']
The No. 

[] 

8 Northeast Gale or storm signal was issued at 5.55pm yesterday (September 14) and was replaced by Southeast gale and storm signal at 12.35am today (Sept

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [95]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

#help(sklearn.multioutput)

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=41)

In [98]:
display_dataset(X_train=X_train, y_train=y_train, X_test = X_test, y_test=y_test)
print("\n")
print(data_type_check(X_train, y_train))

Unique Y values:  [0 1 2]
training set, X:  (21108,)
test set, X:  (5278,)
training set, Y:  (21108, 36)
test set, Y:  (5278, 36)


X1 shape:  (21108,)
X2 shape:  (21108, 36)
X1 Type:  <class 'numpy.ndarray'>
X2 Type:  <class 'numpy.ndarray'>
None


In [99]:
model = Pipeline([
    ("vectorize", CountVectorizer(tokenizer=tokenize)),
    ("tfidf", TfidfTransformer()),
    ("clf", RandomForestClassifier())
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [100]:
trained_model = model.fit(X_train, y_train)



### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [101]:
from sklearn.metrics import classification_report, accuracy_score
#help(sklearn.metrics)

In [102]:
y_pred = trained_model.predict(X_test)

In [103]:
def accuracy(y_actual, y_pred):
    """_summary_

    Args:
        y_actual (_type_): 1D or ND arrary for labelled y/ output examples
        y_pred (_type_): 1D or ND arrary for predicted/inference output examples
    """
    accuracy = (y_pred == y_actual).mean()
    print("Accuracy: ", accuracy)
    return accuracy

print(accuracy(y_test, y_pred))

Accuracy:  0.9451075744179193
0.9451075744179193


In [104]:
# Now you can generate the classification report
for col_index in range(0,y_test.shape[1]):
    report = classification_report(y_test[:,col_index], y_pred[:, col_index], zero_division=0)
    print(classes[col_index])
    print(report)

related
              precision    recall  f1-score   support

           0       0.69      0.50      0.58      1183
           1       0.86      0.93      0.90      4054
           2       0.65      0.41      0.51        41

    accuracy                           0.83      5278
   macro avg       0.73      0.62      0.66      5278
weighted avg       0.82      0.83      0.82      5278

request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      4361
           1       0.82      0.50      0.62       917

    accuracy                           0.89      5278
   macro avg       0.86      0.74      0.78      5278
weighted avg       0.89      0.89      0.88      5278

offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5258
           1       0.00      0.00      0.00        20

    accuracy                           1.00      5278
   macro avg       0.50      0.50      0.50      527

### 6. Improve your model
Use grid search to find better parameters. 

In [105]:
# help(sklearn)
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion,Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [106]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        try:
            print("Start Verb Extractor running...")
            # print("\n\nText:", text)
            sentence_list = nltk.sent_tokenize(text)
            for sentence in sentence_list:
                # Tokenize the sentence
                text_tokens = tokenize(sentence)
                
                if text_tokens:
                    # Get the POS (parts of speech) of the words in the text.
                    first_word, first_tag = nltk.pos_tag(text_tokens)[0]
                    
                    # Check if the first word is a Verb or 'RT' (retweet). 
                    if first_tag in ['VB', 'VBP', 'UH'] or first_word == 'RT':
                        # print("\nVerb, Tag: ",first_tag, ", First Word: ",first_word)
                        return 1 
                    else:
                        # print(f"\nNon-verb, Tag: {first_tag}, First Word: ,{first_word}")
                        return 0
                else:
                    print("Empty Text Tokens")
            
            # If no sentences were found in the entire text.
            print("Empty Text", sentence_list)
            return 0
        
        except IndexError as e:
            print(f"IndexError: {e}")
            print(f"Text causing issue: {text}")
            return 0
    
        except Exception as e:
            print(f"Unexpected error: {e}")
            print(f"Text causing issue: {text}")
            return 0
    

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        df = pd.DataFrame(X_tagged) #.values
        
        # Log information about the transformation
        print("\n\nFeature Extraction and Text Transformation Complete:")
        print("Extracted/New feature shape:", df.shape)
        print("Input feature shape: ", X.shape)
        
        # Perform data type check
        data_type_check(X, df)

        # # Check for shape mismatch and raise an error if neccessary
        if df.shape != X.shape:
            raise ValueError(f"Feature Shape MisMatch Errror: New Feature: {df.shape}, Previous Feature: {X.shape}")
        return df

In [107]:
def build_model():
    """
    Model.
    
    Args
    
    """
    pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),
        ('starting_verb', StartingVerbExtractor())
    ])),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])
            
    # specify parameters for grid search
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'clf__estimator__n_estimators': [10, 25, 50],
        'clf__estimator__min_samples_split': [2, 3, 4]
    }
    
    # create grid search object
    #cv = GridSearchCV(estimator=pipeline, param_grid=parameters, verbose=2, cv=3, error_score='raise' )
    
    return pipeline



# def build_model():
#     pipeline = Pipeline([
#         ('features', FeatureUnion([
            
#             ('text_pipeline', Pipeline([
#                 ('vect', CountVectorizer(tokenizer=tokenize)),
#                 ('tfidf', TfidfTransformer())
#             ])),

#             ('starting_verb', StartingVerbExtractor())
#         ])),
    
#         ('clf', RandomForestClassifier())
#     ])

#     # specify parameters for grid search
#     parameters = {
#         'clf__n_estimators': [1, 25, 30],
#         'clf__max_depth': [None, 25, 30]
#     }

#     # create grid search object
#     cv = GridSearchCV(pipeline, param_grid=parameters)
    
#     return cv


In [108]:
display_dataset(X_train=X_train, y_train=y_train, X_test = X_test, y_test=y_test)
print("\n")
print(data_type_check(X_train, y_train))

Unique Y values:  [0 1 2]
training set, X:  (21108,)
test set, X:  (5278,)
training set, Y:  (21108, 36)
test set, Y:  (5278, 36)


X1 shape:  (21108,)
X2 shape:  (21108, 36)
X1 Type:  <class 'numpy.ndarray'>
X2 Type:  <class 'numpy.ndarray'>
None


In [109]:
new_model = build_model()
new_model.fit(X_train, y_train)

# print("Best parameters found: ", new_model.best_params_)
# print(new_model)





Non-verb, Tag: NN, First Word: ,relief

Non-verb, Tag: NNS, First Word: ,trinamariexox

Non-verb, Tag: VBN, First Word: ,sanctioned

Non-verb, Tag: VBD, First Word: ,decided

Non-verb, Tag: NN, First Word: ,shelter

Non-verb, Tag: NN, First Word: ,movement

Non-verb, Tag: NN, First Word: ,jfk

Non-verb, Tag: NN, First Word: ,foodtruck

Non-verb, Tag: RB, First Word: ,early

Non-verb, Tag: JJ, First Word: ,whould

Non-verb, Tag: NNS, First Word: ,people

Non-verb, Tag: RB, First Word: ,normally

Non-verb, Tag: JJ, First Word: ,second

Non-verb, Tag: NNS, First Word: ,c

Non-verb, Tag: NN, First Word: ,ceci

Non-verb, Tag: JJ, First Word: ,un

Non-verb, Tag: NN, First Word: ,tent

Non-verb, Tag: NN, First Word: ,forecast

Non-verb, Tag: MD, First Word: ,must

Non-verb, Tag: JJ, First Word: ,international

Non-verb, Tag: NN, First Word: ,el

Non-verb, Tag: NN, First Word: ,bamako

Non-verb, Tag: JJ, First Word: ,live

Non-verb, Tag: VBN, First Word: ,given

Non-verb, Tag: NNS, First Word

In [110]:
new_model = build_model()
for name, transformer in new_model.steps:
    output = transformer.fit_transform(X_train, y_train)
    print(f"{name} output shape:", output.shape)




Non-verb, Tag: NN, First Word: ,relief

Non-verb, Tag: NNS, First Word: ,trinamariexox

Non-verb, Tag: VBN, First Word: ,sanctioned

Non-verb, Tag: VBD, First Word: ,decided

Non-verb, Tag: NN, First Word: ,shelter

Non-verb, Tag: NN, First Word: ,movement

Non-verb, Tag: NN, First Word: ,jfk

Non-verb, Tag: NN, First Word: ,foodtruck

Non-verb, Tag: RB, First Word: ,early

Non-verb, Tag: JJ, First Word: ,whould

Non-verb, Tag: NNS, First Word: ,people

Non-verb, Tag: RB, First Word: ,normally

Non-verb, Tag: JJ, First Word: ,second

Non-verb, Tag: NNS, First Word: ,c

Non-verb, Tag: NN, First Word: ,ceci

Non-verb, Tag: JJ, First Word: ,un

Non-verb, Tag: NN, First Word: ,tent

Non-verb, Tag: NN, First Word: ,forecast

Non-verb, Tag: MD, First Word: ,must

Non-verb, Tag: JJ, First Word: ,international

Non-verb, Tag: NN, First Word: ,el

Non-verb, Tag: NN, First Word: ,bamako

Non-verb, Tag: JJ, First Word: ,live

Non-verb, Tag: VBN, First Word: ,given

Non-verb, Tag: NNS, First Word

AttributeError: 'MultiOutputClassifier' object has no attribute 'fit_transform'

In [55]:
print("Best parameters found: ", new_model.best_params_)
print(new_model)
y_pred = new_model.predict(X_test)

AttributeError: 'Pipeline' object has no attribute 'best_params_'

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [86]:
for col_index in range(0, y_test.shape[1]):
    print(classes[col_index])
    report = classification_report(y_test[:,col_index], y_pred[:,col_index])
    print(report)


print(accuracy(y_test, y_pred))    

# classes or np.unique(y_test) confirm **
print(accuracy(y_test, y_pred))


def model_metrics():
    """
    Custom function to evaluate performance of ml models
    Accuracy, Recall & Precision.
    
    Args
    Returns:
        None, 
    """
    accuracy = accuracy(y_actual, y_pred)
    if labels:
        confusion_matrix = confusion_matrix(y_actual, y_pred, labels=labels)
    else:
        confusion_matrix = confusion_matrix(y_actual, y_pred)
    print(confusion_matrix)
    return accuracy, confusion_matrix


related
              precision    recall  f1-score   support

           0       0.70      0.50      0.58      1183
           1       0.86      0.94      0.90      4054
           2       0.67      0.34      0.45        41

    accuracy                           0.84      5278
   macro avg       0.74      0.59      0.64      5278
weighted avg       0.82      0.84      0.82      5278

request
              precision    recall  f1-score   support

           0       0.90      0.97      0.94      4361
           1       0.81      0.50      0.62       917

    accuracy                           0.89      5278
   macro avg       0.86      0.74      0.78      5278
weighted avg       0.89      0.89      0.88      5278

offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5258
           1       0.00      0.00      0.00        20

    accuracy                           1.00      5278
   macro avg       0.50      0.50      0.50      527

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [None]:
def model_pipeline():
    """
    ML pipeline model with Feature Union for adding a 
    noun verb Tag to the message
    
    using Feature Union to improve performance
    return:
        model(GridSearchCV object): 
    
    """
    text_processing_pipeline = Pipeline([
        ("Vect", CountVectorizer(tokenizer=tokenize)),
        ("Tfidf", TfidfTransformer())
    ])
    
    text_feature_union = FeatureUnion([
        ("FeatureExtractor",text_processing_pipeline),
        ("StartVerb", StartingVerbEstimator())
    ])
    
    
    model = Pipeline([
        ("Text_feature_union",text_feature_union),
        ("clf", RandomBoostedTrees())
])
    
    # specify parameters for grid search
    parameters = {
        'clf__n_estimators': [1, 25, 125],
        'clf__max_depth': [None, 25, 125]
    } 
    
    
    
    return GridSearchCV(model, parameters)

### 9. Export your model as a pickle file

In [None]:
# Step 2: Export the model using pickle
with open('train_classifier.pkl', 'wb') as model_file:
    pickle.dump(new_model, model_file)

print("Model exported successfully!")

### 10. Use this notebook to complete `train_classifier.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [None]:
# params = {
#     'Clf__n_estimators':[1,30],
#     'Clf__max_features':[1,20],
#     'Clf__max_depth':[2,10]
# }

#params = {
#         'features__vect__ngram_range': ((1, 1), (1, 2)),
#         'clf__n_estimators': [50, 100, 200],
#         'clf__min_samples_split': [2, 3, 4]
#     }
#
# specify parameters for grid search

# * ------------------------------------------------------- *

# params = {
#     'clf__n_estimators': [1, 25, 125],
#     'clf__max_depth': [None, 25, 125]
# }


# pipeline = Pipeline([
#         ('features', FeatureUnion([
            
#             ('text_pipeline', Pipeline([
#                 ('vect', CountVectorizer(tokenizer=tokenize)),
#                 ('tfidf', TfidfTransformer())
#             ])),

#             ('starting_verb', StartingVerbEstimator())
#         ])),
    
#         ('clf', RandomForestClassifier())
#     ])


# * ------------------------------------------------------- *

        

In [67]:
def model_pipeline():
    """
    ML pipeline model with Grid Search CV for parameters tuning
    
    return:
    
    """
    text_processing_pipeline = Pipeline([
        ("vect", CountVectorizer(tokenizer=tokenize)),
        ("tfidf", TfidfTransformer())
    ])
    
    pipeline = Pipeline([
        ("features",text_processing_pipeline),
        ("clf", RandomForestClassifier())
])
    
     # specify parameters for grid search
    parameters = {
        'clf__n_estimators': [1, 25, 30],
        'clf__max_depth': [None, 25, 30]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, parameters)
    return cv