In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import pickle
import time

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

In [None]:
# Names of the columns of `df` that are used as prediction targets
# (the label matrix is built below as `y = df[tags]`).
tags = [
    'javascript', 'python', 'java', 'c_sharp', 'php',
    'android', 'html', 'jquery', 'c_plus_plus', 'css',
    'ios', 'sql', 'mysql', 'r', 'reactjs',
    'node_js', 'arrays', 'c', 'asp_net', 'json',
]

# Pre-computed feature matrices, one CSV per embedding technique.
# NOTE(review): presumably each CSV is row-aligned with `df` below — confirm,
# since all of them are later split against the same `y`.
X_tf_idf = pd.read_csv("X_tf_idf.csv")
X_word2vec = pd.read_csv("X_word2vec.csv")
X_bert = pd.read_csv("X_bert.csv")
X_use = pd.read_csv("X_use.csv")

# Source questions; the tag columns listed in `tags` are the targets.
df = pd.read_csv("Question Extraction V05 year.csv")

y = df[tags]

# Fitted vectorizers, needed to rebuild tf-idf features from raw text.
# Context managers make sure the file handles are closed after loading.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open("title_vectorizer.sav", "rb") as title_vectorizer_file:
    title_vectorizer = pickle.load(title_vectorizer_file)
with open("body_vectorizer.sav", "rb") as body_vectorizer_file:
    body_vectorizer = pickle.load(body_vectorizer_file)


# Rebuilds tf-idf features from raw text; used when evaluating data drift.
def get_X_tf_idf(df, title_vectorizer, body_vectorizer):
    """Build the tf-idf feature frame for the questions in ``df``.

    The 'Title' and 'Body' columns are each cleaned with ``clean_for_tf_idf``
    and transformed by their own vectorizer. ``clean_for_tf_idf`` is not
    defined in the visible part of this notebook and must be in scope when
    this function is called.

    Parameters
    ----------
    df : DataFrame with 'Title' and 'Body' columns.
    title_vectorizer, body_vectorizer : objects exposing ``transform`` and
        ``get_feature_names_out``; ``transform`` must return something with
        a ``toarray`` method.

    Returns
    -------
    DataFrame with the title features (columns prefixed ``title_``) followed
    by the body features (columns prefixed ``body_``). Rows follow the order
    of ``df`` but carry a fresh positional index.
    """
    feature_frames = []
    for text_column, prefix, vectorizer in (
        ('Title', 'title_', title_vectorizer),
        ('Body', 'body_', body_vectorizer),
    ):
        cleaned_text = df[text_column].apply(clean_for_tf_idf)
        vectorized = vectorizer.transform(cleaned_text)
        dense_values = vectorized.toarray()
        prefixed_names = [prefix + name for name in vectorizer.get_feature_names_out()]
        feature_frames.append(pd.DataFrame(data=dense_values, columns=prefixed_names))

    # Title block first, then body block, side by side.
    return pd.concat(feature_frames, axis=1)

In [None]:
test_size = .2

# Split every feature matrix and the labels in ONE call: the same shuffle is
# applied to all arrays, so the splits stay row-aligned across embeddings.
# This also avoids assigning throwaway results to `_` / `__`, which IPython
# uses for its output history.
(
    X_tf_idf_train, X_tf_idf_test,
    X_word2vec_train, X_word2vec_test,
    X_bert_train, X_bert_test,
    X_use_train, X_use_test,
    y_train, y_test,
) = train_test_split(
    X_tf_idf, X_word2vec, X_bert, X_use, y,
    test_size=test_size, random_state=0
)

In [None]:
URI = "http://127.0.0.1:8080"

# Use the same tracking server for the fluent API and the explicit client.
mlflow.set_tracking_uri(URI)
client = mlflow.MlflowClient(tracking_uri=URI)

# Experiment description that will appear in the UI.
experiment_description = """Tag prediction V21. Comparing embeddings: tf-idf, Word2Vec, BERT, USE.
    Comparing models: Logistic Regression, Random Forest, XGBoost.
    """

# Searchable tags describing the Runs that will live in this Experiment.
experiment_tags = {
    "project_name": "stackoverflow_tagging",
    "mlflow.note.content": experiment_description
}

# Create the Experiment (with its tags) only if no experiment with this
# name exists yet.
experiment_name = "tag_prediction_v21"
if not client.get_experiment_by_name(experiment_name):
    client.create_experiment(name=experiment_name, tags=experiment_tags)

# Make it the active experiment; the call returns the Experiment metadata.
experiment = mlflow.set_experiment(experiment_name)

# Artifact path under which every run stores its model.
artifact_path = "tag_prediction_21_artifact_path"

In [None]:
# Baseline comparison: every embedding x every model with default
# hyper-parameters, one MLflow run per combination.

# Train/test matrices for each embedding, keyed by the name logged to MLflow.
splits_by_embedding = {
    'tf_idf': (X_tf_idf_train, X_tf_idf_test),
    'word2vec': (X_word2vec_train, X_word2vec_test),
    'bert': (X_bert_train, X_bert_test),
    'use': (X_use_train, X_use_test),
}

for embeddings_name in ['tf_idf', 'word2vec', 'bert', 'use']:
    for model_name in ['logistic_regression', 'random_forest', 'xgboost']:

        run_name = embeddings_name + '_default_' + model_name

        params = {'embeddings_name': embeddings_name, 'model_name': model_name}

        X_train, X_test = splits_by_embedding[embeddings_name]

        if model_name == 'logistic_regression':
            # One binary Logistic Regression per tag.
            model = OneVsRestClassifier(LogisticRegression(random_state=0))
        elif model_name == 'random_forest':
            model = RandomForestClassifier(random_state=0)
        else:
            model = XGBClassifier(random_state=0)

        # Time training and inference separately; both are logged as metrics.
        fit_start = time.time()
        model.fit(X_train, y_train)
        fit_duration = time.time() - fit_start

        pred_start = time.time()
        y_pred = model.predict(X_test)
        pred_duration = time.time() - pred_start

        # Micro-averaged scores: all (sample, tag) decisions are pooled.
        f1 = f1_score(y_test, y_pred, average='micro')
        precision = precision_score(y_test, y_pred, average='micro')
        recall = recall_score(y_test, y_pred, average='micro')

        metrics = {
            'fit_duration': fit_duration, 'pred_duration': pred_duration,
            'f1': f1, 'precision': precision, 'recall': recall
        }

        with mlflow.start_run(run_name=run_name) as run:
            mlflow.log_params(params)
            mlflow.log_metrics(metrics)
            # Log only a few rows as the input example: passing the whole
            # test matrix would store a copy of it with every run.
            mlflow.sklearn.log_model(
                sk_model=model, input_example=X_test.head(), artifact_path=artifact_path
            )
        print(f"run name: {run_name}.")
        print(metrics)

In [None]:
# Top run across all experiments, ranked by the logged `f1` metric.
mlflow.search_runs(
    search_all_experiments=True,
    order_by=["metrics.f1 DESC"],
).iloc[0]

# Model fine-tuning

In [3]:
# We want to fine-tune XGBoost, but it takes too long to run as is
# because X has 4,000 features, so we first reduce the number of features.
# To do that, we use the Logistic Regression model to find the most
# important features: for each tag, we keep the 20 features with the
# highest coefficients.
# This gives us a smaller X,
# on which we can then fine-tune XGBoost.

In [None]:
# Best Logistic Regression run trained on the tf-idf features.
# The embeddings filter matters: the coefficients of this model are later
# paired with the columns of X_tf_idf, so a model trained on another
# embedding (different feature count) must not be selected here.
lr_model_artifact_uri = (
    mlflow.search_runs(
        filter_string=(
            "params.model_name = 'logistic_regression' "
            "and params.embeddings_name = 'tf_idf'"
        ),
        order_by=["metrics.f1 DESC"],
        search_all_experiments=True
    ).iloc[0]['artifact_uri']
)

# The model was logged under `artifact_path` inside the run's artifact root.
path = lr_model_artifact_uri + '/' + artifact_path

lr_model = mlflow.sklearn.load_model(path)

In [None]:
def get_most_important_features(lr_model, X, tags=tags, n_features_per_tag=20):
    """Select the features with the largest coefficients for each tag.

    For the i-th tag, the first row of ``lr_model.estimators_[i].coef_`` is
    paired with ``X.columns`` and the ``n_features_per_tag`` features with the
    highest coefficient values are kept. The result is the union over all tags.

    Parameters
    ----------
    lr_model : fitted model exposing ``estimators_``, one estimator per tag,
        each with a ``coef_`` attribute.
        NOTE(review): assumes ``estimators_`` is ordered like ``tags`` — confirm.
    X : DataFrame whose columns are the features the model was trained on
        (same number and order as the coefficients).
    tags : tag names; only their count/position is used here.
    n_features_per_tag : number of top features kept per tag.

    Returns
    -------
    list of feature names, in the order they appear in ``X.columns``. The
    order is deterministic (a plain ``list(set)`` is not stable across
    interpreter sessions), so the reduced matrices have reproducible columns.
    """
    selected_features = set()
    for i, tag in enumerate(tags):
        coefs = lr_model.estimators_[i].coef_[0]
        feature_importance_for_that_tag = pd.Series(
            data=coefs,
            index=X.columns
        )

        top_features_for_that_tag = (
            feature_importance_for_that_tag
            .sort_values(ascending=False)
            .head(n_features_per_tag)
            .index
        )
        selected_features.update(top_features_for_that_tag)

    # Emit in X's column order; dict.fromkeys also drops repeated names.
    return list(dict.fromkeys(col for col in X.columns if col in selected_features))

In [None]:
# Union of the top features of every tag, taken from the tf-idf columns.
most_important_features = get_most_important_features(lr_model, X_tf_idf)

In [None]:
# Keep only the selected feature columns in both tf-idf splits.
X_tf_idf_20_train = X_tf_idf_train.loc[:, most_important_features]
X_tf_idf_20_test = X_tf_idf_test.loc[:, most_important_features]

In [None]:
# Grid search over XGBoost hyper-parameters on the reduced tf-idf matrix,
# one MLflow run per combination.
for embeddings_name in ['tf_idf_top_20']:
    for model_name in ['xgboost']:
        for eta in [.1, .3]:
            for colsample_bytree in [.5, 1]:
                for max_depth in [5, 6, 7]:

                    run_name = (
                        embeddings_name + '_' + model_name + '_eta_' + str(eta) +
                        '_colsample_bytree_' + str(colsample_bytree) + '_max_depth_' + str(max_depth)
                    )

                    params = {
                        'embeddings_name': embeddings_name, 'model_name': model_name,
                        'eta': eta, 'colsample_bytree': colsample_bytree, 'max_depth': max_depth
                    }

                    X_train, X_test = X_tf_idf_20_train, X_tf_idf_20_test

                    model = XGBClassifier(
                        eta=eta, colsample_bytree=colsample_bytree, max_depth=max_depth,
                        random_state=0
                    )

                    # Time training and inference separately.
                    fit_start = time.time()
                    model.fit(X_train, y_train)
                    fit_duration = time.time() - fit_start

                    pred_start = time.time()
                    y_pred = model.predict(X_test)
                    pred_duration = time.time() - pred_start

                    f1 = f1_score(y_test, y_pred, average='micro')
                    precision = precision_score(y_test, y_pred, average='micro')
                    recall = recall_score(y_test, y_pred, average='micro')

                    metrics = {
                        'fit_duration': fit_duration, 'pred_duration': pred_duration,
                        'f1': f1, 'precision': precision, 'recall': recall
                    }

                    with mlflow.start_run(run_name=run_name) as run:
                        mlflow.log_params(params)
                        mlflow.log_metrics(metrics)
                        # A few rows are enough as an input example; the full
                        # test matrix would be stored again with every run.
                        mlflow.sklearn.log_model(
                            sk_model=model, input_example=X_test.head(), artifact_path=artifact_path
                        )
                    # The former `duration = time.time() - start` line was
                    # removed: `start` was never defined (NameError) and the
                    # value was unused; fit/pred durations are logged above.
                    print(f"run name: {run_name}.")
                    print(metrics)

In [None]:
# Best run by `f1` across all experiments, now including the tuning runs.
mlflow.search_runs(
    search_all_experiments=True,
    order_by=["metrics.f1 DESC"],
).iloc[0]

# Model for tag prediction app

In [None]:
# We're looking for a light, robust and high-performance model for the tag prediction app.

# Fine-tuning Logistic Regression on full tf_idf embeddings.

In [None]:
# Compare regularised vs. unregularised Logistic Regression on the full
# tf-idf features, one MLflow run per penalty setting.
for embeddings_name in ['tf_idf']:
    for model_name in ['logistic_regression']:
        for penalty in ['l2', 'none']:

            run_name = (
                embeddings_name + '_' + model_name + '_penalty_' + penalty
            )

            params = {
                'embeddings_name': embeddings_name, 'model_name': model_name,
                'penalty': penalty
            }

            X_train, X_test = X_tf_idf_train, X_tf_idf_test

            # Recent scikit-learn spells "no regularisation" as penalty=None;
            # the 'none' string is deprecated and rejected by newer releases.
            # The string label is still used for the run name and logged param.
            penalty_arg = None if penalty == 'none' else penalty
            model = OneVsRestClassifier(LogisticRegression(penalty=penalty_arg, random_state=0))

            # Time training and inference separately.
            fit_start = time.time()
            model.fit(X_train, y_train)
            fit_duration = time.time() - fit_start

            pred_start = time.time()
            y_pred = model.predict(X_test)
            pred_duration = time.time() - pred_start

            f1 = f1_score(y_test, y_pred, average='micro')
            precision = precision_score(y_test, y_pred, average='micro')
            recall = recall_score(y_test, y_pred, average='micro')

            metrics = {
                'fit_duration': fit_duration, 'pred_duration': pred_duration,
                'f1': f1, 'precision': precision, 'recall': recall
            }

            with mlflow.start_run(run_name=run_name) as run:
                mlflow.log_params(params)
                mlflow.log_metrics(metrics)
                # A few rows are enough as an input example; the full test
                # matrix would be stored again with every run.
                mlflow.sklearn.log_model(
                    sk_model=model, input_example=X_test.head(), artifact_path=artifact_path
                )
            # The former `duration = time.time() - start` line was removed:
            # `start` was never defined (NameError) and the value was unused.
            print(f"run name: {run_name}.")
            print(metrics)

In [None]:
# Best Logistic Regression run on the tf-idf features. The embeddings filter
# keeps Logistic Regression runs trained on other embeddings out of the
# ranking, since this section is about the tf-idf-based model for the app.
mlflow.search_runs(
    filter_string=(
        "params.model_name = 'logistic_regression' "
        "and params.embeddings_name = 'tf_idf'"
    ),
    order_by=["metrics.f1 DESC"],
    search_all_experiments=True
).iloc[0]

In [None]:
# Best Logistic Regression run trained on the tf-idf features.
# The embeddings filter is required: this model is used further down with
# tf-idf features rebuilt from the title/body vectorizers, so a Logistic
# Regression trained on another embedding must not be picked.
best_lr_model_artifact_uri = (
    mlflow.search_runs(
        filter_string=(
            "params.model_name = 'logistic_regression' "
            "and params.embeddings_name = 'tf_idf'"
        ),
        order_by=["metrics.f1 DESC"],
        search_all_experiments=True
    ).iloc[0]['artifact_uri']
)

# The model was logged under `artifact_path` inside the run's artifact root.
path = best_lr_model_artifact_uri + '/' + artifact_path

best_lr_model = mlflow.sklearn.load_model(path)

In [None]:
# Saving the model as Pickle file to use it in the tag prediction app.

In [None]:
# The context manager guarantees the file is flushed and closed once the
# model has been written.
with open('tag_predictor.sav', "wb") as tag_predictor_file:
    pickle.dump(best_lr_model, tag_predictor_file)

# Evaluating model drift

In [None]:
year_range = range(2008, 2024)

# tf-idf features of the questions of each year, keyed by year.
data_drift_Xs_dict = {}
for year in year_range:
    is_from_year = df['year'] == year
    year_data_drift_df = df[is_from_year]

    year_data_drift_X = get_X_tf_idf(year_data_drift_df, title_vectorizer, body_vectorizer)
    data_drift_Xs_dict[year] = year_data_drift_X

In [None]:
# Score the selected Logistic Regression model on each year's questions,
# one MLflow run per year, to see how its quality evolves over time.
for embeddings_name in ['tf_idf']:
    for model_name in ['logistic_regression']:
        for year in year_range:
            run_name = (
                embeddings_name + '_best_' + model_name
                + '_year_' + str(year)
            )

            params = {
                'embeddings_name': embeddings_name, 'model_name': model_name,
                'year': year
            }

            X_test = data_drift_Xs_dict[year]
            # Ground truth for exactly the rows the year's features were
            # built from (same `df['year'] == year` selection, same order).
            # Scoring against the global `y_test` split would compare
            # predictions with labels of unrelated rows.
            y_year = y.loc[df['year'] == year]

            model = best_lr_model

            pred_start = time.time()
            y_pred = model.predict(X_test)
            pred_duration = time.time() - pred_start

            f1 = f1_score(y_year, y_pred, average='micro')
            precision = precision_score(y_year, y_pred, average='micro')
            recall = recall_score(y_year, y_pred, average='micro')

            metrics = {
                'pred_duration': pred_duration,
                'f1': f1, 'precision': precision, 'recall': recall
            }

            with mlflow.start_run(run_name=run_name) as run:
                mlflow.log_params(params)
                mlflow.log_metrics(metrics)
                # A few rows are enough as an input example; the full yearly
                # matrix would be stored again with every run.
                mlflow.sklearn.log_model(
                    sk_model=model, input_example=X_test.head(), artifact_path=artifact_path
                )
            # The former `duration = time.time() - start` line was removed:
            # `start` was never defined (NameError) and the value was unused.
            print(f"run name: {run_name}.")
            print(metrics)