In [1]:
TRAIN_PATH = "../data/processed/00_train_df.pkl"
VAL_PATH = "../data/processed/00_validation_df.pkl"
TEST_PATH = "../data/processed/00_test_df.pkl"

TRAIN_FEATURES_BOW = "../data/processed/01_train_features_BOW.pkl"
VAL_FEATURES_BOW = "../data/processed/01_validation_features_BOW.pkl"
TEST_FEATURES_BOW = "../data/processed/01_test_features_BOW.pkl"

TRAIN_FEATURES_TFIDF = "../data/processed/01_train_features_TFIDF.pkl"
VAL_FEATURES_TFIDF = "../data/processed/01_validation_features_TFIDF.pkl"
TEST_FEATURES_TFIDF = "../data/processed/01_test_features_TFIDF.pkl"


TRAIN_TARGET_EXPORT = "../data/processed/01_train_target.pkl"
VAL_TARGET_EXPORT = "../data/processed/01_validation_target.pkl"

In [2]:
# Load packages
import pandas as pd
import numpy as np

import pickle

from sklearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

___
## Functions:

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, model_name, Average='macro'):
    """
    Calculate the quality of the model according to different metric scores
    Input:
        ground_truth: from real observed data
        predictions: the predicted values from the model
        metric_function: the metric score funcrion used to measure performance
    Output:
        A dict of all scores for the given inputs
    """
    quality_score = {}
    if metric_function.__name__ == 'accuracy_score':
        quality_score[model_name] = round(metric_function(ground_truth, predictions), 3)
    else:
        quality_score[model_name] = round(metric_function(ground_truth, predictions, average=Average), 3)

    quality_score = pd.Series(quality_score.values(), index=quality_score.keys())
    
    return quality_score

In [4]:
def evaluate_model(model, data_list, metrics_list, model_name, score_average='macro'):
    """
    Get the scores of the model to better understand its performance
    Input:
        data_list: a list of all data that we evaluate model upon, train and test data.
                typical input : [[X_train, y_train, 'train'], [X_test, y_test, 'test']]
        scores_list: a list of all metrics used in the evaluation. 
                typical input : [accuracy_score, precision_score, recall_score, f1_score]
        model_name: a string input used as the index for score dataframe.
    Output:
        scores: a dataframe of evaluation based on data.
        general_error: the generalized error that would be used for logging in mlflow
    """
    predicts = []
    scores = []
    for [X,y,stage] in data_list:

        probas = model.predict(X)
        predictions = pd.DataFrame(probas)
        predicts.append(predictions)

        result = {score.__name__:calculate_quality(y, predictions, score, f"{model_name}_{stage}", score_average)
                        for score in metrics_list}

        result = pd.concat(result, axis=1)
        scores.append(result)
    scores = pd.concat(scores)
    return scores, predicts

In [35]:
def print_words_for_tag(classifier, tag, tags_classes, index_to_words):
    """
        classifier: trained classifier
        tag: particular tag
        tags_classes: a list of classes names from MultiLabelBinarizer
        index_to_words: index_to_words transformation
        all_words: all words in the dictionary
        
        return nothing, just print top 5 positive and top 5 negative words for current tag
    """
    print('Tag:\t{}'.format(tag))
    
    coef = classifier.coef_[tags_classes.index(tag)]
    
    top_positive_words = [index_to_words[idx] for idx in coef.argsort()[-1:-10:-1]]# top-5 words sorted by the coefficiens.
    top_negative_words = [index_to_words[idx] for idx in coef.argsort()[:10]]# bottom-5 words  sorted by the coefficients.
    print('Top positive words:\t{}'.format(', '.join(top_positive_words)))
    print('Top negative words:\t{}\n'.format(', '.join(top_negative_words)))

___
## Read data:

In [6]:
train_df = pd.read_pickle(TRAIN_PATH)
val_df = pd.read_pickle(VAL_PATH)
test_df = pd.read_pickle(TEST_PATH)

In [7]:
# Load BOW features
with open(TRAIN_FEATURES_BOW, 'rb') as handle:
    X_train_BOW = pickle.load(handle)

with open(VAL_FEATURES_BOW, 'rb') as handle:
    X_val_BOW = pickle.load(handle)

with open(TEST_FEATURES_BOW, 'rb') as handle:
    X_test_BOW = pickle.load(handle)

In [8]:
# Load TF-IDF features
with open(TRAIN_FEATURES_TFIDF, 'rb') as handle:
    X_train_tfidf = pickle.load(handle)

with open(VAL_FEATURES_TFIDF, 'rb') as handle:
    X_val_tfidf = pickle.load(handle)

with open(TEST_FEATURES_TFIDF, 'rb') as handle:
    X_test_tfidf = pickle.load(handle)

In [9]:
# Load MultiLabelBinarizer target
with open(TRAIN_TARGET_EXPORT, 'rb') as handle:
    y_train = pickle.load(handle)

with open(VAL_TARGET_EXPORT, 'rb') as handle:
    y_val = pickle.load(handle)

In [10]:
# Load the MLB
with open('../data/objects/mlb.pkl', 'rb') as handle:
    mlb = pickle.load(handle)
    
# Load the BOW_vectorizer
with open('../data/objects/BOW_vectorizer.pkl', 'rb') as handle:
    BOW_vectorizer = pickle.load(handle)
    
# Load the tfidf_vectorizer
with open('../data/objects/tfidf_vectorizer.pkl', 'rb') as handle:
    tfidf_vectorizer = pickle.load(handle)

___
## Construct the base model:

In [11]:
lor = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')

___
## Train model with Bag_Of_Words representaiton:

In [12]:
BOW_clf = make_pipeline(OneVsRestClassifier(lor))
BOW_clf.fit(X_train_BOW, y_train)

Pipeline(steps=[('onevsrestclassifier',
                 OneVsRestClassifier(estimator=LogisticRegression(solver='liblinear')))])

In [13]:
# Prepare data and metrics that would be used to evaluate the model
data_list = [[X_train_BOW, y_train, 'train'], [X_val_BOW, y_val, 'test']]
metrics_list = [accuracy_score, precision_score, recall_score, f1_score] 

In [14]:
# Use evaluate_model function to run evaluations
scores_BOW, predicts_BOW = evaluate_model(BOW_clf, data_list, metrics_list, "LogisticRegression", 'micro')
scores_BOW

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression_train,0.38,0.888,0.565,0.691
LogisticRegression_test,0.329,0.843,0.525,0.648


In [15]:
# Use evaluate_model function to run evaluations
scores_BOW, predicts_BOW = evaluate_model(BOW_clf, data_list, metrics_list, "LogisticRegression", 'macro')
scores_BOW

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression_train,0.38,0.818,0.458,0.565
LogisticRegression_test,0.329,0.694,0.401,0.494


In [16]:
# Use evaluate_model function to run evaluations
scores_BOW, predicts_BOW = evaluate_model(BOW_clf, data_list, metrics_list, "LogisticRegression", 'weighted')
scores_BOW

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression_train,0.38,0.864,0.565,0.669
LogisticRegression_test,0.329,0.801,0.525,0.624


**Run some random samples to test the classifier** 

In [17]:
# Get the invers of the y_val predictions to find the predicted class
y_val_inv_BOW = mlb.inverse_transform(predicts_BOW[1].values) # (predicts) has the predictions of both X_train_BOW and X_val_BOW

In [18]:
samples = np.random.randint(1, val_df.shape[0], 5)
for i in samples:
    print("title: ", val_df['title'][i])
    print('True labels: ', val_df['tags'][i])
    print("Predicted labels: ", y_val_inv_BOW[i])
    print()

title:  way create bean class json response
True labels:  ['java', 'json']
Predicted labels:  ('java', 'json')

title:  windowonload fired
True labels:  ['javascript']
Predicted labels:  ('javascript',)

title:  format lld expects type long long int argument 4 type int64_t
True labels:  ['c', 'linux']
Predicted labels:  ()

title:  would pass data child view parent view using iphone sdk
True labels:  ['iphone', 'objective-c']
Predicted labels:  ('iphone', 'objective-c')

title:  welcome back youve already connected app via google+ signin
True labels:  ['javascript']
Predicted labels:  ()



___
## Train model with TF-IDF representaiton:

In [19]:
tfidf_clf = make_pipeline(OneVsRestClassifier(lor))
tfidf_clf.fit(X_train_tfidf, y_train)

Pipeline(steps=[('onevsrestclassifier',
                 OneVsRestClassifier(estimator=LogisticRegression(solver='liblinear')))])

In [20]:
# Prepare data and metrics that would be used to evaluate the model
data_list = [[X_train_tfidf, y_train, 'train'], [X_val_tfidf, y_val, 'test']]
metrics_list = [accuracy_score, precision_score, recall_score, f1_score] 

In [21]:
# Use evaluate_model function to run evaluations
scores_tfidf, predicts_tfidf = evaluate_model(tfidf_clf, data_list, metrics_list, "LogisticRegression", 'micro')
scores_tfidf

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression_train,0.362,0.92,0.522,0.666
LogisticRegression_test,0.334,0.893,0.501,0.642


In [22]:
# Use evaluate_model function to run evaluations
scores_tfidf, predicts_tfidf = evaluate_model(tfidf_clf, data_list, metrics_list, "LogisticRegression", 'macro')
scores_tfidf

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression_train,0.362,0.797,0.366,0.478
LogisticRegression_test,0.334,0.734,0.34,0.446


In [23]:
# Use evaluate_model function to run evaluations
scores_tfidf, predicts_tfidf = evaluate_model(tfidf_clf, data_list, metrics_list, "LogisticRegression", 'weighted')
scores_tfidf

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression_train,0.362,0.883,0.522,0.639
LogisticRegression_test,0.334,0.845,0.501,0.614


**Run some random samples to test the classifier** 

In [24]:
# Get the invers of the y_val predictions to find the predicted class
y_val_inv_tfidf = mlb.inverse_transform(predicts_tfidf[1].values) # (predicts) has the predictions of both X_train_BOW and X_val_BOW

In [25]:
samples = np.random.randint(1, val_df.shape[0], 5)
for i in samples:
    print("title: ", val_df['title'][i])
    print('True labels: ', val_df['tags'][i])
    print("Predicted labels: ", y_val_inv_tfidf[i])
    print()

True labels:  ['c++']
Predicted labels:  ('c++',)

title:  itext merge documents acrofields
True labels:  ['java']
Predicted labels:  ('java',)

title:  get current index combobox
True labels:  ['c#', '.net', 'winforms']
Predicted labels:  ()

title:  validate uitextview allow special characters like
True labels:  ['iphone', 'objective-c']
Predicted labels:  ()

title:  angulartojson scopeobject jsonstringify scopeobject return null
True labels:  ['javascript', 'json', 'angularjs']
Predicted labels:  ()



___
## Analysis of most important features for each class:

**1. BOW classifier:**

In [26]:
BOW_reversed_vocab = {i:word for word, i in tfidf_vectorizer.vocabulary_.items()}

In [33]:
print_words_for_tag(BOW_clf.named_steps['onevsrestclassifier'], 'c', mlb.classes, BOW_reversed_vocab)
print_words_for_tag(BOW_clf.named_steps['onevsrestclassifier'], 'c++', mlb.classes, BOW_reversed_vocab)
print_words_for_tag(BOW_clf.named_steps['onevsrestclassifier'], 'linux', mlb.classes, BOW_reversed_vocab)
print_words_for_tag(BOW_clf.named_steps['onevsrestclassifier'], 'javascript', mlb.classes, BOW_reversed_vocab)
print_words_for_tag(BOW_clf.named_steps['onevsrestclassifier'], 'python', mlb.classes, BOW_reversed_vocab)

Tag:	c
Top positive words:	branching, best, back button, breakpoints, content div, c# problem, connections, count elements, data points
Top negative words:	#1, #ifdef, + c#, 2 decimal, 12, #2, 0, #pragma, 10 seconds, 1 mysqli

Tag:	c++
Top positive words:	add field, angularjs form, basic net, checkbox click, based application, applied, color value, call member, cast
Top negative words:	#1, #ifdef, + c#, 10 seconds, 2 decimal, 12, 0, #pragma, 2d numpy, 192

Tag:	linux
Top positive words:	access array, c using, data points, code c++, allowed memory, content html, blade, c# problem, descriptors
Top negative words:	#ifdef, + c#, 1 mysqli, 301, + php, + ajax, + sql, 1 0, 1404, browser windows

Tag:	javascript
Top positive words:	#ifdef, ascending order, 3 seconds, abc, access session, auto complete, 2d numpy, associated, background color
Top negative words:	24 hours, #pragma, 2 decimal, 22, access file, 3 razor, 10 seconds, alt, ado, ajax javascript

Tag:	python
Top positive words:	#pragma,

**2. TF-IDF classifier:**

In [28]:
tfidf_reversed_vocab = {i:word for word, i in tfidf_vectorizer.vocabulary_.items()}

In [34]:
print_words_for_tag(tfidf_clf.named_steps['onevsrestclassifier'], 'c', mlb.classes, tfidf_reversed_vocab)
print_words_for_tag(tfidf_clf.named_steps['onevsrestclassifier'], 'c++', mlb.classes, tfidf_reversed_vocab)
print_words_for_tag(tfidf_clf.named_steps['onevsrestclassifier'], 'linux', mlb.classes, tfidf_reversed_vocab)
print_words_for_tag(tfidf_clf.named_steps['onevsrestclassifier'], 'javascript', mlb.classes, tfidf_reversed_vocab)
print_words_for_tag(tfidf_clf.named_steps['onevsrestclassifier'], 'python', mlb.classes, tfidf_reversed_vocab)

Tag:	c
Top positive words:	c, malloc, scanf, printf, gcc, pointer, linux, kernel, struct
Top negative words:	java, php, python, javascript, c#, objective c, objective, jquery, ruby, swift

Tag:	c++
Top positive words:	c++, qt, boost, mfc, opencv, stl, c++11, stdstring, boostasio
Top negative words:	java, php, python, javascript, c#, jquery, r, ruby, swift, rails

Tag:	linux
Top positive words:	linux, ubuntu, c, address, signal, shared, unix, fork, process
Top negative words:	javascript, c#, jquery, array, method, aspnet, image, android, page, string

Tag:	javascript
Top positive words:	javascript, jquery, js, angularjs, nodejs, angular, div, extjs, typescript
Top negative words:	python, c#, java, php, c++, django, swift, wpf, c, rails

Tag:	python
Top positive words:	python, pandas, numpy, matplotlib, flask, tkinter, sqlalchemy, django, beautifulsoup
Top negative words:	php, java, c#, javascript, jquery, c++, r, rails, c, swift



___
**As we can see; the classifier trained on the tfidf representation is more robust and accurate than the classifier trained on the BOW document representation; and so we can notice this clearly when we examine the most positive and negative words that affects our model. We can see that the words are somehow relevant to the tag or class it predicts in the tfidf classifier and so we can see the huge effect of this representation than the traditional BOW**