In [1]:
# Import Python libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval

In [2]:
data = pd.read_csv('StackOverflow_cleaned.csv',sep=";", index_col=0,converters={"Title": literal_eval,
                                                                                 "Body": literal_eval,
                                                                                  "Tags": literal_eval})

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24044 entries, 0 to 27048
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   24044 non-null  object
 1   Body    24044 non-null  object
 2   Score   24044 non-null  int64 
 3   Tags    24044 non-null  object
dtypes: int64(1), object(3)
memory usage: 939.2+ KB


In [4]:
data=data[0:500]

In [5]:
data

Unnamed: 0,Title,Body,Score,Tags
0,"[core, dump, linux, segmentation, fault]","[process, linux, segmentation, fault, core, dump]",237,"[linux, bash]"
1,"[php, server, mysql, server, time, zone]","[hosting, package, godaddy, network, solution,...",12,"[php, mysql]"
2,"[constraint, landscape, orientation]","[constraint, device, example, image, portrait,...",48,"[ios, cocoa-touch]"
3,"[loading, system, servicemodel, configuration,...","[net, wcf, wcf, configuration, client, applica...",64,"[c#, .net]"
4,"[xml, serialization, inherited, type]","[question, object, model, xml, problem, quelle...",86,[c#]
...,...,...,...,...
550,"[xpath, query, html, table, work, firebug, app...","[question, target, candidate, week, applicatio...",21,[html]
551,"[convert, generic, collection, datatable]","[collection, list, datatable, code, problem, p...",80,[c#]
552,"[aspnet, custom, validator, client, side, serv...","[reason, client, side, validation, event, side...",74,"[c#, .net, asp.net]"
553,"[django, python, web, framework]","[python, web, framework, time, bullet, framewo...",61,"[python, django]"


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
X = data["Body"]
y = data["Tags"]

vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.6,
                             min_df=0.005,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)


vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)


In [7]:
X_tfidf.shape

(500, 606)

In [8]:
X_tfidf

<500x606 sparse matrix of type '<class 'numpy.float64'>'
	with 6566 stored elements in Compressed Sparse Row format>

RandomForest ONeVsRest

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split



# transform output : 
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)
y_bin = multilabel_binarizer.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size=0.3, random_state=8)



In [10]:
# Initialize RandomForest with OneVsRest
param_rfc = {"estimator__max_depth": [5, 25, 50],
             "estimator__min_samples_leaf": [1, 5, 10],
             "estimator__class_weight": ["balanced"]}

multi_rfc_cv = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=2,
                            scoring="f1_weighted",
                            return_train_score = True,
                            refit=True,
                            verbose=3)
# Fit on Sample data
multi_rfc_cv.fit(X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


GridSearchCV(cv=2,
             estimator=OneVsRestClassifier(estimator=RandomForestClassifier()),
             n_jobs=-1,
             param_grid={'estimator__class_weight': ['balanced'],
                         'estimator__max_depth': [5, 25, 50],
                         'estimator__min_samples_leaf': [1, 5, 10]},
             return_train_score=True, scoring='f1_weighted', verbose=3)

In [11]:
rfc_cv_results = pd.DataFrame.from_dict(multi_rfc_cv.cv_results_)
rfc_best_params = multi_rfc_cv.best_params_
print(rfc_best_params)

{'estimator__class_weight': 'balanced', 'estimator__max_depth': 25, 'estimator__min_samples_leaf': 5}


In [12]:
rfc_best_params_ok = {}
for k, v in rfc_best_params.items():
    rfc_best_params_ok[k.replace("estimator__","")] = v

In [14]:
# Refit RandomForestClassifier best_params with full dataset
rfc_final_model = OneVsRestClassifier(RandomForestClassifier(**rfc_best_params_ok))
rfc_final_model.fit(X_train, y_train)

# Predict
y_test_predicted_labels_tfidf_rfc = rfc_final_model.predict(X_test)

# Inverse transform
y_test_pred_inversed_rfc = multilabel_binarizer.inverse_transform(y_test_predicted_labels_tfidf_rfc)
y_test_inversed = multilabel_binarizer.inverse_transform(y_test)
print("Predicted:", y_test_pred_inversed_rfc[0:5])
print("True:", y_test_inversed[0:5])

Predicted: [('.net', 'asp.net'), (), ('bash', 'jquery', 'linux', 'shell'), (), ()]
True: [('performance',), ('.net', 'c#'), ('bash',), ('node.js',), ('python',)]


In [15]:
def metrics_score(model, df, y_true, y_pred):
 
    if(df is not None):
        temp_df = df
    else:
        temp_df = pd.DataFrame(index=["Accuracy", "F1",
                                      "Jaccard", "Recall",
                                      "Precision"],
                               columns=[model])
        
    scores = []
    scores.append(metrics.accuracy_score(y_true, 
                                         y_pred))
    scores.append(metrics.f1_score(y_pred, 
                                   y_true, 
                                   average='weighted'))
    scores.append(metrics.jaccard_score(y_true, 
                                        y_pred, 
                                        average='weighted'))
    scores.append(metrics.recall_score(y_true, 
                                       y_pred, 
                                       average='weighted'))
    scores.append(metrics.precision_score(y_true, 
                                          y_pred, 
                                          average='weighted'))
    temp_df[model] = scores
    
    return temp_df
    
df_metrics_compare = metrics_score("RandomForest", 
                                   df=None,
                                   y_true = y_test,
                                   y_pred = y_test_predicted_labels_tfidf_rfc)
df_metrics_compare


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,RandomForest
Accuracy,0.086667
F1,0.303404
Jaccard,0.14556
Recall,0.1841
Precision,0.311666
