In [65]:
# Pickled train / validation / test frames read by this notebook.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    "../data/processed/00_train_df.pkl",
    "../data/processed/00_validation_df.pkl",
    "../data/processed/00_test_df.pkl",
)

# Export locations for the bag-of-words feature matrices.
TRAIN_FEATURES_BOW, VAL_FEATURES_BOW, TEST_FEATURES_BOW = (
    "../data/processed/01_train_features_BOW.pkl",
    "../data/processed/01_validation_features_BOW.pkl",
    "../data/processed/01_test_features_BOW.pkl",
)

# Export locations for the TF-IDF feature matrices.
TRAIN_FEATURES_TFIDF, VAL_FEATURES_TFIDF, TEST_FEATURES_TFIDF = (
    "../data/processed/01_train_features_TFIDF.pkl",
    "../data/processed/01_validation_features_TFIDF.pkl",
    "../data/processed/01_test_features_TFIDF.pkl",
)

# Export locations for the binarized targets (train and validation only).
TRAIN_TARGET_EXPORT, VAL_TARGET_EXPORT = (
    "../data/processed/01_train_target.pkl",
    "../data/processed/01_validation_target.pkl",
)

In [57]:
# Load packages
import pandas as pd
import numpy as np

import pickle

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from scipy import sparse

# Display settings: never truncate cell text, and show up to 1000 columns.
pd.options.display.max_colwidth = None
pd.set_option('display.max_columns', 1000)

___
## Read Data:

In [3]:
# Load the three pickled splits in train / validation / test order.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
train_df, val_df, test_df = (
    pd.read_pickle(path) for path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

In [4]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,title,tags
0,draw stacked dotplot r,[r]
1,mysql select records datetime field less specified value,"[php, mysql]"
2,terminate windows phone 81 app,[c#]
3,get current time specific country via jquery,"[javascript, jquery]"
4,configuring tomcat use ssl,[java]


___
## First step: count tag/word frequency:
**First, we count how often each tag and each title word occurs, so that we can rank them by frequency and keep the most frequent words as features for classification.**

In [5]:
# Number of training examples carrying each tag (each row holds a list of tags).
tags_counts = Counter(
    tag for tag_list in train_df['tags'] for tag in tag_list
)

# Occurrences of each whitespace-separated token across the training titles.
words_counts = Counter(
    token for title in train_df['title'] for token in title.split()
)

---
## Construct the BOW:

In [23]:
DICT_SIZE = 5000  # number of most frequent title words kept as BOW features

# Fixed vocabulary: the DICT_SIZE most frequent words, most frequent first
# (`most_common` yields (word, count) pairs in descending count order).
SORT_BY_FREQ = [word for word, _count in words_counts.most_common(DICT_SIZE)]

# The vocabulary was built from whitespace-split titles, so the vectorizer must
# tokenize the same way. The default token pattern only matches runs of two or
# more word characters, so vocabulary entries such as 'c#', 'c++' or 'r' would
# never be produced and their columns would stay all-zero.
BOW_vectorizer = CountVectorizer(vocabulary=SORT_BY_FREQ,
                                 token_pattern=r'(\S+)')

In [24]:
# Vectorize the training titles, then encode validation and test titles with
# the same vectorizer so all three matrices share one column layout.
X_train_BOW = BOW_vectorizer.fit_transform(train_df['title'])
X_val_BOW, X_test_BOW = (
    BOW_vectorizer.transform(split['title']) for split in (val_df, test_df)
)

In [13]:
# Inspect one sparse BOW row and the first ten vocabulary entries.
# Uses BOW_vectorizer: the bare name `vectorizer` is never defined in this
# notebook and raises NameError on a fresh kernel.
print(X_train_BOW[5])
print(BOW_vectorizer.get_feature_names_out()[:10])

  (0, 31)	1
  (0, 34)	1
  (0, 94)	1
  (0, 236)	1
  (0, 395)	1
  (0, 518)	1
  (0, 1143)	1
  (0, 2394)	1
  (0, 2508)	1
  (0, 4852)	1
['using' 'php' 'java' 'file' 'javascript' 'error' 'get' 'c#' 'python'
 'string']


In [14]:
# Report (rows, columns) for each BOW matrix.
for label, matrix in (
    ("X_train_BOW.shape: ", X_train_BOW),
    ("X_val_BOW.shape: ", X_val_BOW),
    ("X_test_BOW.shape: ", X_test_BOW),
):
    print(label, matrix.shape)

X_train_BOW.shape:  (100000, 5000)
X_val_BOW.shape:  (30000, 5000)
X_test_BOW.shape:  (20000, 5000)


___
## Construct TF-IDF:

In [21]:
# TF-IDF over unigrams and bigrams of whitespace-delimited tokens.
# min_df=5 drops terms seen in fewer than 5 titles; max_df=0.9 drops terms
# seen in more than 90% of titles.
# The pattern is a raw string: '\S' in a normal string literal is an invalid
# escape sequence and triggers a warning on modern Python versions.
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9,
                                   ngram_range=(1, 2),
                                   token_pattern=r'(\S+)')

In [22]:
# Fit the TF-IDF vocabulary and weights on the training titles only, then
# apply the fitted vectorizer to the validation and test titles.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['title'])
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split['title']) for split in (val_df, test_df)
)

In [27]:
# Show one sparse TF-IDF row, the training matrix shape, and the first ten
# feature names, one item per line.
print(
    X_train_tfidf[5],
    X_train_tfidf.shape,
    tfidf_vectorizer.get_feature_names_out()[:10],
    sep="\n",
)

  (0, 461)	0.32939124453958957
  (0, 9091)	0.3655867713235215
  (0, 17370)	0.3626593411993441
  (0, 16059)	0.28363456686039207
  (0, 2590)	0.3232152261847885
  (0, 10616)	0.22522950985922077
  (0, 415)	0.20280157594045314
  (0, 11993)	0.2731386994273628
  (0, 14326)	0.2021656129988156
  (0, 10551)	0.2534013186848059
  (0, 1378)	0.4121462714963972
(100000, 18300)
['#' '#1' '#2' '#define' '#ifdef' '#include' '#object' '#object method'
 '#pragma' '+']


In [35]:
# First five (column index, term) pairs of the fitted vocabulary, in dict order.
[(idx, term) for term, idx in list(tfidf_vectorizer.vocabulary_.items())[:5]]

[(4792, 'draw'),
 (14941, 'stacked'),
 (12748, 'r'),
 (10394, 'mysql'),
 (14019, 'select')]

___
## MultiLabelBinarizer:
**Construct the binary target matrix (one column per tag) using `MultiLabelBinarizer`.**

In [36]:
# Fix the class order to the alphabetically sorted tags seen in training.
mlb = MultiLabelBinarizer(classes=sorted(tags_counts))

In [37]:
# Binarize the training tags first, then encode the validation tags with the
# same binarizer so both matrices share one column order.
y_train, y_val = (
    mlb.fit_transform(train_df['tags']),
    mlb.transform(val_df['tags']),
)

In [40]:
# Preview the binarized targets as a labelled frame: one column per tag class,
# rows aligned with the training frame's index.
pd.DataFrame(
    data=y_train,
    index=train_df.index,
    columns=mlb.classes_,
).head()

Unnamed: 0,.net,ajax,algorithm,android,angularjs,apache,arrays,asp.net,asp.net-mvc,c,c#,c++,class,cocoa-touch,codeigniter,css,csv,database,date,datetime,django,dom,eclipse,entity-framework,excel,facebook,file,forms,function,generics,google-maps,hibernate,html,html5,image,ios,iphone,java,javascript,jquery,json,jsp,laravel,linq,linux,list,loops,maven,mongodb,multithreading,mysql,node.js,numpy,objective-c,oop,opencv,osx,pandas,parsing,performance,php,pointers,python,python-2.7,python-3.x,qt,r,regex,rest,ruby,ruby-on-rails,ruby-on-rails-3,selenium,servlets,session,sockets,sorting,spring,spring-mvc,sql,sql-server,string,swift,swing,twitter-bootstrap,uitableview,unit-testing,validation,vb.net,visual-studio,visual-studio-2010,wcf,web-services,windows,winforms,wordpress,wpf,xaml,xcode,xml
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


___
## Export and save results to use in modeling:

In [66]:
# Save BOW features
with open(TRAIN_FEATURES_BOW, 'wb') as handle:
    pickle.dump(X_train_BOW, handle)

with open(VAL_FEATURES_BOW, 'wb') as handle:
    pickle.dump(X_val_BOW, handle)

with open(TEST_FEATURES_BOW, 'wb') as handle:
    pickle.dump(X_test_BOW, handle)

In [67]:
# Save TF-IDF features
with open(TRAIN_FEATURES_TFIDF, 'wb') as handle:
    pickle.dump(X_train_tfidf, handle)

with open(VAL_FEATURES_TFIDF, 'wb') as handle:
    pickle.dump(X_val_tfidf, handle)

with open(TEST_FEATURES_TFIDF, 'wb') as handle:
    pickle.dump(X_test_tfidf, handle)

In [68]:
# Save MultiLabelBinarizer target
with open(TRAIN_TARGET_EXPORT, 'wb') as handle:
    pickle.dump(y_train, handle)

with open(VAL_TARGET_EXPORT, 'wb') as handle:
    pickle.dump(y_val, handle)