In [112]:
# Locations of the pickled train / validation / test splits.
# These are relative paths, so they resolve against the current working directory;
# the files are read with pd.read_pickle further down.
TRAIN_PATH = "predict_question_tags_stackoverflow/data/processed/00_train_df.pkl"
VAL_PATH = "predict_question_tags_stackoverflow/data/processed/00_validation_df.pkl"
TEST_PATH = "predict_question_tags_stackoverflow/data/processed/00_test_df.pkl"

In [2]:
import pandas as pd
import numpy as np

# Display settings: allow up to 1000 columns and never truncate cell text,
# so long titles are shown in full when a frame is rendered.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the three pre-processed splits (train, validation, test) in that order.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
train_df, val_df, test_df = (
    pd.read_pickle(split_path)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

In [4]:
# Preview the first rows of the training split (the rendered columns are `title` and `tags`).
train_df.head()

Unnamed: 0,title,tags
0,draw stacked dotplot r,[r]
1,mysql select records datetime field less specified value,"[php, mysql]"
2,terminate windows phone 81 app,[c#]
3,get current time specific country via jquery,"[javascript, jquery]"
4,configuring tomcat use ssl,[java]


## First step: count tag/word frequencies
**First we count how often each word and each tag occurs, so that we can sort them by frequency and pick the most useful ones as features for classification.**

In [80]:
from collections import Counter, defaultdict

# Frequency tables computed from the training split.
# (A defaultdict(int) would also work for counting; Counter is used here.)

# Every row of the 'tags' column is iterated as a collection of tags.
tags_counts = Counter(
    tag
    for row_tags in train_df['tags']
    for tag in row_tags
)

# Titles are tokenised by splitting on whitespace.
words_counts = Counter(
    token
    for title in train_df['title']
    for token in title.split()
)

**Next we build our bag-of-words (BOW) representation: each sentence has to be transformed into a vector of features.** \
First we decide which words act as features: we sort the words by frequency and keep the N most common ones (N is a number we specify). Then we loop through each sentence and check whether each of its words is one of the features; if it is, the matching position in the vector is incremented by 1 (so it ends up holding the number of times that word occurs in the sentence).

so for example: features are \['hi', 'you', 'me', 'are'\]

The sentence is "hi how are you"

so as a vector it would look like \[1, 1, 0, 1\]

and so on.

In [108]:
# first set your features and sort them with their indices
# Vocabulary size: only the DICT_SIZE most frequent title words become features.
DICT_SIZE = 5000

# Words ordered from most to least frequent; a word's position in this list
# is the column it occupies in the bag-of-words vector.
SORT_BY_FREQ = [word for word, _count in words_counts.most_common(DICT_SIZE)]

# word -> column index lookup, used to locate a word's slot in the vector.
WORDS_TO_INDEX = dict(zip(SORT_BY_FREQ, range(len(SORT_BY_FREQ))))

In [109]:
def bag_of_words(text, words_index_dict, dict_size):
    """Encode `text` as a bag-of-words count vector.

    Args:
        text: string to encode; it is tokenised by splitting on whitespace.
        words_index_dict: mapping from a known word to its position in the vector.
        dict_size: length of the returned vector.

    Returns:
        A NumPy float array of length `dict_size` in which each position holds the
        number of times the corresponding word occurs in `text`. Words that are
        not keys of `words_index_dict` are ignored.
    """
    counts = np.zeros(dict_size)
    # Resolve every known token to its slot; unknown tokens are skipped.
    known_slots = (
        words_index_dict[token]
        for token in text.split()
        if token in words_index_dict
    )
    for slot in known_slots:
        counts[slot] += 1
    return counts

**Now that the features and the function are defined, we transform the titles of the train, validation and test data into feature vectors.**

In [83]:
from scipy import sparse
def _titles_to_bow(titles):
    """Encode each title with bag_of_words and stack the rows into one sparse matrix.

    Every title becomes a single CSR row built from its dense count vector
    (using WORDS_TO_INDEX and DICT_SIZE); the rows are stacked in input order.
    """
    encoded_rows = [
        sparse.csr_matrix(bag_of_words(title, WORDS_TO_INDEX, DICT_SIZE))
        for title in titles
    ]
    return sparse.vstack(encoded_rows)


# Same encoding for all three splits; the vocabulary comes from the training titles only.
X_train_BOW = _titles_to_bow(train_df['title'])
X_val_BOW = _titles_to_bow(val_df['title'])
X_test_BOW = _titles_to_bow(test_df['title'])

In [110]:
# Sanity check: one encoded title (row 5), the matrix dimensions,
# and the ten most frequent vocabulary words.
print(X_train_BOW[5], X_train_BOW.shape, SORT_BY_FREQ[:10], sep='\n')

  (0, 31)	1.0
  (0, 34)	1.0
  (0, 94)	1.0
  (0, 236)	1.0
  (0, 395)	1.0
  (0, 518)	1.0
  (0, 1143)	1.0
  (0, 2394)	1.0
  (0, 2508)	1.0
  (0, 4852)	1.0
(100000, 5000)
['using', 'php', 'java', 'file', 'javascript', 'error', 'get', 'c#', 'python', 'string']


___
## We can use the CountVectorizer class to build our BOW as follows:

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
# Fix the vocabulary to the frequency-sorted word list so that column i
# corresponds to SORT_BY_FREQ[i], exactly as in the hand-written BOW above.
#
# token_pattern and lowercase are set explicitly so that tokenisation matches the
# plain whitespace split used by bag_of_words. With CountVectorizer's defaults,
# only runs of two or more word characters are kept and text is lower-cased, so
# vocabulary entries such as 'c#' (or any single-character word) could never be
# matched and their columns would silently stay empty.
vectorizer = CountVectorizer(vocabulary=SORT_BY_FREQ,
                             token_pattern=r'(\S+)',
                             lowercase=False)
X = vectorizer.fit_transform(train_df['title'])

In [95]:
# Same inspection as for the hand-written BOW: row 5, the matrix dimensions,
# and the first ten feature names of the vectorizer.
print(X[5], X.shape, vectorizer.get_feature_names_out()[:10], sep='\n')

  (0, 31)	1
  (0, 34)	1
  (0, 94)	1
  (0, 236)	1
  (0, 395)	1
  (0, 518)	1
  (0, 1143)	1
  (0, 2394)	1
  (0, 2508)	1
  (0, 4852)	1
(100000, 5000)
['using' 'php' 'java' 'file' 'javascript' 'error' 'get' 'c#' 'python'
 'string']


## TF-IDF
The second approach extends the bag-of-words framework by taking into account total frequencies of words in the corpora. It helps to penalize too frequent words and provide better features space.

Implement the function `tfidf_features` using the `TfidfVectorizer` class from scikit-learn. Use the train corpus to fit the vectorizer. Don't forget to take a look at the arguments that you can pass to it. We suggest that you filter out words that are too rare (occurring in fewer than 5 titles) and words that are too frequent (occurring in more than 90% of the titles). Also, use bigrams along with unigrams in your vocabulary.

In [113]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [114]:
# TF-IDF over unigrams and bigrams.
#   min_df=5   -> drop terms that appear in fewer than 5 titles
#   max_df=0.9 -> drop terms that appear in more than 90% of the titles
# token_pattern keeps every whitespace-delimited token, so symbols such as
# '#' or '+' survive tokenisation. It is written as a raw string because
# '\S' is not a valid Python string escape: the non-raw form triggers an
# invalid-escape-sequence warning, while the regex itself is unchanged.
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9,
                                   ngram_range=(1, 2),
                                   token_pattern=r'(\S+)')

In [115]:
# Learn the vocabulary and IDF weights from the training titles only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['title'])

# ... then reuse the fitted vectorizer, without refitting, for validation and test.
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split_df['title'])
    for split_df in (val_df, test_df)
)

In [116]:
print(X_train_tfidf[5])
print(X_train_tfidf.shape)
print(tfidf_vectorizer.get_feature_names_out()[:10])

  (0, 461)	0.32939124453958957
  (0, 9091)	0.3655867713235215
  (0, 17370)	0.3626593411993441
  (0, 16059)	0.28363456686039207
  (0, 2590)	0.3232152261847885
  (0, 10616)	0.22522950985922077
  (0, 415)	0.20280157594045314
  (0, 11993)	0.2731386994273628
  (0, 14326)	0.2021656129988156
  (0, 10551)	0.2534013186848059
  (0, 1378)	0.4121462714963972
(100000, 18300)
['#' '#1' '#2' '#define' '#ifdef' '#include' '#object' '#object method'
 '#pragma' '+']


In [119]:
# Look up which term sits at column 11993 of the fitted TF-IDF vocabulary
# (11993 is one of the column indices printed for row 5 of X_train_tfidf).
tfidf_vectorizer.get_feature_names_out()[11993]

'plugin'