In [2]:
import pandas as pd
import numpy as np

# Lesson for TF-IDF (NLP)
**Stands for Term-Frequency and Inverse Document Frequency**

Assigns scores to each word:
- Score goes UP when we have a big word frequency in a document
- Score goes DOWN when we have the same word across many documents (goes UP if not)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

def get_tfidf_matrix(docs):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(docs)
    return tfidf_matrix.toarray(), vectorizer.get_feature_names()

documents = ['The sky is bule', 'The sun is bright', 'The sun in the sky is bright', 'we can see the shining sun, the bright sun']

matrix = get_tfidf_matrix(documents)[0]
feature_names = get_tfidf_matrix(documents)[1]

print(feature_names)
print(matrix)

['bright', 'bule', 'shining', 'sky', 'sun']
[[0.         0.78528828 0.         0.6191303  0.        ]
 [0.70710678 0.         0.         0.         0.70710678]
 [0.53256952 0.         0.         0.65782931 0.53256952]
 [0.36626037 0.         0.57381765 0.         0.73252075]]


So here we have the columns as unique words, and the rows are sentences. The value is the 'score' of that word

In [8]:
averages_for_word = np.sum(matrix, axis=0)/len(feature_names)

In [10]:
print(feature_names)
print(averages_for_word)

['bright', 'bule', 'shining', 'sky', 'sun']
[0.32118734 0.15705766 0.11476353 0.25539192 0.39443941]


**Cool way to sort a dict by values!**

In [12]:
sorted(dict(zip(feature_names, averages_for_word)).items(), key=lambda x: x[1], reverse=True)

[('sun', 0.39443941015123685),
 ('bright', 0.3211873355368286),
 ('sky', 0.25539192195797),
 ('bule', 0.15705765514207934),
 ('shining', 0.11476353002337394)]

## Word2Vec
**Assigning a vector to each word**  
Words with similar meanings will be closer to each other in Euclidian space

In [20]:
import codecs

with codecs.open('glove.6B.300d.txt', 'r') as f:
    ls = {}
    for c, r in enumerate(f):
        sr = r.split()
        if sr[0] in ['pizza', 'food', 'sport']:
            ls[sr[0]] =[float(i) for i in sr[1:]]
        if len(ls) == 3:
            break
            
print(ls)

{'food': [0.38544, 0.34247, 0.29599, -0.262, 0.037383, 0.45544, 0.49097, 0.11481, -0.11437, -1.9067, 0.035563, -1.1094, -0.26512, 0.64418, -0.031008, -0.3513, -0.0010547, 0.074658, -0.30369, -0.28188, -0.34342, 0.36205, 0.71009, 0.30243, 0.070325, 0.29492, -0.16233, 0.30998, 0.13705, 0.11847, -0.68642, 0.43305, -0.61518, 0.23643, -0.84174, 0.14667, -0.096616, -0.20908, -0.42296, -0.27254, -0.79343, -0.62781, 0.64804, 0.11541, -0.33486, -0.14101, 0.12864, -0.25123, -0.26515, 0.30876, -0.063111, 0.17893, 0.41197, 0.019621, -0.15406, 0.17542, 0.39268, 0.088817, 0.018012, -0.22508, -0.31832, 0.022296, 0.59453, 0.056538, -0.72464, -0.31751, -0.3865, 0.33806, -0.16237, -0.0076169, 0.52897, 0.14628, -0.22458, -0.66751, 0.23012, -0.068667, 0.4668, 0.2204, -0.38321, -0.18401, 0.36828, 0.085637, -0.28836, 0.43894, 0.1579, -1.1443, -0.17327, -0.0027353, -0.32621, -0.2411, -0.14597, 0.061719, -0.3021, -0.16319, 0.0025848, 0.26203, 0.045155, -0.019056, -0.48923, 0.062269, 0.1214, 0.054817, 0.01157,

In [21]:
np.linalg.norm(np.array(ls['pizza']) - np.array(ls['food']))

7.63426225303502

In [22]:
np.linalg.norm(np.array(ls['pizza']) - np.array(ls['sport']))

9.341560240989656

In [23]:
np.linalg.norm(np.array(ls['food']) - np.array(ls['sport']))

8.896379012366626

By comparing the eucledian distances between pizza food and sport keywords, we can see which words are closer related to each other. As we can see above, pizza and food are more related to each other than pizza and sport and food and sport. Pretty cool stuff! 

## Sorting dict by values

In [36]:
d = {
    'hi': 1,
    'hello': 2,
    'wow': 10,
    'damn': 6,
    'dope': 3
}

print(f'unsorted: {d}')
s_d = sorted(d.items(), key=lambda x: x[1], reverse=False)
print(f'sorted: {s_d}')

unsorted: {'hi': 1, 'hello': 2, 'wow': 10, 'damn': 6, 'dope': 3}
sorted: [('hi', 1), ('hello', 2), ('dope', 3), ('damn', 6), ('wow', 10)]


In [37]:
## another way to sort dict by values

def sort_dict_by_value(d):
    key_list = np.array(list(d.keys()))
    val_list = np.array(list(d.values()))
    # Smallest to largest:
    ind_sorted_val = np.argsort(val_list)
    # Largest to smallest:
#     ind_sorted_val = np.argsort(val_list)[::-1]
    return key_list[ind_sorted_val]

print(sort_dict_by_value(d))

['hi' 'hello' 'dope' 'damn' 'wow']
