## Coding Exercise #0512

In [1]:
import numpy as np
import warnings
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition  import TruncatedSVD
warnings.filterwarnings('ignore')

### 1. Latent Semantic Analysis (LSA):

In [2]:
# The data: seven short, single-sentence documents.
my_docs = [
    "The economic slowdown is becoming more severe",
    "The movie was simply awesome",
    "I like cooking my own food",
    "Samsung is announcing a new technology",
    "Machine Learning is an example of awesome technology",
    "All of us were excited at the movie",
    "We have to do more to reverse the economic slowdown",
]

#### 1.1. Create a TF IDF representation:
TfidfVectorizer() arguments: <br>
- *max_features* : Maximum number of features (distinct words) to keep. <br>
- *min_df* : The minimum document frequency (DF). An integer is an absolute document count; a real number (0~1) is a proportion of documents. <br>
- *max_df* : The maximum document frequency (DF). An integer is an absolute document count; a real number (0~1) is a proportion of documents. Helps to filter out stop words. <br>

In [3]:
# Normalise every document to lower case before vectorising.
my_docs = list(map(str.lower, my_docs))

In [4]:
# Extra words to drop, in addition to NLTK's English stop-word list.
my_stop_words = ["us", "like"]

In [5]:
# Build the TF-IDF vectorizer: NLTK's English stop words plus the custom ones
# are removed, terms appearing in more than 3 documents are dropped, and at
# most 15 features are kept.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be
# installed; no nltk.download(...) call is visible in this notebook.
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english') + my_stop_words,
    max_df=3,
    min_df=1,
    max_features=15,
)
# Dense (documents x features) TF-IDF matrix.
X = vectorizer.fit_transform(my_docs).toarray()

In [6]:
# Size of X (= m x n): m = number of documents = 7, n = number of features
# (capped at max_features = 15 by the vectorizer).
X.shape

(7, 15)

In [7]:
# View the features: the vocabulary terms kept by the vectorizer,
# one per column of X.
features = vectorizer.get_feature_names_out()
print(features)

['awesome' 'becoming' 'cooking' 'economic' 'example' 'excited' 'food'
 'learning' 'machine' 'movie' 'new' 'reverse' 'samsung' 'slowdown'
 'technology']


#### 1.2. Apply the truncated SVD:

In [8]:
# Number of latent topics (SVD components) to extract.
n_topics = 4
# The default 'randomized' solver relies on random projections; fixing
# random_state makes the components (including their signs) reproducible
# across kernel restarts.
svd = TruncatedSVD(n_components=n_topics, n_iter=100, random_state=0)
svd.fit(X)

0,1,2
,n_components,4
,algorithm,'randomized'
,n_iter,100
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,
,tol,0.0


In [None]:
# Note on `tol` (listed in the estimator summary above): according to the
# scikit-learn documentation it is the convergence tolerance of the ARPACK
# solver (0 means machine precision) and it is ignored by the 'randomized'
# solver, which is the algorithm used here. It therefore has no effect on
# this fit.
# Get the V^t matrix (one row per topic, one column per feature).
vt = svd.components_
vtabs = np.abs(vt)
print(vtabs)
# vtabs holds the absolute value of every element of vt, which lets us judge
# the importance of each term within each topic regardless of its sign.

[[8.33391547e-18 3.64848333e-01 1.68000794e-17 6.05710899e-01
  9.20727509e-18 8.66577451e-17 1.68000794e-17 9.20727509e-18
  9.20727509e-18 7.35039981e-17 4.11912396e-17 3.64848333e-01
  4.11912396e-17 6.05710899e-01 4.14147237e-17]
 [5.24870792e-01 2.38677416e-17 8.54456102e-17 2.08166817e-17
  1.58868996e-01 3.55524692e-01 8.54456102e-17 1.58868996e-01
  1.58868996e-01 6.88111908e-01 7.49353714e-02 8.63177867e-17
  7.49353714e-02 2.08166817e-17 1.94077704e-01]
 [1.10873880e-01 6.94360073e-17 1.99390162e-14 4.18245990e-17
  2.53945040e-01 2.94139077e-01 1.99405654e-14 2.53945040e-01
  2.53945040e-01 3.44083006e-01 3.89758799e-01 2.92590836e-17
  3.89758799e-01 4.18245861e-17 5.34329467e-01]
 [4.30871310e-16 2.90406391e-17 7.07106781e-01 1.00759278e-16
  4.78654205e-15 4.87119823e-15 7.07106781e-01 4.78654205e-15
  4.78654205e-15 8.56858274e-15 1.50688455e-14 9.41605819e-17
  1.50688455e-14 6.57741757e-17 1.65316801e-14]]


In [11]:
# Check the size of V^t: (number of topics, number of features).
# The shape is printed explicitly because a bare `vt.shape` is only displayed
# when it is the last expression of the cell.
print(vt.shape)
# Signed component weights of V^t.
print(vt)

[[-8.33391547e-18  3.64848333e-01 -1.68000794e-17  6.05710899e-01
   9.20727509e-18  8.66577451e-17 -1.68000794e-17  9.20727509e-18
   9.20727509e-18  7.35039981e-17  4.11912396e-17  3.64848333e-01
   4.11912396e-17  6.05710899e-01  4.14147237e-17]
 [ 5.24870792e-01 -2.38677416e-17 -8.54456102e-17  2.08166817e-17
   1.58868996e-01  3.55524692e-01 -8.54456102e-17  1.58868996e-01
   1.58868996e-01  6.88111908e-01  7.49353714e-02  8.63177867e-17
   7.49353714e-02  2.08166817e-17  1.94077704e-01]
 [ 1.10873880e-01 -6.94360073e-17  1.99390162e-14  4.18245990e-17
   2.53945040e-01 -2.94139077e-01  1.99405654e-14  2.53945040e-01
   2.53945040e-01 -3.44083006e-01  3.89758799e-01 -2.92590836e-17
   3.89758799e-01  4.18245861e-17  5.34329467e-01]
 [ 4.30871310e-16  2.90406391e-17  7.07106781e-01 -1.00759278e-16
  -4.78654205e-15  4.87119823e-15  7.07106781e-01 -4.78654205e-15
  -4.78654205e-15  8.56858274e-15 -1.50688455e-14  9.41605819e-17
  -1.50688455e-14  6.57741757e-17 -1.65316801e-14]]


#### 1.3. From each topic, extract the top features:

In [11]:
# Number of top-weighted features to keep per topic.
n_top = 3
# Build a list of lists: for each topic (row of |V^t|), rank the feature
# indices by descending absolute weight (argsort of the negated row) and keep
# the names of the first n_top. Building the list in one expression also
# guarantees topic_matrix is always (re)defined, even when n_topics is 0.
topic_matrix = [
    [features[idx] for idx in np.argsort(-vtabs[i, :])[:n_top]]
    for i in range(n_topics)
]

In [12]:
# Show the top features for each topic: one inner list per topic,
# ordered from the largest absolute weight downwards.
topic_matrix

[['economic', 'slowdown', 'becoming'],
 ['movie', 'awesome', 'excited'],
 ['technology', 'new', 'samsung'],
 ['food', 'cooking', 'technology']]

In [13]:
# Human-chosen topic labels, assigned after inspecting the top features;
# the order follows the rows of topic_matrix.
topic_names = [
    'Economy',
    'Movie',
    'Technology',
    'Cuisine',
]

#### 1.4. Label each document with the most predominant topic:

In [14]:
# Label each document with the topic whose top features it mentions most.
# NOTE(review): nltk.word_tokenize needs NLTK tokenizer data to be installed;
# no nltk.download(...) call is visible in this notebook.
n_docs = len(my_docs)
for doc_no, doc in enumerate(my_docs, start=1):
    tokens = nltk.word_tokenize(doc)
    # Score of a topic = number of tokens that appear among its top features.
    scores = [
        sum(tok in topic_matrix[j] for tok in tokens)
        for j in range(n_topics)
    ]
    # max() returns the first index holding the highest score, so ties go to
    # the lowest-numbered topic and a document with no match falls back to
    # topic 0.
    topic_pick = max(range(n_topics), key=scores.__getitem__, default=0)
    print("Document " + str(doc_no) + " = " + topic_names[topic_pick])

Document 1 = Economy
Document 2 = Movie
Document 3 = Cuisine
Document 4 = Technology
Document 5 = Movie
Document 6 = Movie
Document 7 = Economy


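**NOTE**: We can notice some inaccuracies. For example, Document 5 ("Machine Learning is an example of awesome technology") is labeled *Movie* rather than *Technology*: it contains one top feature of each topic ('awesome' and 'technology'), and a tie is resolved in favor of the first topic in the list.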