# 잠재 의미 분석(Latent Semantic Analysis, LSA)
- https://wikidocs.net/24949
- LSA는 기본적으로 DTM이나 TF-IDF 행렬에 절단된 SVD(truncated SVD)를 사용하여 차원을 축소시키고, 단어들의 잠재적인 의미를 끌어낸다는 아이디어를 갖고 있습니다.

## Import

In [59]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

## SVD, Truncated SVD

### test matrix

In [2]:
A = np.array([
    [0,0,0,1,0,1,1,0,0],
    [0,0,0,1,1,0,1,0,0],
    [0,1,1,0,2,0,0,0,0],
    [1,0,0,0,0,0,0,1,1]
])
A.shape

(4, 9)

In [11]:
pd.DataFrame(
    A, 
    columns=["과일이", "길고", "노란", "먹고", "바나나", "사과", "싶은", "저는", "좋아요"],
    index=["doc1", "doc2", "doc3", "doc4"]
)

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
doc1,0,0,0,1,0,1,1,0,0
doc2,0,0,0,1,1,0,1,0,0
doc3,0,1,1,0,2,0,0,0,0
doc4,1,0,0,0,0,0,0,1,1


### Full SVD

In [13]:
U, s, V = np.linalg.svd(A, full_matrices=True)
U.shape, s.shape, V.shape

((4, 4), (4,), (9, 9))

In [17]:
U.round(2), s.round(2), V.round(2)

(array([[-0.24,  0.75,  0.  , -0.62],
        [-0.51,  0.44, -0.  ,  0.74],
        [-0.83, -0.49, -0.  , -0.27],
        [-0.  , -0.  ,  1.  ,  0.  ]]),
 array([2.69, 2.05, 1.73, 0.77]),
 array([[-0.  , -0.31, -0.31, -0.28, -0.8 , -0.09, -0.28, -0.  , -0.  ],
        [ 0.  , -0.24, -0.24,  0.58, -0.26,  0.37,  0.58, -0.  , -0.  ],
        [ 0.58, -0.  ,  0.  ,  0.  , -0.  ,  0.  , -0.  ,  0.58,  0.58],
        [ 0.  , -0.35, -0.35,  0.16,  0.25, -0.8 ,  0.16, -0.  , -0.  ],
        [-0.  , -0.78, -0.01, -0.2 ,  0.4 ,  0.4 , -0.2 ,  0.  ,  0.  ],
        [-0.29,  0.31, -0.78, -0.24,  0.23,  0.23,  0.01,  0.14,  0.14],
        [-0.29, -0.1 ,  0.26, -0.59, -0.08, -0.08,  0.66,  0.14,  0.14],
        [-0.5 , -0.06,  0.15,  0.24, -0.05, -0.05, -0.19,  0.75, -0.25],
        [-0.5 , -0.06,  0.15,  0.24, -0.05, -0.05, -0.19, -0.25,  0.75]]))

In [22]:
# orhogonal matrix
sum(U[0] * U[0]), sum(U[1] * U[1]), sum(U[2] * U[2]), sum(U[3] * U[3])

(1.0, 1.0, 1.0000000000000002, 1.0)

In [23]:
# diagonal matrix
S = np.zeros(A.shape)
S[:4, :4] = np.diag(s)
S.round(2)

array([[2.69, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 2.05, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.73, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.77, 0.  , 0.  , 0.  , 0.  , 0.  ]])

In [26]:
_A = np.dot(np.dot(U, S), V)
_A

array([[ 5.48519247e-17,  5.62821277e-17,  5.62821277e-17,
         1.00000000e+00, -2.41172566e-17,  1.00000000e+00,
         1.00000000e+00,  5.48519247e-17,  5.48519247e-17],
       [ 2.53218958e-16, -4.34272465e-17, -4.34272465e-17,
         1.00000000e+00,  1.00000000e+00, -1.70971093e-16,
         1.00000000e+00, -8.22778870e-17, -8.22778870e-17],
       [ 1.07780421e-16,  1.00000000e+00,  1.00000000e+00,
        -1.57145142e-16,  2.00000000e+00, -1.68949211e-16,
        -1.57145142e-16,  6.85649058e-17,  6.85649058e-17],
       [ 1.00000000e+00, -3.37739028e-17,  2.08965714e-16,
         3.73418078e-16, -2.43289974e-17,  2.43959150e-16,
        -1.11744589e-17,  1.00000000e+00,  1.00000000e+00]])

In [27]:
np.allclose(A, _A)

True

### Truncated SVD

In [31]:
trunc_U = U[:, :2]
trunc_U.round(2)

array([[-0.24,  0.75],
       [-0.51,  0.44],
       [-0.83, -0.49],
       [-0.  , -0.  ]])

In [30]:
trunc_S = S[:2, :2]
trunc_S.round(2)

array([[2.69, 0.  ],
       [0.  , 2.05]])

In [33]:
trunc_V = V[:2, :]
trunc_V.round(2)

array([[-0.  , -0.31, -0.31, -0.28, -0.8 , -0.09, -0.28, -0.  , -0.  ],
       [ 0.  , -0.24, -0.24,  0.58, -0.26,  0.37,  0.58, -0.  , -0.  ]])

In [34]:
A_hat = np.dot(np.dot(trunc_U, trunc_S), trunc_V)
A_hat.round(2)

array([[ 0.  , -0.17, -0.17,  1.08,  0.12,  0.62,  1.08, -0.  , -0.  ],
       [ 0.  ,  0.2 ,  0.2 ,  0.91,  0.86,  0.45,  0.91,  0.  ,  0.  ],
       [ 0.  ,  0.93,  0.93,  0.03,  2.05, -0.17,  0.03,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  , -0.  ,  0.  ,  0.  ,  0.  ]])

In [35]:
np.allclose(A, A_hat)

False

## Exercise

### Load dataset

In [38]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=("headers", "footers", "quotes"))
documents = dataset.data
len(documents)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


11314

In [39]:
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [40]:
# category for article
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Text preprocessing

In [41]:
df = pd.DataFrame({"document":documents})
# 알파벳만 남기고 제거
df["clean_document"] = df["document"].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어 제거 > 왱? 짧은 단어는 유용한 정보를 담고있지 않다고 가정한다는군
df["clean_document"] = df["clean_document"].apply(lambda x: " ".join([w for w in x.split() if len(w) > 3]))
# 소문자로 변환
df["clean_document"] = df["clean_document"].apply(lambda x: x.lower())
df.head()

Unnamed: 0,document,clean_document
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...


In [43]:
df["clean_document"][0]

'well sure about story seem biased what disagree with your statement that media ruin israels reputation that rediculous media most israeli media world having lived europe realize that incidences such described letter have occured media whole seem ignore them subsidizing israels existance europeans least same degree think that might reason they report more clearly atrocities what shame that austria daily reports inhuman acts commited israeli soldiers blessing received from government makes some holocaust guilt away after look jews treating other races when they power unfortunate'

In [46]:
english_stopwords = stopwords.words('english')
len(english_stopwords)

179

In [47]:
df["tokenize_document"] = df["clean_document"].apply(lambda x: [w for w in x.split() if x not in english_stopwords])
df["tokenize_document"][0]

['well',
 'sure',
 'about',
 'story',
 'seem',
 'biased',
 'what',
 'disagree',
 'with',
 'your',
 'statement',
 'that',
 'media',
 'ruin',
 'israels',
 'reputation',
 'that',
 'rediculous',
 'media',
 'most',
 'israeli',
 'media',
 'world',
 'having',
 'lived',
 'europe',
 'realize',
 'that',
 'incidences',
 'such',
 'described',
 'letter',
 'have',
 'occured',
 'media',
 'whole',
 'seem',
 'ignore',
 'them',
 'subsidizing',
 'israels',
 'existance',
 'europeans',
 'least',
 'same',
 'degree',
 'think',
 'that',
 'might',
 'reason',
 'they',
 'report',
 'more',
 'clearly',
 'atrocities',
 'what',
 'shame',
 'that',
 'austria',
 'daily',
 'reports',
 'inhuman',
 'acts',
 'commited',
 'israeli',
 'soldiers',
 'blessing',
 'received',
 'from',
 'government',
 'makes',
 'some',
 'holocaust',
 'guilt',
 'away',
 'after',
 'look',
 'jews',
 'treating',
 'other',
 'races',
 'when',
 'they',
 'power',
 'unfortunate']

## Create TF-IDF

In [50]:
detokenized_doc = df["tokenize_document"].apply(lambda x: " ".join(x))
len(detokenized_doc)

11314

In [51]:
detokenized_doc[0]

'well sure about story seem biased what disagree with your statement that media ruin israels reputation that rediculous media most israeli media world having lived europe realize that incidences such described letter have occured media whole seem ignore them subsidizing israels existance europeans least same degree think that might reason they report more clearly atrocities what shame that austria daily reports inhuman acts commited israeli soldiers blessing received from government makes some holocaust guilt away after look jews treating other races when they power unfortunate'

In [53]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=1000,
    max_df=0.5,
    smooth_idf=True
)
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [55]:
X = vectorizer.fit_transform(detokenized_doc)
X.shape

(11314, 1000)

## Topic modeling

In [60]:
trunc_svd = TruncatedSVD(
    n_components=20,
    algorithm="randomized",
    n_iter=100,
    random_state=0
)
trunc_svd

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
       random_state=0, tol=0.0)

In [61]:
trunc_svd.fit(X)
len(trunc_svd.components_)

20

In [62]:
help(trunc_svd)

Help on TruncatedSVD in module sklearn.decomposition.truncated_svd object:

class TruncatedSVD(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)
 |  
 |  Dimensionality reduction using truncated SVD (aka LSA).
 |  
 |  This transformer performs linear dimensionality reduction by means of
 |  truncated singular value decomposition (SVD). Contrary to PCA, this
 |  estimator does not center the data before computing the singular value
 |  decomposition. This means it can work with scipy.sparse matrices
 |  efficiently.
 |  
 |  In particular, truncated SVD works on term count/tf-idf matrices as
 |  returned by the vectorizers in sklearn.feature_extraction.text. In that
 |  context, it is known as latent semantic analysis (LSA).
 |  
 |  This estimator supports two algorithms: a fast randomized SVD solver, and
 |  a "naive" algorithm that uses ARPACK as an eigensolver on (X * X.T) or
 | 

In [67]:
# s
trunc_svd.singular_values_.shape, trunc_svd.singular_values_.round(2)

((20,), array([17.16,  9.94,  8.17,  7.92,  7.63,  7.53,  7.25,  7.01,  6.88,
         6.86,  6.68,  6.56,  6.53,  6.42,  6.34,  6.22,  6.17,  6.09,
         6.  ,  5.91]))

In [64]:
# V
trunc_svd.components_.shape

(20, 1000)

In [73]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print(f"Topic {idx+1}: {[(feature_names[i], topic[i].round(5)) for i in np.argsort(topic)[::-1][:n]]}")
        
get_topics(trunc_svd.components_, terms)

Topic 1: [('just', 0.20887), ('like', 0.20469), ('know', 0.19349), ('people', 0.18318), ('think', 0.1697)]
Topic 2: [('thanks', 0.32763), ('windows', 0.28786), ('card', 0.18019), ('drive', 0.16864), ('mail', 0.15261)]
Topic 3: [('game', 0.34011), ('team', 0.30311), ('year', 0.26894), ('games', 0.23784), ('drive', 0.17472)]
Topic 4: [('drive', 0.46159), ('scsi', 0.17188), ('disk', 0.14451), ('hard', 0.13805), ('problem', 0.12763)]
Topic 5: [('drive', 0.39993), ('know', 0.28768), ('thanks', 0.24917), ('does', 0.24678), ('just', 0.17387)]
Topic 6: [('just', 0.55559), ('like', 0.23559), ('windows', 0.23078), ('know', 0.15795), ('does', 0.11156)]
Topic 7: [('just', 0.43264), ('like', 0.22858), ('mail', 0.15052), ('bike', 0.11698), ('thanks', 0.10025)]
Topic 8: [('does', 0.39692), ('know', 0.25192), ('chip', 0.22492), ('like', 0.17824), ('card', 0.15695)]
Topic 9: [('like', 0.42065), ('card', 0.32249), ('sale', 0.20267), ('video', 0.1571), ('offer', 0.14119)]
Topic 10: [('like', 0.61166), ('