In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('transcript_data.csv')
df = df.dropna()

In [3]:
df.shape

(4298, 2)

In [4]:
cv = CountVectorizer(max_df=0.80, min_df=0.30, stop_words='english')

In [5]:
dtm = cv.fit_transform(df.transcript)

In [6]:
dtm

<4298x145 sparse matrix of type '<class 'numpy.int64'>'
	with 284473 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
LDA = LatentDirichletAllocation(n_components=7)

In [9]:
LDA = LDA.fit(dtm)

In [10]:
# Vocabulary of words
cv.get_feature_names_out()[1:20]

array(['10', '20', 'able', 'actually', 'ago', 'applause', 'ask', 'asked',
       'away', 'believe', 'best', 'better', 'big', 'bit', 'called',
       'came', 'case', 'change', 'come'], dtype=object)

In [11]:
len(cv.get_feature_names_out())

145

In [12]:
# Topics

In [13]:
len(LDA.components_)

7

In [14]:
LDA.components_.shape

(7, 145)

In [15]:
LDA.components_

array([[ 284.63722648,  158.89546207,   89.69778237, ..., 5959.90483694,
         265.09895965, 1231.78870986],
       [ 125.22670954,  291.13411878,  224.93164716, ..., 1475.93890884,
        1577.11240628, 2661.9369707 ],
       [ 197.25548275,  111.80578177,  128.61674436, ..., 2049.96050901,
         365.84994005, 1244.28210786],
       ...,
       [ 525.55496929,  558.21465738,  253.02687785, ...,  129.4562041 ,
         454.74171233,  676.87142921],
       [ 132.32356872,  328.75216894,  149.13258775, ...,  833.58797788,
         307.5787342 ,  464.05704878],
       [2543.86497179, 1620.10844391, 1067.09246597, ..., 3313.79819898,
        2793.635857  , 4892.52566527]])

In [16]:
single_topic = LDA.components_[0]

In [17]:
single_topic.argsort()

array([ 52, 105,  88, 102,  62,  86,  42,  70,  25,   8,   7,   2, 127,
        87, 100, 103, 139,  47,  39, 115,  61,  48,  75,  44,  50,  89,
        58,   1,  17,  51,  46,  82,  68, 125,  93,  14, 104,  27, 117,
        36,  94,  30,  63,  66,  77,  65,  24,  45, 106,  40,  49,  11,
       143,  16,  95,  83, 107,  80,  20,  21,   0, 128, 116,   9,  41,
        97,  71,  96, 109,  26, 124,  12,  33,  29, 118, 129, 120,  55,
        10,  32,  81,  73,  69, 130,  92, 137,  37,  79,  57,   6,  67,
        35,  60,  31, 141,  13,  64,   5, 123,  98,  19, 108,  78,  91,
       136, 110, 119,  38, 131, 101,  22, 111,  56, 112, 113, 135,  74,
        53,  15,  90, 138,   3, 126, 140,  43,  34,  72, 121, 134, 114,
        59, 144,  84,   4, 133, 132,  54,  23,  76,  18,  99, 122,  28,
        85, 142])

In [18]:
single_topic.argsort()[-10:] # Top 10 values

array([132,  54,  23,  76,  18,  99, 122,  28,  85, 142])

In [19]:
top_10_words = single_topic.argsort()[-10:]
for index in top_10_words:
    print(cv.get_feature_names_out()[index])

use
idea
create
make
change
really
think
different
new
world


In [20]:
for i, topic in enumerate(LDA.components_):
    print(f"Top 20 words for topic #{i}")
    print([cv.get_feature_names_out()[index] for index in topic.argsort()[-20:]])
    print('\n')

Top 20 words for topic #0
['example', 'look', 'things', 'using', 'story', 'kind', 'years', 'need', 'actually', 'used', 'use', 'idea', 'create', 'make', 'change', 'really', 'think', 'different', 'new', 'world']


Top 20 words for topic #1
['thought', 'want', 'come', 'story', 'school', 'started', 'came', 'don', 'got', 'went', 'years', 'love', 'applause', 'didn', 'day', 'did', 'life', 'know', 'laughter', 'said']


Top 20 words for topic #2
['change', 'day', 've', 'use', 'live', 'better', 'lives', 'years', 'home', 'future', 'today', 'help', 'need', 'make', 'world', 'life', 'power', 'new', 'human', 'work']


Top 20 words for topic #3
['got', 'kind', 'mean', 'look', 'good', 'actually', 'right', 'talk', 'lot', 'said', 'thing', 'want', 'say', 've', 'things', 'don', 'really', 'going', 'know', 'think']


Top 20 words for topic #4
['work', 'look', 'left', 'problem', 'does', 'second', 'know', 'able', 'school', 'example', 'll', 'need', 'different', 'make', 'right', 'called', 'use', 'let', 'high', '

In [21]:
topic_results = LDA.transform(dtm)

In [22]:
topic_results.round(2)

array([[0.29, 0.  , 0.22, ..., 0.36, 0.  , 0.12],
       [0.  , 0.  , 0.  , ..., 0.45, 0.  , 0.54],
       [0.28, 0.  , 0.  , ..., 0.71, 0.  , 0.  ],
       ...,
       [0.  , 0.13, 0.42, ..., 0.  , 0.23, 0.22],
       [0.  , 0.24, 0.06, ..., 0.  , 0.61, 0.09],
       [0.07, 0.47, 0.  , ..., 0.  , 0.46, 0.  ]])

In [23]:
df['topic'] = pd.DataFrame(topic_results.argmax(axis=1))

In [24]:
df.dropna()

Unnamed: 0,title,transcript,topic
0,Can you outsmart the apples and oranges fallacy?,Baking apple pie? Discount orange warehouse ha...,4.0
1,The exploitation of US college athletes,"In college sports, American universities are e...",6.0
2,How does ultrasound work?,"In a pitch-black cave, bats can’t see much. Bu...",4.0
3,"An honest history of an ancient and ""nasty"" word","First, a warning. As far as offensive words go...",5.0
4,The electrical blueprints that orchestrate life,"Chris Anderson: Mike, welcome. It's good to se...",5.0
...,...,...,...
4292,The anthropology of mobile phones,"I live and work from Tokyo, Japan. And I speci...",1.0
4293,The illustrated woman,What I am always thinking about is what this s...,6.0
4295,"Life at 30,000 feet",Chris Anderson: Welcome to TED.Richard Branson...,2.0
4296,My magic moves,(Applause)(Music)(Applause),5.0


In [25]:
len(topic_results.argmax(axis=1))

4298