# Topic Modelling

## Import

In [28]:
import csv
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from pprint import pprint
from gensim.parsing.preprocessing import preprocess_documents

## Estrazione dati

In [22]:
def get_documents(path):
    file = open(path, encoding="utf8")
    train_set = csv.reader(file)

    header = next(train_set)
    documents = []
    tags = []
    for doc in train_set:
        documents.append(doc[1])
        tags.append(doc[2:])
    file.close()

    return header, documents, tags

PATH = 'dataset/Train.csv'
header, documents, tags = get_documents(PATH)
len(documents)

14004

### Preprocessing dei dati

In [24]:
preprocessed_documents = preprocess_documents(documents)

In [25]:
# Create Dictionary
id2word = Dictionary(preprocessed_documents)

# Term Document Frequency
corpus = [id2word.doc2bow(doc) for doc in preprocessed_documents]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 5), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 3), (22, 1), (23, 3), (24, 1), (25, 5), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 3), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 4), (48, 1), (49, 1), (50, 3), (51, 1), (52, 1), (53, 1), (54, 11), (55, 1), (56, 6), (57, 1), (58, 1), (59, 1), (60, 2), (61, 2), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 4), (70, 1), (71, 1), (72, 2), (73, 3), (74, 1), (75, 1), (76, 1), (77, 3), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 3), (85, 1), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 2), (93, 1), (94, 1), (95, 1), (96, 2), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 3), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 6), (110, 5

In [26]:
lda_model = LdaModel(corpus=corpus,
                    id2word=id2word,
                    num_topics=len(header[1:]), 
                    random_state=100,
                    update_every=1,
                    chunksize=100,
                    passes=10,
                    alpha='auto',
                    per_word_topics=True)

In [27]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(5,
  '0.324*"group" + 0.217*"distanc" + 0.087*"formula" + 0.084*"constrain" + '
  '0.036*"tau" + 0.029*"round" + 0.029*"hilbert" + 0.004*"gross" + '
  '0.003*"certain" + 0.000*"atv"'),
 (12,
  '0.311*"graph" + 0.137*"node" + 0.087*"maxim" + 0.082*"updat" + '
  '0.039*"vertic" + 0.034*"graphic" + 0.031*"structur" + 0.030*"market" + '
  '0.027*"deploi" + 0.025*"proxim"'),
 (21,
  '0.142*"devic" + 0.125*"materi" + 0.120*"boundari" + 0.085*"polar" + '
  '0.066*"grid" + 0.058*"concentr" + 0.034*"cooper" + 0.032*"tunnel" + '
  '0.024*"contract" + 0.021*"defect"'),
 (4,
  '0.143*"tool" + 0.141*"popul" + 0.100*"stage" + 0.076*"causal" + '
  '0.065*"mobil" + 0.042*"littl" + 0.030*"custom" + 0.029*"ask" + 0.027*"sky" '
  '+ 0.026*"mitig"'),
 (20,
  '0.212*"commun" + 0.186*"question" + 0.102*"pair" + 0.081*"relationship" + '
  '0.080*"answer" + 0.071*"examin" + 0.032*"distinguish" + 0.031*"author" + '
  '0.027*"highest" + 0.013*"median"'),
 (18,
  '0.108*"relev" + 0.092*"perturb" + 0.090*"matte