In [None]:
# Move the working directory one level up; the relative paths used below
# ('./Raw_data/', './Processed_data/', './Results/') are resolved against it.
# NOTE(review): this is not idempotent - running the cell again moves up one more level.
cd ..

# Preprocessing

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from octis.preprocessing.preprocessing import Preprocessing
from nltk.corpus import stopwords
from data_preparation import for_dlda, for_detm

# Vocabulary pruning threshold: ignore terms that appear in less than 0.1% of the documents.
min_df = 0.001

raw_data_parent_dir = './Raw_data/'  # loading path
save_data_parent_dir = './Processed_data/'  # saving path

ministry = 'Finance'
text_columns = ['question_text', 'answer_text', 'subject']

# Loading the csv file (only the columns that are used below).
data = pd.read_csv(raw_data_parent_dir+'{}/{}.csv'.format(ministry,ministry),
                   usecols=['date'] + text_columns)

# Combining the required columns into one document per row.
# - The fields are joined with a space; plain concatenation glued the last word
#   of one field to the first word of the next.
# - Missing fields become empty strings; astype(str) alone would inject the
#   literal token 'nan' into the document.
data['QnA_sub'] = (data[text_columns]
                   .fillna('')
                   .astype(str)
                   .agg(' '.join, axis=1)
                   .str.strip())

# `columns=` instead of a positional axis: the positional form is rejected by pandas >= 2.0.
data = data.drop(columns=text_columns)

#sorting the dataframe wrt date
def sort_by_time(data):
    '''
    #sort the rows chronologically and replace `date` by `year`
    --------------------
    data: Frame with a `date` column that pd.to_datetime can parse.
          (type: pandas.DataFrame)
    --------------------
    Returns a new DataFrame sorted by date, re-indexed 0..n-1, in which the
    `date` column is dropped and a `year` column is appended.
    The frame passed in is not modified.
    '''
    # assign() builds a new frame, so neither the parsed dates nor the sort
    # leak back into the caller's DataFrame.
    ordered = (data.assign(date=pd.to_datetime(data['date']))
                   .sort_values(by=['date'])
                   .reset_index(drop=True))
    # Vectorised year extraction instead of a Python-level apply over every row.
    ordered['year'] = ordered['date'].dt.year
    return ordered.drop(columns=['date'])

data = sort_by_time(data)

# Number of documents in every year, in chronological order (groupby sorts the
# year keys). size() yields exactly one entry per year; de-duplicating the
# counts with unique() would drop a year whenever two years happen to contain
# the same number of documents.
time_slice = data.groupby('year').size().tolist()

# Year offset of every document relative to the first (earliest) row: 0 for the
# first year, 1 for the next one, ...
timestamps = list(data['year'] - data['year'].iloc[0])

#Replacing the term demonetization with demonetisation
# (case-insensitive match; the replacement text is always lower-case)
insensitive_demonetiz = re.compile(re.escape('demonetiz'), re.IGNORECASE)

# Both files below are line-oriented: line i of raw_data.txt must belong to
# line i of initial_timestamps.txt (they are handed to the preprocessor as
# documents and labels). A line break inside a document would split it over
# several lines and shift every following document against its timestamp, so
# embedded line breaks are flattened to a single space.
line_breaks = re.compile(r'[\r\n]+')

#saving the raw data (one document per line)
with open(raw_data_parent_dir+"{}/raw_data.txt".format(ministry), 'w') as output:
    for row in data.QnA_sub:
        single_line = line_breaks.sub(' ', str(row))
        output.write(insensitive_demonetiz.sub('demonetis', single_line) + '\n')

#saving the timestamps (one per line, same order as the documents)
with open(raw_data_parent_dir+"{}/initial_timestamps.txt".format(ministry), 'w') as output:
    for row in timestamps:
        output.write(str(row) + '\n')

#removing stopwords
# NLTK's English list extended with corpus-specific terms. Compared with the
# earlier list: the misspelt 'unikely' is corrected to 'unlikely' (the typo
# never matched anything, so "unlikely" slipped through) and the second,
# redundant 'date' entry is removed.
stop_words = stopwords.words('english')
stop_words.extend(['shri', 'sir', 'hitherto', 'narain', 'namo', 'namonarain', 'meena', 'date', 'dated',
                   'yashwant', 'sinha', 'various', 'likely', 'unlikely', 'say', 'government', 'minister',
                   'ministers', 'ministry', 'about', 'total', 'such', 'bring', 'regard', 'patil',
                   'adhalrao', 'thereof', 'etc', 'made', 'also', 'per', 'however', 'india', 'indian',
                   'indias','taken', 'aforesaid', 'along', 'manner', 'upto', 'would', 'getting',
                   'regarding', 'said', 'if', 'receive', 'whatever', 'gingee', 'ramachandran', 'sh',
                   'palanimanickam', 'chidambaram', 'jaswant', 'new', 'old', 'raise', 'singh', 'rakesh',
                   'mohan', 'datum', 'refer', 'do', 'done', 'shall', 'i', 'ii', 'iii', 'iv', 'v', 'vi',
                   'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'yes', 'no', 'not',
                   'non', 'www', 'whether', 'detail', 'submit', 'finance', 'take'])


# Preprocessing with OCTIS.
# min_df (set above) prunes rare terms and the extended stop-word list is
# passed in explicitly; the remaining thresholds are fixed here.
preprocessor = Preprocessing(
    vocabulary=None,
    stopword_list=stop_words,
    remove_punctuation=True,
    min_df=min_df,
    max_df=.95,
    min_chars=3,
    min_words_docs=3,
    verbose=True,
)

# The timestamp file is supplied as the label file so that every document
# travels through preprocessing together with its timestamp.
processed_data = preprocessor.preprocess_dataset(
    documents_path=raw_data_parent_dir + "{}/raw_data.txt".format(ministry),
    labels_path=raw_data_parent_dir + "{}/initial_timestamps.txt".format(ministry),
)

# Persist the preprocessed dataset; the DLDA preparation step reads it from here.
processed_data.save(save_data_parent_dir + "{}/octis_data/".format(ministry))

# Build the DLDA inputs from the OCTIS output.
print('Creating data for dlda to analyze the full dataset:')
for_dlda(
    data_load_dir=save_data_parent_dir + "{}/octis_data/".format(ministry),
    save_dir=save_data_parent_dir + "{}/DLDA/full_data/".format(ministry),
    unprep_path=raw_data_parent_dir + "{}/raw_data.txt".format(ministry),
    min_count=5,
    threshold=20,
    # add_val=True together with add_test=True analyzes the full dataset
    add_val=True,
    add_test=True,
    seed=2021,
    verbose=True,
)
print('Done...!!!')

# Run LDA

In [None]:
import pickle
from gensim.models import LdaModel

def unpickling(path):
    '''
    #load a pickled object from disk
    --------------------
    path: Location of the pickle file. (type: str)
    --------------------
    Returns whatever object was stored in the file.
    '''
    with open(path, mode='rb') as handle:
        return pickle.load(handle)

ministry = 'Finance'
save_data_parent_dir = './Processed_data/'
# Folder holding the DLDA inputs for this ministry.
data_dir = save_data_parent_dir + "{}/DLDA/full_data/".format(ministry)

# Load the two pickles that are handed to LdaModel as `id2word` and `corpus`.
id2word = unpickling(data_dir + 'id2word.pkl')
bow = unpickling(data_dir + 'train_bow.pkl')

# Static LDA with 20 topics; the fixed random_state keeps the run repeatable.
lda = LdaModel(
    corpus=bow,
    id2word=id2word,
    num_topics=20,
    passes=20,
    random_state=2021,
)

# Saving the learned LDA model (uncomment to persist it)
# lda.save('./Results/lda_{}.model'.format(ministry))

# Run LDAseq

In [None]:
from gensim.models.ldaseqmodel import LdaSeqModel

# Load the time-slice information stored alongside the training corpus.
time_slice = unpickling(data_dir + 'tslice_tr.pkl')

# Dynamic topic model over the same corpus. It is initialised from the static
# LDA model trained above and therefore uses the same number of topics.
lda_seq_fulldata = LdaSeqModel(
    corpus=bow,
    id2word=id2word,
    time_slice=time_slice,
    num_topics=lda.num_topics,
    lda_model=lda,
    initialize='ldamodel',
    random_state=2021,
    passes=20,
    chunksize=100,
)

# Saving the ldaseq model (uncomment to persist it)
# lda_seq_fulldata.save('./Results/ldaseq_{}_ntopics{}.model'.format(ministry,
#                                                                    lda.num_topics))

# Analyze the topics

In [None]:
#loading the model
# Reads a previously saved LdaSeq model from ./Results/ so the analysis below
# can run without re-training.
# NOTE(review): the save call in the training cell is commented out, so this
# file has to exist already - confirm before a Restart & Run All.
from gensim.models.ldaseqmodel import LdaSeqModel
lda_seq_fulldata = LdaSeqModel.load('./Results/ldaseq_Finance_ntopics20.model')

In [None]:
import pandas as pd
import plotly.io as pio
pio.renderers.default='notebook'


def plot_topic_words(model, topic = 0, top_n_words = 10, allow_save = True, path = '', start_year = 1999):
    '''
    #plot words wrt time
    --------------------
    model: LdaSeqModel. (type: gensim.models.ldaseqmodel.LdaSeqModel)
    topic: Which topic to plot. (type: int)
    top_n_words: No. of top words taken from every time slice. (type: int)
    allow_save: True, if the HTML plot needs to be saved. (type: bool)
    path: Prefix of the output file, NOT a complete file name: the plot is
          written to path + 'topic_<topic>.html' (only if allow_save=True).
          (type: str)
    start_year: Year shown for the first time slice; slice i is labelled
                start_year + i. (type: int)
    --------------------
    Returns a DataFrame with one row per time slice (indexed by year) and one
    column per word; a cell is NaN when the word is not among the top words of
    that slice.
    Side effect: sets the pandas plotting backend option to "plotly".
    '''
    path = path+'topic_'+str(topic)+'.html'
    lst = model.print_topic_times(topic = topic, top_terms = top_n_words)

    # Unique words in order of first appearance. A set would make the column
    # (and legend) order vary between runs and is not a valid `columns`
    # argument for the DataFrame constructor in recent pandas versions.
    words = list(dict.fromkeys(word for step in lst for word, _ in step))

    # One record per time slice; words missing from a slice become NaN and the
    # columns are numeric instead of object-typed.
    data = pd.DataFrame([dict(step) for step in lst], columns=words, index=range(len(lst)))
    data.index = data.index + start_year

    pd.options.plotting.backend = "plotly"
    fig = data.plot(title="Distribution of words for topic-" + str(topic) +" wrt time",
                    y=words,
                    labels=dict(index="Time", value="Probability", variable="Words"))
    # One tick per time slice on the x axis.
    fig.update_xaxes(nticks=len(lst))
    if allow_save:
        fig.write_html(path)
    fig.show()
    return data

In [None]:
# `path` is a file-name prefix: plot_topic_words appends 'topic_<topic>.html'
# to it. Passing a name that already ends in '.html' produced
# './Results/Finance_ldaseq.htmltopic_8.html'; with this prefix the plot is
# written to './Results/Finance_ldaseq_topic_8.html'.
dataframe = plot_topic_words(lda_seq_fulldata,
                             topic = 8,
                             top_n_words = 10,
                             allow_save = True,
                             path='./Results/Finance_ldaseq_')