In [1]:
import pandas as pd
from gensim.models.wrappers import DtmModel
from gensim.corpora import Dictionary
import pickle  # Import pickle for serialization

# Load my preprocessed dataset
print("Loading data...")
df = pd.read_csv('/mnt/c/Users/arnea/OneDrive/Desktop/Thesis/Work/Python/df_preprocessedv7.csv')
df['Date'] = pd.to_datetime(df['Date'])

# DTM maps documents to time slices purely by position: the first
# time_slices[0] documents of the corpus form slice 0, the next
# time_slices[1] form slice 1, and so on. The frame therefore has to be
#   * restricted to rows that actually yield a document (text present) and
#     can be placed in a slice (date present), and
#   * ordered chronologically,
# BEFORE both the documents and the per-month counts are derived from it.
# A stable sort keeps the original relative order of same-dated articles.
df = (
    df.dropna(subset=['Processed_Article', 'Date'])
    .sort_values('Date', kind='mergesort')
    .reset_index(drop=True)
)

# Preparing documents
print("Preparing documents...")
documents = [doc.split() for doc in df['Processed_Article']]

# Prepare timestamps for monthly time slices
print("Preparing time slices...")
df['YearMonth'] = df['Date'].dt.to_period('M')
# df is sorted by date, so order-of-appearance here is chronological and
# timestamps[i] labels time_slices[i].
timestamps = df['YearMonth'].dt.strftime('%Y-%m').unique().tolist()
# Count documents per month once, ordered by period.
time_slices = df['YearMonth'].value_counts().sort_index().tolist()

# Fail early rather than let the DTM binary train on misaligned slices.
if sum(time_slices) != len(documents):
    raise ValueError(
        f"time_slices cover {sum(time_slices)} documents "
        f"but the corpus has {len(documents)}"
    )

# Create a dictionary representation of the documents
print("Creating dictionary...")
dictionary = Dictionary(documents)

# Create a corpus from the dictionary and documents
print("Creating corpus...")
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Path of the DTM binary handed to the gensim wrapper
dtm_path = '/home/arne/dtm/dtm/main'

# Fit the dynamic topic model on the bag-of-words corpus
print("Training DTM model...")
num_topics = 17
model = DtmModel(
    dtm_path,
    corpus=corpus,
    time_slices=time_slices,
    num_topics=num_topics,
    id2word=dictionary,
)

# Persist the fitted model with pickle
print("Saving model...")
with open('dtm_model5.pkl', 'wb') as file:
    pickle.dump(model, file)

# Read the pickled model back; everything below works on this copy
print("Loading model from disk...")
with open('dtm_model5.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Print one line per topic as returned by print_topics()
print("Displaying topics...")
topics = loaded_model.print_topics()
for topic in topics:
    print(topic)

# Loop over each time slice to gather topic information
print("Gathering topic information over time...")
all_topics = []
num_slices = len(time_slices)
for time_slice in range(num_slices):
    print(f"Processing time slice {time_slice + 1}/{num_slices}...")
    for topic_num in range(num_topics):
        # NOTE: the DtmModel wrapper's show_topic() yields
        # (probability, word) pairs -- probability FIRST, unlike
        # LdaModel.show_topic(). Unpacking as (word, weight) would put the
        # probabilities in "Words" and the tokens in "Weights".
        top_words = loaded_model.show_topic(topicid=topic_num, time=time_slice, topn=20)
        all_topics.append({
            "TimeSlice": time_slice,
            "TopicNum": topic_num,
            "Words": [word for _, word in top_words],
            "Weights": [weight for weight, _ in top_words],
        })

# Convert the list of dictionaries into a DataFrame
# (one row per time slice / topic pair)
topics_df = pd.DataFrame(all_topics)

# Save the DataFrame to a CSV for further analysis
print("Saving topics dataframe...")
topics_df.to_csv("dtm_topics_over_timev5.csv", index=False)
print("Process completed successfully.")

Loading data...
Preparing documents...
Preparing time slices...
Creating dictionary...


  df['YearMonth'] = df['Date'].dt.to_period('M')


Creating corpus...
Training DTM model...
Saving model...
Loading model from disk...
Displaying topics...
0.042*cil + 0.039*govern + 0.033*stake + 0.030*share + 0.030*compani + 0.021*sale + 0.020*union + 0.019*percent + 0.014*disinvest + 0.013*sell
0.058*minist + 0.023*govern + 0.017*jaiswal + 0.017*bjp + 0.017*demand + 0.015*parti + 0.014*prime + 0.014*issu + 0.012*congress + 0.010*leader
0.039*cbi + 0.035*report + 0.031*court + 0.017*investig + 0.015*govern + 0.015*alloc + 0.014*minist + 0.012*law + 0.012*probe + 0.011*block
0.012*ga + 0.012*countri + 0.011*sector + 0.010*need + 0.009*product + 0.009*privat + 0.009*plan + 0.009*polici + 0.008*govern + 0.008*invest
0.011*strike + 0.011*worker + 0.009*union + 0.009*bccl + 0.007*famili + 0.007*peopl + 0.007*work + 0.006*offic + 0.006*mine + 0.006*gener
0.053*port + 0.050*railway + 0.028*transport + 0.025*handl + 0.015*termin + 0.012*ore + 0.012*import + 0.011*berth + 0.011*cargo + 0.010*oper
0.057*power + 0.035*price + 0.029*import + 0.0