In [None]:
import os
import numpy as np
import pandas as pd
import csv
import time

# Mount Google Drive into the Colab runtime; the chdir calls and paths in this
# notebook all point below /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Change into the Code directory before importing cleaning_tweets —
# presumably so the project-local module resolves from the working directory
# (TODO confirm it is not installed elsewhere on sys.path).
os.chdir("/content/drive/MyDrive/Code")
import cleaning_tweets as ct
from cleaning_tweets import getidx, preprocess, dealing_topics, generate_aspects, clean_aspects

# Gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# NLTK
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import * 

# From here on, relative paths such as 'textid.csv' and './topic/...' resolve
# against the Data directory.
os.chdir("/content/drive/MyDrive/Data")
# Directory prefix for the saved LDA model, dictionary and corpus. It is joined
# to file names by plain string concatenation, so the trailing slash is required.
model_path = "/content/drive/MyDrive/Code/Model/"

# Training Process

In [None]:
# Load the full corpus. The file has no header row; every field is read as a string.
df = pd.read_csv('textid.csv', engine='python', encoding='utf-8-sig', dtype=str, header=None)
df.columns = ['text', 'text_id']

# Sampling stride: keep rows 0, 10, 20, ... as the LDA training sample.
SAMPLE_STEP = 10

# Positions of the sampled rows (name kept because it may be referenced elsewhere).
result = list(range(0, df.shape[0], SAMPLE_STEP))
print(f'The length of samples: {len(result)}')

# Take the whole sample with one strided slice. The previous per-row loop
# rebound train_data on every iteration, so after the loop it only held the
# last sampled row, and it paid for one single-row slice + copy per sample.
train_data = df.iloc[::SAMPLE_STEP].copy()

# One-off export of the sample (left disabled). The frame is written in one call,
# so mode='w' is used: re-running replaces the file instead of appending duplicates.
# train_data.to_csv('./topic/trainset4lda_590000.csv', mode='w', header=None, index=False, encoding='utf-8-sig')

In [None]:
# Read the sampled training set that the LDA model is fitted on.
train_data = pd.read_csv(
    './topic/trainset4lda_590000.csv',
    engine='python',
    encoding='utf-8-sig',
    dtype=str,
    header=None,
)
train_data.columns = ['text', 'text_id']
# Keep only the first occurrence of each text_id and renumber the rows.
train_data = train_data.drop_duplicates(subset=['text_id'], keep='first', ignore_index=True)
print(f'Train dataset has {train_data.shape[0]} rows.')

# Run the project's preprocess helper over every text
# (presumably yields a token list per document, since the result feeds
# Dictionary / doc2bow below — TODO confirm against cleaning_tweets).
train_data['preprocess_text'] = train_data['text'].apply(preprocess)

# Vocabulary built from the preprocessed documents.
dictionary = gensim.corpora.Dictionary(train_data['preprocess_text'])

# One bag-of-words vector per document.
bow_corpus = train_data['preprocess_text'].apply(dictionary.doc2bow)

# Fit the topic model; random_state is fixed so the fit is repeatable.
lda_model = gensim.models.LdaModel(
    bow_corpus,
    num_topics=30,
    id2word=dictionary,
    passes=20,
    random_state=400,
)

# # Persist the model, dictionary and corpus (disabled; the loading cell
# # expects these three files to already exist under model_path)
# lda_model.save(fname= model_path + "LDA_Model")
# dictionary.save(model_path + "dictionary.dict") 
# corpora.MmCorpus.serialize(model_path + "bow_corpus.mm", bow_corpus)

Train dataset has 590000 rows.


# Apply Model to Test Set

In [2]:
# Restore the previously saved artefacts from model_path:
# the fitted LDA model, its dictionary, and the serialized bag-of-words corpus.
lda_model = gensim.models.LdaModel.load(f"{model_path}LDA_Model")
dictionary = gensim.corpora.Dictionary.load(f"{model_path}dictionary.dict")
bow_corpus = corpora.MmCorpus(f"{model_path}bow_corpus.mm")

In [3]:
# Top-10 words of each of the 30 topics, as (topic_id, formatted_string) pairs.
topics = lda_model.print_topics(num_topics=30, num_words=10)

# Build the table in one constructor call instead of enlarging an empty frame
# cell by cell with .loc (slow, and leaves every column as object dtype).
# The index is set explicitly to the topic id so that label-based lookups such
# as df_topic.loc[topic_id, ...] keep working regardless of the order in which
# print_topics returns the topics.
df_topic = pd.DataFrame(
    topics,
    columns=["topic_index", "topics"],
    index=[topic_id for topic_id, _ in topics],
)
# df_topic.to_excel("./topic/original_topics.xlsx", index=False)

In [None]:
# df_topic = pd.read_excel("./topic/original_topics.xlsx")

# Hand-assigned aspect for each LDA topic id (keys are topic ids as strings).
# Topic ids that do not appear here get no aspect and end up in df_discard.
topicidx_to_aspect = {
    '0': 'lockdown', '1': 'government', '2': 'lockdown',
    '3': 'protective measures', '4': 'china', '8': 'lockdown',
    '10': 'lockdown', '11': 'treatment', '12': 'support measures',
    '13': 'protective measures', '15': 'lockdown', '16': 'support measures',
    '18': 'support measures', '20': 'quarantine', '21': 'spread',
    '23': 'information', '25': 'support measures', '26': 'lockdown',
    '27': 'spread', '28': 'protective measures', '29': 'government',
}

# Attach the aspect by matching on the topic_index column rather than on the
# index label. Label assignment via .loc silently appends a new all-NaN row when
# an id is not present in the index, and it only lines up when the index happens
# to equal topic_index (not guaranteed if df_topic is re-read from Excel).
aspect_by_topic = {int(topic_id): aspect for topic_id, aspect in topicidx_to_aspect.items()}
df_topic = df_topic.reset_index(drop=True)
df_topic['aspects'] = df_topic['topic_index'].astype(int).map(aspect_by_topic)

# Topics with a missing value in any column (i.e. no aspect assigned) are set aside.
df_discard = df_topic[df_topic.isnull().any(axis=1)].reset_index(drop=True)

# Topics that did receive an aspect.
df_aspect = df_topic.dropna(subset=['aspects']).reset_index(drop=True)

# Sorted unique aspect names, numbered in that order.
aspect_lst = list(np.unique(df_aspect['aspects'].astype(str)))
aspect_to_idx = {aspect: position for position, aspect in enumerate(aspect_lst)}

# Lookup table aspect -> integer id, built in one go from concrete lists
# instead of assigning a dict view through .loc.
df_aspect_idx = pd.DataFrame({
    'aspects': list(aspect_to_idx.keys()),
    'aspect_idx': list(aspect_to_idx.values()),
})
# df_aspect_idx.to_excel("./topic/aspect2idx.xlsx", index=False)
# df_discard.to_excel("./topic/discard.xlsx", index=False)
print(f"It has {len(np.unique(df_aspect['aspects']))} aspects: \n {aspect_to_idx}")

It has 9 aspects: 
 {'china': 0, 'government': 1, 'information': 2, 'lockdown': 3, 'protective measures': 4, 'quarantine': 5, 'spread': 6, 'support measures': 7, 'treatment': 8}


In [None]:
# topicidx_to_aspect, aspect_to_idx, idx_to_aspect = getidx()

# Label every text with its aspect(s) using the LDA-based helpers from
# cleaning_tweets. The corpus is streamed in chunks of 300000 rows so the
# whole file never has to be held in memory at once.
df_chunk = pd.read_csv('textid.csv', header=None, chunksize=300000, dtype=str, encoding='utf-8-sig', engine='python')

file_counts = 0
for chunk in df_chunk:
    chunk_time = time.time()
    chunk.columns = ["text", "text_id"]
    # Rows without a text_id cannot be written to the output; drop them and renumber.
    chunk = chunk.dropna(subset=["text_id"]).reset_index(drop=True)

    # Pipeline: raw text -> preprocess -> dealing_topics -> generate_aspects -> clean_aspects.
    # Intermediate results stay in local Series instead of temporary columns
    # that would have to be dropped again before writing.
    chunk_tokens = chunk["text"].astype(str).apply(preprocess)
    chunk_topics = chunk_tokens.apply(dealing_topics)
    chunk_aspect = chunk_topics.apply(generate_aspects)
    chunk["aspects"] = chunk_aspect.astype(str).apply(clean_aspects)

    # Only (text_id, aspects) is persisted.
    chunk = chunk[["text_id", "aspects"]]
    # The first chunk truncates the output file and later chunks append to it.
    # Appending unconditionally made every re-run of this cell duplicate all
    # rows in textid_ap.csv.
    write_mode = 'w' if file_counts == 0 else 'a'
    chunk.to_csv("textid_ap.csv", mode=write_mode, header=None, index=False, encoding='utf-8-sig')

    print(f'File {file_counts}:')
    print(f'time cost: {(time.time()-chunk_time)//60} minutes')
    file_counts += 1