In [151]:
import pandas as pd
import gensim
from gensim import corpora, models
import nltk
import re

from nltk.tokenize import RegexpTokenizer

# Load data into a pandas DataFrame
df = pd.read_csv('train.csv')
df = df[:int(df.shape[0])]

In [152]:
df

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ
...,...,...,...,...,...,...
44995,60461435,Convert List<String> to string C# - asp.net - ...,<p>I am new to this and I am asking for help t...,<c#><asp.net><sql-server>,2020-02-29 02:22:18,LQ_CLOSE
44996,60461754,Does Python execute code from the top or botto...,<p>I am working on learning Python and was won...,<python>,2020-02-29 03:33:59,LQ_CLOSE
44997,60462001,how to change payment date in Azure?,<p>It looks like it costs 8 days per month in ...,<azure><billing>,2020-02-29 04:34:16,LQ_CLOSE
44998,60465318,how to implement fill in the blank in Swift,"<p>""I _____ any questions.""</p>\n\n<p>I want t...",<ios><swift>,2020-02-29 12:50:43,LQ_CLOSE


In [153]:

tag_tokenizer = nltk.RegexpTokenizer('\w+')
df['tags'] = df['Tags'].apply(tag_tokenizer.tokenize)

In [154]:
tag2id = {}
id2tag = {}
tag_id = 0

for tags in df['tags']:
    for tag in tags:
        if tag not in tag2id:
            tag2id[tag] = tag_id
            id2tag[tag_id] = tag
            tag_id += 1

In [155]:
# Modify preprocess function to replace tags with tag IDs
def preprocess(text):
    text = re.sub(r'\b\w{1,3}\b', '', text)  # Remove short words
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'\W', ' ', text)  # Replace non-alphanumeric characters with space
    text = text.lower()  # Convert to lowercase
    words = text.split()  # Split into words

    # Replace tags with tag IDs
    for i in range(len(words)):
        if words[i] in tag2id:
            words[i] = str(tag2id[words[i]])

    return words

In [156]:
documents = [preprocess(comment) for comment in df['Title']]
for tags in df['tags']:
    tag_ids = [str(tag2id[tag]) for tag in tags]
    documents.append(tag_ids)

# Create a dictionary of all words in the documents
dictionary = corpora.Dictionary(documents)

# Create a bag of words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [157]:
num_topics = 10
passes = 10

model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                        num_topics=num_topics, passes=passes,
                                        alpha='auto', eta='auto',
                                        eval_every=None,
                                        gamma_threshold=0.001, minimum_probability=0.01,
                                        random_state=42, per_word_topics=True)


In [173]:
# Get top 3 tags for each comment
def get_top_tags(comment):
    doc = preprocess(comment)
    doc_bow = dictionary.doc2bow(doc)
    topics = model.get_document_topics(doc_bow)

    top_topics = sorted(topics, key=lambda x: x[1], reverse=True)[:3]
    top_tags = [id2tag[int(topic[0])] for topic in top_topics]

    return top_tags

In [174]:
df['out_tags'] = df['Title'].apply(get_top_tags)

In [177]:
df[['tags', 'out_tags']]

Unnamed: 0,tags,out_tags
0,"[java, repeat]","[repeat, react, javascript]"
1,"[java, optional]","[opacity, react, native]"
2,"[javascript, image, overlay, react, native, op...","[javascript, native, optional]"
3,"[swift, operators, whitespace, ternary, operat...","[opacity, repeat, react]"
4,"[android, material, design, floating, action, ...","[javascript, react, native]"
...,...,...
44995,"[c, asp, net, sql, server]","[swift, opacity, overlay]"
44996,[python],"[swift, overlay, javascript]"
44997,"[azure, billing]","[image, react, java]"
44998,"[ios, swift]","[optional, repeat, react]"
