In [2]:
import pandas as pd
import gensim
from gensim import corpora, models
import nltk
import re

from nltk.tokenize import RegexpTokenizer

# Load data into a pandas DataFrame
df = pd.read_csv('train.csv')
df = df[:int(df.shape[0])]

In [3]:
df['Title'][0]

'Java: Repeat Task Every Random Seconds'

In [4]:
df['Body'][0]

'<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n'

In [6]:
df['input'] = df['Body'] + df['Title']

In [9]:

tag_tokenizer = nltk.RegexpTokenizer('\w+')
df['tags'] = df['Tags'].apply(tag_tokenizer.tokenize)

In [11]:
df['tags'][:10]

0                                       [java, repeat]
1                                     [java, optional]
2    [javascript, image, overlay, react, native, op...
3    [swift, operators, whitespace, ternary, operat...
4    [android, material, design, floating, action, ...
5                      [c, pointers, data, structures]
6                     [javascript, jquery, jquery, ui]
7              [jquery, html, css, twitter, bootstrap]
8    [c, windows, 10, windows, 10, mobile, windows,...
9                                                [vb6]
Name: tags, dtype: object

In [12]:
tag_dict = {}
for ls in df['tags']:
    for tag in ls:
        if tag in tag_dict:
            tag_dict[tag] += 1
        else:
            tag_dict[tag] = 1

In [20]:
list(tag_dict.keys())[:10]

['java',
 'repeat',
 'optional',
 'javascript',
 'image',
 'overlay',
 'react',
 'native',
 'opacity',
 'swift']

In [21]:
sorted_tags = sorted(tag_dict, key = lambda x: tag_dict[x],  reverse = True)

In [22]:
tag2id = {}
id2tag = {}
tag_id = 0

for tag in sorted_tags:
    if tag not in tag2id:
        tag2id[tag] = tag_id
        id2tag[tag_id] = tag
        tag_id += 1

In [31]:
list(id2tag.items())[:10]

[(0, 'c'),
 (1, 'python'),
 (2, 'android'),
 (3, 'javascript'),
 (4, 'java'),
 (5, 'php'),
 (6, 'html'),
 (7, 'sql'),
 (8, 'net'),
 (9, 'css')]

In [37]:
# Modify preprocess function to replace tags with tag IDs
def preprocess(text):
    text = re.sub(r'\b\w{1,3}\b', '', text)  # Remove short words
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'\W', ' ', text)  # Replace non-alphanumeric characters with space
    text = text.lower()  # Convert to lowercase
    words = text.split()  # Split into words

    # Replace tags with tag IDs
    for i in range(len(words)):
        if words[i] in tag2id:
            words[i] = str(tag2id[words[i]])

    return words

In [39]:
lst = []
for i in df['tags']:
    for k in i:
        if k not in lst:
            lst.append(k)

In [40]:
len(lst)

6917

In [49]:
num_topics = 10

model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                        alpha='auto', eta='auto',
                                        num_topics = num_topics,
                                        eval_every=None,
                                        gamma_threshold=0.001, minimum_probability=0.01,
                                        random_state=42, per_word_topics=True)


In [42]:
# Get top 3 tags for each comment
def get_top_tags(comment):
    doc = preprocess(comment)
    doc_bow = dictionary.doc2bow(doc)
    topics = model.get_document_topics(doc_bow)

    top_topics = sorted(topics, key=lambda x: x[1], reverse=True)[:3]
    top_tags = [id2tag[int(topic[0])] for topic in top_topics]
    #top_tags = [topic for topic in top_topics]

    return top_tags

In [43]:
df['out_tags'] = df['input'].apply(get_top_tags)

In [44]:
df[['tags', 'out_tags']]

Unnamed: 0,tags,out_tags
0,"[java, repeat]","[c, css, net]"
1,"[java, optional]","[css, android, net]"
2,"[javascript, image, overlay, react, native, op...","[html, python, java]"
3,"[swift, operators, whitespace, ternary, operat...","[android, c, php]"
4,"[android, material, design, floating, action, ...","[java, javascript, css]"
...,...,...
44995,"[c, asp, net, sql, server]","[c, java, android]"
44996,[python],"[android, css, html]"
44997,"[azure, billing]","[css, c, sql]"
44998,"[ios, swift]","[c, css, php]"
