In [1]:
import pandas as pd
import gensim
from gensim import corpora, models
import nltk
import re

from nltk.tokenize import RegexpTokenizer

# Load data into a pandas DataFrame
df = pd.read_csv('train.csv')
df = df[:int(df.shape[0])]

In [46]:
df['Title'][0]

'Java: Repeat Task Every Random Seconds'

In [45]:
df['Body'][0]

'<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n'

In [48]:
df['input'][0]

'Java: Repeat Task Every Random Seconds<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n'

In [4]:
df['input'] = df['Title'] + df['Body']

In [5]:

tag_tokenizer = nltk.RegexpTokenizer('\w+')
df['tags'] = df['Tags'].apply(tag_tokenizer.tokenize)

In [19]:
df['tags']

0                                           [java, repeat]
1                                         [java, optional]
2        [javascript, image, overlay, react, native, op...
3        [swift, operators, whitespace, ternary, operat...
4        [android, material, design, floating, action, ...
                               ...                        
44995                           [c, asp, net, sql, server]
44996                                             [python]
44997                                     [azure, billing]
44998                                         [ios, swift]
44999                                  [c, visual, studio]
Name: tags, Length: 45000, dtype: object

In [20]:
tag_dict = {}
for ls in df['tags']:
    for tag in ls:
        if tag in tag_dict:
            tag_dict[tag] += 1
        else:
            tag_dict[tag] = 1

In [27]:
tag_dict

{'java': 4979,
 'repeat': 12,
 'optional': 19,
 'javascript': 5437,
 'image': 251,
 'overlay': 4,
 'react': 796,
 'native': 484,
 'opacity': 3,
 'swift': 1176,
 'operators': 60,
 'whitespace': 9,
 'ternary': 10,
 'operator': 75,
 'android': 5672,
 'material': 118,
 'design': 146,
 'floating': 43,
 'action': 14,
 'button': 108,
 'c': 7504,
 'pointers': 204,
 'data': 321,
 'structures': 68,
 'jquery': 1461,
 'ui': 99,
 'html': 2650,
 'css': 1677,
 'twitter': 236,
 'bootstrap': 329,
 'windows': 376,
 '10': 77,
 'mobile': 74,
 'universal': 21,
 'vb6': 7,
 'linux': 528,
 'mongodb': 156,
 'ubuntu': 192,
 '14': 65,
 '04': 43,
 'startup': 6,
 'file': 444,
 'ios': 1411,
 'storyboard': 10,
 'uilabel': 16,
 'cocoapods': 38,
 'haskell': 140,
 'cabal': 2,
 'stack': 55,
 'arrays': 1378,
 'standard': 29,
 'library': 78,
 'amazon': 666,
 'web': 791,
 'services': 410,
 'ec2': 65,
 'elastic': 25,
 'beanstalk': 19,
 'arduino': 75,
 'bluebird': 4,
 'powershell': 153,
 'temporary': 2,
 'directory': 56,
 'n

In [33]:
sorted_tags = sorted(tag_dict, key = lambda x: tag_dict[x],  reverse = True)

In [35]:
tag2id = {}
id2tag = {}
tag_id = 0

for tag in sorted_tags:
    if tag not in tag2id:
        tag2id[tag] = tag_id
        id2tag[tag_id] = tag
        tag_id += 1

In [36]:
id2tag

{0: 'c',
 1: 'python',
 2: 'android',
 3: 'javascript',
 4: 'java',
 5: 'php',
 6: 'html',
 7: 'sql',
 8: 'net',
 9: 'css',
 10: 'jquery',
 11: 'ios',
 12: 'arrays',
 13: 'angular',
 14: 'js',
 15: '3',
 16: 'studio',
 17: 'mysql',
 18: 'swift',
 19: 'google',
 20: 'server',
 21: 'asp',
 22: 'x',
 23: 'r',
 24: 'reactjs',
 25: 'regex',
 26: 'node',
 27: 'visual',
 28: 'react',
 29: 'string',
 30: 'json',
 31: 'web',
 32: 'spring',
 33: 'ruby',
 34: 'core',
 35: 'docker',
 36: 'typescript',
 37: 'amazon',
 38: 'laravel',
 39: '2',
 40: 'firebase',
 41: 'excel',
 42: 'list',
 43: 'linux',
 44: 'api',
 45: 'database',
 46: 'native',
 47: 'file',
 48: 'pandas',
 49: 'git',
 50: 'services',
 51: 'apache',
 52: 'django',
 53: 'flutter',
 54: '7',
 55: 'mvc',
 56: 'windows',
 57: 'vba',
 58: 'azure',
 59: 'xcode',
 60: 'vue',
 61: 'go',
 62: 'kotlin',
 63: 'code',
 64: 'selenium',
 65: 'rails',
 66: 'bootstrap',
 67: '5',
 68: 'framework',
 69: 'data',
 70: 'cloud',
 71: 'on',
 72: 'bash',
 7

In [37]:
# Modify preprocess function to replace tags with tag IDs
def preprocess(text):
    text = re.sub(r'\b\w{1,3}\b', '', text)  # Remove short words
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'\W', ' ', text)  # Replace non-alphanumeric characters with space
    text = text.lower()  # Convert to lowercase
    words = text.split()  # Split into words

    # Replace tags with tag IDs
    for i in range(len(words)):
        if words[i] in tag2id:
            words[i] = str(tag2id[words[i]])

    return words

In [39]:
lst = []
for i in df['tags']:
    for k in i:
        if k not in lst:
            lst.append(k)

In [40]:
len(lst)

6917

In [49]:
num_topics = 10

model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                        alpha='auto', eta='auto',
                                        num_topics = num_topics,
                                        eval_every=None,
                                        gamma_threshold=0.001, minimum_probability=0.01,
                                        random_state=42, per_word_topics=True)


In [42]:
# Get top 3 tags for each comment
def get_top_tags(comment):
    doc = preprocess(comment)
    doc_bow = dictionary.doc2bow(doc)
    topics = model.get_document_topics(doc_bow)

    top_topics = sorted(topics, key=lambda x: x[1], reverse=True)[:3]
    top_tags = [id2tag[int(topic[0])] for topic in top_topics]
    #top_tags = [topic for topic in top_topics]

    return top_tags

In [43]:
df['out_tags'] = df['input'].apply(get_top_tags)

In [44]:
df[['tags', 'out_tags']]

Unnamed: 0,tags,out_tags
0,"[java, repeat]","[c, css, net]"
1,"[java, optional]","[css, android, net]"
2,"[javascript, image, overlay, react, native, op...","[html, python, java]"
3,"[swift, operators, whitespace, ternary, operat...","[android, c, php]"
4,"[android, material, design, floating, action, ...","[java, javascript, css]"
...,...,...
44995,"[c, asp, net, sql, server]","[c, java, android]"
44996,[python],"[android, css, html]"
44997,"[azure, billing]","[css, c, sql]"
44998,"[ios, swift]","[c, css, php]"
