In [138]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

In [139]:
df = pd.read_csv('./feats_merge_train.csv')

In [140]:
df['topics'].head(5)

0    asteroid,asteroids,challenge,earth,space,u.s.,...
1    apps and software,google,open source,opn pledg...
2        entertainment,nfl,nfl draft,sports,television
3                      sports,video,videos,watercooler
4    entertainment,instagram,instagram video,nfl,sp...
Name: topics, dtype: object

In [141]:
from nltk.stem.porter import PorterStemmer

topics = df['topics'].values

In [142]:
topics

array(['asteroid,asteroids,challenge,earth,space,u.s.,world',
       'apps and software,google,open source,opn pledge,patent lawsuit theater,software patents,tech,u.s.',
       'entertainment,nfl,nfl draft,sports,television', ...,
       'food,hot dogs,humor,photography,watercooler',
       'business,marissa mayer,media,stocks,yahoo',
       'austin,business,curiocity,small business,startups'], dtype=object)

In [143]:
porter = PorterStemmer()
stemmed_topics = []
for topic in topics:
    words = str(topic).split(',')
    stemmed_topics.append([porter.stem(word) for word in words])

In [144]:
stemmed_topics

[['asteroid', 'asteroid', 'challeng', 'earth', 'space', 'u.s.', 'world'],
 ['apps and softwar',
  'googl',
  'open sourc',
  'opn pledg',
  'patent lawsuit theat',
  'software pat',
  'tech',
  'u.s.'],
 ['entertain', 'nfl', 'nfl draft', 'sport', 'televis'],
 ['sport', 'video', 'video', 'watercool'],
 ['entertain', 'instagram', 'instagram video', 'nfl', 'sport'],
 ['govern',
  'internet',
  'internet service provid',
  'lobbi',
  'lobbyist',
  'mobil',
  'polit',
  'startup',
  'u.s.',
  'world'],
 ['entertain', 'funni', 'iggy azalea', 'music', 'parodi', 'video', 'video'],
 ['bill cosbi', 'entertain', 'televis'],
 ['apps and softwar',
  'gadget',
  'mobil',
  'tech',
  'vending machin',
  'vending machin'],
 ['busi', 'media', 'the new york tim', 'paywal'],
 ['astronomi', 'moon', 'saturn', 'space', 'world'],
 ['humor', 'mad men', 'photographi', 'televis', 'watercool'],
 ['apps and softwar', 'nevada', 'tech', 'u.s.'],
 ['appl', 'busi', 'dev & design', 'steve job', 'u.s.'],
 ['entertain',

In [145]:
from sklearn.preprocessing import MultiLabelBinarizer

In [146]:
mlb = MultiLabelBinarizer()
one_hot_topic = mlb.fit_transform(stemmed_topics)

In [147]:
mlb.classes_

array([' missouri ', '"i love it"', '#1connect', ..., 'zunzuneo', 'zynga',
       'zz ward'], dtype=object)

In [148]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, max_iter=50, learning_method='batch', n_jobs=-2, verbose=1)

In [149]:
lda.fit(one_hot_topic)

iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
iteration: 3 of max_iter: 50
iteration: 4 of max_iter: 50
iteration: 5 of max_iter: 50
iteration: 6 of max_iter: 50
iteration: 7 of max_iter: 50
iteration: 8 of max_iter: 50
iteration: 9 of max_iter: 50
iteration: 10 of max_iter: 50
iteration: 11 of max_iter: 50
iteration: 12 of max_iter: 50
iteration: 13 of max_iter: 50
iteration: 14 of max_iter: 50
iteration: 15 of max_iter: 50
iteration: 16 of max_iter: 50
iteration: 17 of max_iter: 50
iteration: 18 of max_iter: 50
iteration: 19 of max_iter: 50
iteration: 20 of max_iter: 50
iteration: 21 of max_iter: 50
iteration: 22 of max_iter: 50
iteration: 23 of max_iter: 50
iteration: 24 of max_iter: 50
iteration: 25 of max_iter: 50
iteration: 26 of max_iter: 50
iteration: 27 of max_iter: 50
iteration: 28 of max_iter: 50
iteration: 29 of max_iter: 50
iteration: 30 of max_iter: 50
iteration: 31 of max_iter: 50
iteration: 32 of max_iter: 50
iteration: 33 of max_iter: 50
iteration: 34 of ma

LatentDirichletAllocation(max_iter=50, n_jobs=-2, verbose=1)

In [168]:
a = lda.transform(one_hot_topic)
a.shape

(27643, 10)

In [195]:
a[0]

array([0.01428571, 0.01428571, 0.01428571, 0.65402923, 0.01428571,
       0.01428575, 0.01428765, 0.01428571, 0.23168308, 0.01428571])

In [196]:
topic_type = np.argsort(-a)[:,:1]

In [197]:
topic_type

array([[3],
       [7],
       [4],
       ...,
       [5],
       [1],
       [2]], dtype=int64)

In [201]:
typ = pd.DataFrame(data=topic_type)

Unnamed: 0,Id,Popularity,Page content,data-channel,title,raw_content,content_length,avg_word_length,topics,author,year,month,day,hour,min,sec,weekends,img_number,link_number,title_length
0,0,-1,"<html><head><div class=""article-info""> <span c...",world,NASA's Grand Challenge: Stop Asteroids From De...,There may be killer asteroids headed for Eart...,583,5.214905,3,,2013,6,19,15,4,30,0,1,22,60
1,1,1,"<html><head><div class=""article-info""><span cl...",tech,Google's New Open Source Patent Pledge: We Won...,Google took a stand of sorts against patent-l...,309,5.032787,7,christina,2013,3,28,17,40,55,0,2,18,74
2,2,1,"<html><head><div class=""article-info""><span cl...",entertainment,Ballin': 2014 NFL Draft Picks Get to Choose Th...,You've spend countless hours training to be a...,1360,4.750225,4,sam-laird,2014,5,7,19,15,20,0,2,11,68
3,3,-1,"<html><head><div class=""article-info""><span cl...",watercooler,Cameraperson Fails Deliver Slapstick Laughs,Tired of the same old sports fails and ne...,476,4.841727,5,sam-laird,2013,10,11,2,26,50,0,1,13,43
4,4,-1,"<html><head><div class=""article-info""><span cl...",entertainment,NFL Star Helps Young Fan Prove Friendship With...,"At 6-foot-5 and 298 pounds, All-Pro NFL star ...",1937,5.08965,4,connor-finnegan,2014,4,17,3,31,43,0,52,16,61


In [206]:
new_type = typ.rename(columns={0: 'topics'})
new_type.to_csv('topics.csv')

In [209]:
new_type

Unnamed: 0,topics
0,3
1,7
2,4
3,5
4,4
...,...
27638,8
27639,0
27640,5
27641,1
