In [1]:
import nltk
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_test = fetch_20newsgroups(subset="test")

In [3]:
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

In [4]:
type(newsgroups_train)

In [5]:
df = pd.DataFrame({"text": newsgroups_train.data, "target": newsgroups_train.target})
targ_df = pd.DataFrame({"title": newsgroups_train.target_names})
ngout = pd.merge(df, targ_df, left_on="target", right_index=True)

In [6]:
display(ngout)

In [7]:
sdf = spark.createDataFrame(ngout)

In [8]:
display(sdf)

In [9]:
from pyspark.sql.functions import split
from pyspark.sql.functions import monotonically_increasing_id

In [10]:
# After splitting on the basis of two newline characters, we will have mainly three items in the list.
# We are not interested in the 1st one so will will take 2nd and 3rd. We are also creating a column with increasing id. 
# number form 0 to number of items available
# This "id" column will help in identifying the parent of the seperated texts.
sdf = sdf.withColumn("text_sep", split(sdf.text, "\n\n")).select(col("text"), col("target"), col("title"), col("text_sep").getItem(1), col("text_sep").getItem(2)).withColumn("id", monotonically_increasing_id())

In [11]:
display(sdf)

In [12]:
sdf.createOrReplaceTempView("newsgroup")

In [13]:
%sql
--note that its not single quotes below
--its the apostrophe sign
select count(*) from newsgroup where `text_sep[2]` is null

count(1)
565


In [14]:
%sql
select count(*) from newsgroup where `text_sep[1]` = ''

count(1)
262


In [15]:
import re
from pyspark.sql.types import FloatType

def clean_text(in_string):
  remove_email = re.sub("\S*@\S*\s?", "", in_string)
  remove_nl = re.sub("\s+", " ", remove_email)
  remove_othr = re.sub("\'|\>|\:|\-", "", remove_nl)
  return remove_othr

spark.udf.register("clean", clean_text)

In [16]:
%sql
select clean(CASE when `text_sep[2]` is null then `text_sep[1]` when `text_sep[1]`='' then `text_sep[2]` else CONCAT(`text_sep[1]`, ' ', `text_sep[2]`) end) as text, target, title, id from newsgroup where `text_sep[2]` is not null and `text_sep[1]` <> ''

text,target,title,id
"I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please email. Thanks, IL brought to you by your neighborhood Lerxst",7.0,rec.autos,0.0
"(Trevor Corkum) writes Is it only me, or is safety not one of the most important factors when buying a car? It depends on your priorities. A lot of people put higher priorities on gas mileage and cost than on safety, buying ""unsafe"" econoboxes instead of Volvos. I personally take a middle ground the only thing I really look for is a threepoint seatbelt and 5+mph bumpers. I figure that 30mph collisions into brick walls arent common enough for me to spend that much extra money for protection, but there are lots of lowspeed collisions that do worry me.",7.0,rec.autos,2.0
"an excellent automatic can be found in the subaru legacy. it switches to ""sport"" mode when the electronics figure it, not when the driver sets the switch.. which is the proper way to do it, IMO. so what does ""sport"" mode entail? several things 1) revving to red line (or to the rev limiter in the case of the legacy)",7.0,rec.autos,3.0
"Ford and his automobile. I need information on whether Ford is partially responsible for all of the car accidents and the depletion of the ozone layer. Also, any other additional information will be greatly appreciated. Thanks. SSSSSoooooooooooo!!!!! Its all HIS fault!! Thank God Louis Chevrolet is innocent! and that guy Diesel, HE otto feel guilty! Stephen Phillips Atlanta Response Center Atlanta, Ga. Home of the Braves!",7.0,rec.autos,4.0
"In article (Matthew MacIntyre at the National University of Senegal) writes (James P. Callison) writes Im not going to argue the issue of carrying weapons, but I would ask you if you would have thought seriously about shooting a kid for setting off your alarm? I can think of worse things in the world. Glad you got out of there before they did anything to give you a reason to fire your gun. I think people have a right to kill to defend their property. Why not? Be honest do you really care more about scum than about your car? Yo! Watch the attributionsI didnt say that!",7.0,rec.autos,5.0
"In article (Aviad Sheinfeld) writes Do you think I can use a electric drill( change to a suitable bit ) to turn it out? If I can succeed, can I retighten it not too tight, is it safe without oil leak? Tighten the bolt to the specified torque in your service manual. That way it wont leak, strip, break, etc. (hopefully ) ) Thank you very much in advance Winson Aviad You can avoid these problems entirely by installing an oil drain valve in place of the bolt. I have one on both of my cars. There have been no leaks in 210,000 miles (combined miles on both cars). Ron DeBlock (thats a number 1 in rdb1, not letter l) AT&T Bell Labs Somerset, NJ USA",7.0,rec.autos,6.0
"I have a 1986 Acura Integra 5 speed with 95,000 miles on it. It is positively the worst car I have ever owned. I had an 83 Prelude that had 160k miles on it when I sold it, and it was still going strong . This is with religious attention to maintenance such as oil changes etc. Both cars were driven in exactly the same manner.. 1. It has gone through two clutches (which are underrated.) 2. 3 sets of tires (really eats tires in the front even with careful align) 3. All struts started leaking about 2530k miles 4. Windshield wiper motor burned up (service note on this one) 5. Seek stop working on radio about 20k miles 6. Two timing belts. 7. Constant error signals from computer.",7.0,rec.autos,7.0
"In article (THUNDERBIRDS ARE GO !!!) writes Are there any MR2 owners or motorhead gurus out there, that know why my MR2s engine sounds noisy? The MR2s engine is noisy at the best of times, but not even a nice nose its one of those very ugly noises. assuming yours is a non turbo MR2, the gruffness is characteristic of a large inline 4 that doesnt have balance shafts. i guess toyota didnt care about ""little"" details like that when they can brag about the mid engine configuration and the flashy styling.",7.0,rec.autos,8.0
"In article (SCOTT WARREN ROSANDER) writes | In article (George Hei | nz) writes | After too many years of school Im finally graduating and getting a real | job. Of course I am trying to make plans of how to spend all this extra | money. Right now I have an 89 accord, a good car, but not real sporty & | I was thinking of selling it in about two years and dropping around | $20k on a sports car of some kind. After thinking about it, I may have a | better idea Ill keep the Accord until it drops and buy the car Ive | always wanted a Corvette Stingray. My reasoning is that $8000 (accord)+ | $8000 (corvette) =$16000 is less than what I would spend anyway. | | Basically, Im thinking of a late 70s, early 80s for around $7$10k. | My question is, what are good years to consider (for reliability, looks, | horsepower in that order, believe it or not, horsepower is not a main | concern, if I want to go fast, I get on my motorcycle) and what are | good prices? | | Also, what would insurance look like? Im male, single, 23 (I might | wait until Im 25 to get the car = lower insurance). Would the fact that | I mainly drive the other car lower it? Is there some type of ""classic | car"" or ""rarely driven"" insurance class for driving it under 10k miles | per year? | | My dad has a 66 vette and its on what you say classic insurance. | Basically what that means is that it has restricted amount of driving | time, which basically means it cant be used as an every day car and would | probably suit your needs for limited mileage. | In addition to restricted mileage, many classic insurance carriers also require that the vehicle be garaged when not in use.",7.0,rec.autos,9.0
"(Craig ""Powderkeg"" DeForest) writes If youre planning on making long drives, the 20W50 is probably fine (esp. in the summer) in your 10W40 car. But if youre making short drives, stick to the 10W40.",7.0,rec.autos,10.0


In [17]:
# same in dataframe
sdf = spark.sql("select clean(CASE when `text_sep[2]` is null then `text_sep[1]` when `text_sep[1]`='' then `text_sep[2]` else CONCAT(`text_sep[1]`, ' ', `text_sep[2]`) end) as text, target, title, id from newsgroup where `text_sep[2]` is not null and `text_sep[1]` <> ''")

In [18]:
sdf.count()

In [19]:
# Checking text which are quite small and probably of no use
from pyspark.sql.functions import col, length
display(sdf.where(length(col('text')) < 100))

In [20]:
# Removing text of small length
sdf = sdf.where(length(col("text")) > 100)

In [21]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W+", minTokenLength=4, toLowercase=True)
tokenized = tokenizer.transform(sdf)

In [22]:
display(tokenized)

In [23]:
spremover = StopWordsRemover(inputCol="tokens", outputCol="spfiltered")
spremoved = spremover.transform(tokenized)

display(spremoved.select("tokens", "spfiltered"))

In [24]:
# We imported these two packages ate the top
porter = PorterStemmer()
lemma = WordNetLemmatizer()

def word_tokenize(text):
  pos = nltk.pos_tag(text)
  final = [lemma.lemmatize(word[0]) if (lemma.lemmatize(word[0]).endswith(("e","ion")) or len(word[0]) < 4) else porter.stem(word[0]) for word in pos]
  return final

In [25]:
# We will aplly the above word_tokenize function to all the words inside the list of list

stemmed = spremoved.rdd.map(lambda tup: (tup[1], tup[2], tup[3], word_tokenize(tup[5])))

In [26]:
stemmed.collect()

In [27]:
news_df = stemmed.toDF(schema = ["target", "title", "id", "word"])

In [28]:
display(news_df)

In [29]:
# If we see in the data, there are still words which can be stopwords/unnecessary words
# and we would like to remove them
# We can make a stopwords list of our own and use that
spwordlist = ["article", "write", "entry", "date", "udel", "said", "tell", "think", "know", "just", "isnt", "line", "like", "does", "going", "make", "thanks", "also"]

spremover1 = StopWordsRemover(inputCol="word", outputCol="word_new", stopWords=spwordlist)
news_df = spremover1.transform(news_df)
display(news_df.select("word", "word_new"))

word,word_new
"List(wonder, anyone, enlighten, 2door, sport, look, late, earli, call, bricklin, door, realli, small, addition, front, bumper, separate, rest, bodi, know, anyone, tellme, model, name, engine, spec, year, production, made, histori, whatev, info, funki, look, please, email, thank, brought, neighborhood, lerxst)","List(wonder, anyone, enlighten, 2door, sport, look, late, earli, call, bricklin, door, realli, small, addition, front, bumper, separate, rest, bodi, anyone, tellme, model, name, engine, spec, year, production, made, histori, whatev, info, funki, look, please, email, thank, brought, neighborhood, lerxst)"
"List(trevor, corkum, write, safeti, import, factor, buy, depend, prioriti, people, higher, prioriti, mileage, cost, safeti, buy, unsafe, econobox, instead, volvo, person, take, middle, ground, thing, realli, look, threepoint, seatbelt, bumper, figure, 30mph, collision, brick, wall, arent, common, enough, spend, much, extra, money, protection, lot, lowspe, collision, worri)","List(trevor, corkum, safeti, import, factor, buy, depend, prioriti, people, higher, prioriti, mileage, cost, safeti, buy, unsafe, econobox, instead, volvo, person, take, middle, ground, thing, realli, look, threepoint, seatbelt, bumper, figure, 30mph, collision, brick, wall, arent, common, enough, spend, much, extra, money, protection, lot, lowspe, collision, worri)"
"List(excel, automat, found, subaru, legaci, switch, sport, mode, electron, figure, driver, set, switch, proper, sport, mode, entail, sever, thing, rev, line, limit, case, legaci)","List(excel, automat, found, subaru, legaci, switch, sport, mode, electron, figure, driver, set, switch, proper, sport, mode, entail, sever, thing, rev, limit, case, legaci)"
"List(ford, automobile, need, information, whether, ford, partial, responsible, accid, depletion, ozone, layer, also, addit, information, greatli, appreci, thank, sssssoooooooooooo, fault, thank, loui, chevrolet, innoc, diesel, otto, feel, guilti, stephen, phillip, atlanta, response, center, atlanta, home, brave)","List(ford, automobile, need, information, whether, ford, partial, responsible, accid, depletion, ozone, layer, addit, information, greatli, appreci, thank, sssssoooooooooooo, fault, thank, loui, chevrolet, innoc, diesel, otto, feel, guilti, stephen, phillip, atlanta, response, center, atlanta, home, brave)"
"List(article, matthew, macintyre, nation, univers, seneg, write, jame, callison, write, go, argue, issue, carri, weapon, thought, serious, shoot, set, alarm, think, worse, thing, world, glad, anyth, give, reason, fire, think, people, right, kill, defend, properti, honest, realli, care, scum, watch, attributionsi, didnt)","List(matthew, macintyre, nation, univers, seneg, jame, callison, go, argue, issue, carri, weapon, thought, serious, shoot, set, alarm, worse, thing, world, glad, anyth, give, reason, fire, people, right, kill, defend, properti, honest, realli, care, scum, watch, attributionsi, didnt)"
"List(article, aviad, sheinfeld, write, think, electr, drill, change, suitable, turn, succeed, retighten, tight, safe, without, leak, tighten, bolt, specifi, torque, service, manual, wont, leak, strip, break, hope, thank, much, advance, winson, aviad, avoid, problem, entir, instal, drain, valve, place, bolt, car, leak, mile, combin, mile, car, deblock, that, number, rdb1, letter, bell, lab, somerset)","List(aviad, sheinfeld, electr, drill, change, suitable, turn, succeed, retighten, tight, safe, without, leak, tighten, bolt, specifi, torque, service, manual, wont, leak, strip, break, hope, thank, much, advance, winson, aviad, avoid, problem, entir, instal, drain, valve, place, bolt, car, leak, mile, combin, mile, car, deblock, that, number, rdb1, letter, bell, lab, somerset)"
"List(1986, acura, integra, speed, mile, posit, worst, ever, own, prelude, 160k, mile, sold, still, go, strong, religi, attention, maintenance, change, car, driven, exactli, manner, gone, clutch, underr, set, tire, realli, eat, tire, front, even, care, align, strut, start, leak, 2530k, mile, windshield, wiper, motor, burn, service, note, seek, stop, work, radio, mile, time, belt, constant, error, signal, comput)","List(1986, acura, integra, speed, mile, posit, worst, ever, own, prelude, 160k, mile, sold, still, go, strong, religi, attention, maintenance, change, car, driven, exactli, manner, gone, clutch, underr, set, tire, realli, eat, tire, front, even, care, align, strut, start, leak, 2530k, mile, windshield, wiper, motor, burn, service, note, seek, stop, work, radio, mile, time, belt, constant, error, signal, comput)"
"List(article, thunderbird, write, owner, motorhead, guru, know, mr2, engine, sound, noisi, mr2, engine, noisi, best, time, even, nice, nose, ugli, noise, assum, turbo, gruff, characterist, large, inline, doesnt, balance, shaft, guess, toyota, didnt, care, little, detail, like, brag, engine, configuration, flashi, style)","List(thunderbird, owner, motorhead, guru, mr2, engine, sound, noisi, mr2, engine, noisi, best, time, even, nice, nose, ugli, noise, assum, turbo, gruff, characterist, large, inline, doesnt, balance, shaft, guess, toyota, didnt, care, little, detail, brag, engine, configuration, flashi, style)"
"List(article, scott, warren, rosand, write, article, george, write, mani, year, school, final, graduat, get, real, course, tri, make, plan, spend, extra, money, right, accord, good, real, sporti, think, sell, year, drop, around, sport, kind, think, better, idea, keep, accord, drop, alway, want, corvette, stingray, reason, 8000, accord, 8000, corvette, 16000, le, spend, anyway, basic, think, late, earli, around, question, good, year, consid, reliabl, look, horsepow, order, believe, horsepow, main, concern, want, fast, motorcycle, good, price, also, insurance, look, like, male, single, might, wait, lower, insurance, fact, mainli, drive, lower, type, classic, rare, driven, insurance, class, drive, mile, year, vette, classic, insurance, basic, mean, restrict, amount, drive, time, basic, mean, cant, use, everi, probabl, suit, need, limit, mileage, addition, restrict, mileage, mani, classic, insurance, carrier, also, require, vehicle, garag)","List(scott, warren, rosand, george, mani, year, school, final, graduat, get, real, course, tri, plan, spend, extra, money, right, accord, good, real, sporti, sell, year, drop, around, sport, kind, better, idea, keep, accord, drop, alway, want, corvette, stingray, reason, 8000, accord, 8000, corvette, 16000, le, spend, anyway, basic, late, earli, around, question, good, year, consid, reliabl, look, horsepow, order, believe, horsepow, main, concern, want, fast, motorcycle, good, price, insurance, look, male, single, might, wait, lower, insurance, fact, mainli, drive, lower, type, classic, rare, driven, insurance, class, drive, mile, year, vette, classic, insurance, basic, mean, restrict, amount, drive, time, basic, mean, cant, use, everi, probabl, suit, need, limit, mileage, addition, restrict, mileage, mani, classic, insurance, carrier, require, vehicle, garag)"
"List(craig, powderkeg, deforest, write, youre, plan, make, long, drive, 20w50, probabl, fine, summer, 10w40, youre, make, short, drive, stick, 10w40)","List(craig, powderkeg, deforest, youre, plan, long, drive, 20w50, probabl, fine, summer, 10w40, youre, short, drive, stick, 10w40)"


In [30]:
# to count the frequency of the words we will explode the list into rows
# Each item in the list will be a new row
# This is for Tf-idf
# This will remove the existing word_new column

df_explode = news_df.withColumn("word_new", explode("word_new"))

In [31]:
display(df_explode)

In [32]:
news_df.count()

In [33]:
df_explode.count()

In [34]:
df_explode.createOrReplaceTempView("topwords")

In [35]:
%sql 
select word_new, count(*) as freq from topwords group by word_new order by freq desc

word_new,freq
dont,1989
use,1905
maxaxaxaxaxaxaxaxaxaxaxaxaxaxax,1888
people,1773
time,1648
work,1440
anyone,1322
look,1306
good,1279
year,1262


In [36]:
from pyspark.ml.feature import CountVectorizer

# Take only 10000 words, min term frquency across all the docs = 5
cv = CountVectorizer(inputCol="word_new", outputCol="rawFeatures", vocabSize=10000, minDF=5)
cvmodel = cv.fit(news_df)
featurized_data = cvmodel.transform(news_df)

In [37]:
# Th output in "rawFeatures" column will be in this format
# [0, len(vocabulary), [integers indices of words in voacabulary], [number of occurances of corresposning words]]
# Dummy Example:
# [0,7533,[6,7,9,26,28,31,33],[2,2,1,1,1,1,1]]
# there are total 7533 words in vovabulary
# the index of first word is 6 in vocabulary and it came twice in document
# the index of third word is 9 in vocabulary and it came once in document
display(featurized_data)

In [38]:
# Saving the vocabulary as we will need it to convert 
# integers to original text
vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)

In [39]:
from pyspark.ml.feature import IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurized_data)
rescaledData = idfModel.transform(featurized_data)

In [40]:
display(rescaledData)

target,title,id,word,word_new,rawFeatures,features
7,rec.autos,0,"List(wonder, anyone, enlighten, 2door, sport, look, late, earli, call, bricklin, door, realli, small, addition, front, bumper, separate, rest, bodi, know, anyone, tellme, model, name, engine, spec, year, production, made, histori, whatev, info, funki, look, please, email, thank, brought, neighborhood, lerxst)","List(wonder, anyone, enlighten, 2door, sport, look, late, earli, call, bricklin, door, realli, small, addition, front, bumper, separate, rest, bodi, anyone, tellme, model, name, engine, spec, year, production, made, histori, whatev, info, funki, look, please, email, thank, brought, neighborhood, lerxst)","List(0, 7533, List(6, 7, 9, 26, 28, 31, 34, 74, 77, 84, 121, 211, 230, 244, 408, 420, 489, 512, 525, 549, 571, 618, 712, 792, 837, 1033, 1260, 1326, 1775, 2454, 3599, 3742, 7052), List(2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(6, 7, 9, 26, 28, 31, 34, 74, 77, 84, 121, 211, 230, 244, 408, 420, 489, 512, 525, 549, 571, 618, 712, 792, 837, 1033, 1260, 1326, 1775, 2454, 3599, 3742, 7052), List(4.150265033706053, 4.362624737788511, 2.353845919322047, 2.6074821503641106, 2.4611322901730874, 2.5786141663632587, 2.606088421915507, 3.109513456863326, 3.00669072085797, 3.084483708797491, 3.18610590234471, 3.649168502236211, 3.849839197698362, 3.7144978498286, 4.248084057833039, 4.528597640806208, 4.370373635591314, 4.171922696867475, 4.322745586602059, 4.233798100585563, 4.322745586602059, 4.618209799495895, 4.482077625171315, 4.739906734473415, 4.8387525691100475, 4.865069877427421, 5.08821342874163, 5.212266077411609, 5.444888372680363, 5.850353480788527, 6.985333413627512, 6.474507789861521, 7.390798521735676))"
7,rec.autos,2,"List(trevor, corkum, write, safeti, import, factor, buy, depend, prioriti, people, higher, prioriti, mileage, cost, safeti, buy, unsafe, econobox, instead, volvo, person, take, middle, ground, thing, realli, look, threepoint, seatbelt, bumper, figure, 30mph, collision, brick, wall, arent, common, enough, spend, much, extra, money, protection, lot, lowspe, collision, worri)","List(trevor, corkum, safeti, import, factor, buy, depend, prioriti, people, higher, prioriti, mileage, cost, safeti, buy, unsafe, econobox, instead, volvo, person, take, middle, ground, thing, realli, look, threepoint, seatbelt, bumper, figure, 30mph, collision, brick, wall, arent, common, enough, spend, much, extra, money, protection, lot, lowspe, collision, worri)","List(0, 7533, List(3, 7, 15, 22, 29, 34, 63, 113, 158, 249, 354, 400, 443, 446, 469, 552, 620, 623, 636, 769, 775, 777, 819, 939, 983, 1065, 1110, 1487, 1976, 3742, 4180, 4252, 5423, 6412, 6459, 7177), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0))","List(0, 7533, List(3, 7, 15, 22, 29, 34, 63, 113, 158, 249, 354, 400, 443, 446, 469, 552, 620, 623, 636, 769, 775, 777, 819, 939, 983, 1065, 1110, 1487, 1976, 3742, 4180, 4252, 5423, 6412, 6459, 7177), List(2.0974936970111835, 2.1813123688942553, 2.3823879226615317, 2.4326267971751614, 2.620113897270011, 2.606088421915507, 2.9699518952122124, 3.2854041233269906, 3.5841360319653566, 3.874290293562526, 3.9198678020588456, 4.1011536259792685, 4.1011536259792685, 4.55758517767946, 4.192125404184995, 4.277283212525302, 4.4118733664980665, 4.330527727044114, 8.857935599714732, 4.660769413914691, 9.811783743895353, 4.682748320633466, 4.649958497810475, 4.763717383167133, 4.775838743699478, 5.105020547058012, 4.934062748914372, 5.231314272382304, 5.8867211249594025, 6.474507789861521, 13.569325436330722, 6.697651341175731, 13.970666827255023, 7.390798521735676, 7.236647841908418, 7.390798521735676))"
7,rec.autos,3,"List(excel, automat, found, subaru, legaci, switch, sport, mode, electron, figure, driver, set, switch, proper, sport, mode, entail, sever, thing, rev, line, limit, case, legaci)","List(excel, automat, found, subaru, legaci, switch, sport, mode, electron, figure, driver, set, switch, proper, sport, mode, entail, sever, thing, rev, limit, case, legaci)","List(0, 7533, List(15, 67, 96, 138, 162, 282, 310, 324, 443, 544, 591, 617, 618, 901, 917, 3795, 5068), List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(15, 67, 96, 138, 162, 282, 310, 324, 443, 544, 591, 617, 618, 901, 917, 3795, 5068), List(2.3823879226615317, 3.044830936877497, 3.5335837528025253, 3.3419163335903326, 3.4395548031542487, 8.27826574808897, 8.31735494023491, 4.000774440671646, 4.1011536259792685, 4.473027789651397, 4.500426763839512, 4.567437474122472, 9.23641959899179, 4.751741192120417, 4.825849164274139, 6.784662718165361, 6.985333413627512))"
7,rec.autos,4,"List(ford, automobile, need, information, whether, ford, partial, responsible, accid, depletion, ozone, layer, also, addit, information, greatli, appreci, thank, sssssoooooooooooo, fault, thank, loui, chevrolet, innoc, diesel, otto, feel, guilti, stephen, phillip, atlanta, response, center, atlanta, home, brave)","List(ford, automobile, need, information, whether, ford, partial, responsible, accid, depletion, ozone, layer, addit, information, greatli, appreci, thank, sssssoooooooooooo, fault, thank, loui, chevrolet, innoc, diesel, otto, feel, guilti, stephen, phillip, atlanta, response, center, atlanta, home, brave)","List(0, 7533, List(13, 28, 81, 185, 190, 251, 344, 357, 394, 690, 925, 945, 970, 975, 997, 1214, 1220, 1305, 1533, 1557, 1653, 2102, 2566, 2739, 3515, 4923, 5825), List(1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(13, 28, 81, 185, 190, 251, 344, 357, 394, 690, 925, 945, 970, 975, 997, 1214, 1220, 1305, 1533, 1557, 1653, 2102, 2566, 2739, 3515, 4923, 5825), List(2.3208466504830008, 4.922264580346175, 6.2562372893887215, 3.587846611361892, 3.6894965476231825, 3.7401402804419375, 4.113653788743499, 3.8792530829046554, 4.0467595539134695, 4.403434497852202, 4.89209854981534, 5.139506723129181, 5.05542360591864, 4.905891871947676, 4.905891871947676, 5.039423264572198, 5.039423264572198, 10.751791002386822, 11.037992689668169, 5.375895501193411, 5.544971831237345, 5.815262160977257, 6.004504160615785, 5.963682166095531, 6.879972897969686, 6.985333413627512, 7.390798521735676))"
7,rec.autos,5,"List(article, matthew, macintyre, nation, univers, seneg, write, jame, callison, write, go, argue, issue, carri, weapon, thought, serious, shoot, set, alarm, think, worse, thing, world, glad, anyth, give, reason, fire, think, people, right, kill, defend, properti, honest, realli, care, scum, watch, attributionsi, didnt)","List(matthew, macintyre, nation, univers, seneg, jame, callison, go, argue, issue, carri, weapon, thought, serious, shoot, set, alarm, worse, thing, world, glad, anyth, give, reason, fire, people, right, kill, defend, properti, honest, realli, care, scum, watch, attributionsi, didnt)","List(0, 7533, List(3, 15, 16, 34, 51, 54, 85, 91, 92, 111, 117, 133, 183, 203, 218, 231, 271, 306, 351, 364, 460, 591, 721, 743, 918, 942, 958, 1128, 1314, 1659, 2423, 3087, 3192, 3857), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(3, 15, 16, 34, 51, 54, 85, 91, 92, 111, 117, 133, 183, 203, 218, 231, 271, 306, 351, 364, 460, 591, 721, 743, 918, 942, 958, 1128, 1314, 1659, 2423, 3087, 3192, 3857), List(2.0974936970111835, 2.3823879226615317, 2.474473907110662, 2.606088421915507, 2.821255513390736, 2.8475037394656724, 3.046993099881992, 3.107211959875047, 3.147076558438975, 3.1615346416142045, 3.3303555111892567, 3.6452237239451946, 3.618037583641038, 3.6373805464841684, 3.6772264550313682, 3.75321236200929, 4.088807790156968, 4.040894434461071, 3.9951721851229762, 4.088807790156968, 4.519118896851664, 4.500426763839512, 4.716649872309147, 4.693921621231591, 4.81311013849671, 4.948451486366472, 4.751741192120417, 5.39836835704547, 5.105020547058012, 5.3539165944746365, 5.815262160977257, 6.40996926872395, 6.18682571740974, 6.879972897969686))"
7,rec.autos,6,"List(article, aviad, sheinfeld, write, think, electr, drill, change, suitable, turn, succeed, retighten, tight, safe, without, leak, tighten, bolt, specifi, torque, service, manual, wont, leak, strip, break, hope, thank, much, advance, winson, aviad, avoid, problem, entir, instal, drain, valve, place, bolt, car, leak, mile, combin, mile, car, deblock, that, number, rdb1, letter, bell, lab, somerset)","List(aviad, sheinfeld, electr, drill, change, suitable, turn, succeed, retighten, tight, safe, without, leak, tighten, bolt, specifi, torque, service, manual, wont, leak, strip, break, hope, thank, much, advance, winson, aviad, avoid, problem, entir, instal, drain, valve, place, bolt, car, leak, mile, combin, mile, car, deblock, that, number, rdb1, letter, bell, lab, somerset)","List(0, 7533, List(12, 22, 28, 45, 80, 89, 99, 141, 164, 214, 264, 313, 327, 367, 407, 435, 575, 669, 713, 716, 765, 1145, 1256, 1281, 1405, 1638, 2033, 2058, 2077, 2337, 2520, 2628, 2783, 3328, 3884, 3895, 4499, 6370, 7008), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(12, 22, 28, 45, 80, 89, 99, 141, 164, 214, 264, 313, 327, 367, 407, 435, 575, 669, 713, 716, 765, 1145, 1256, 1281, 1405, 1638, 2033, 2058, 2077, 2337, 2520, 2628, 2783, 3328, 3884, 3895, 4499, 6370, 7008), List(2.4209852221596755, 2.4326267971751614, 2.4611322901730874, 2.932582748704248, 3.044830936877497, 3.0688758111315, 3.1961059856792935, 3.4395548031542487, 3.4173668881788863, 3.7102873172922566, 3.9095584323999844, 8.23992591587353, 3.85468182217415, 3.914699831900403, 4.212744691387731, 4.2058242485431565, 9.01945831300365, 4.500426763839512, 4.671698484446881, 4.491210108734587, 4.89209854981534, 5.008170721068094, 5.193573944399457, 5.122114980417312, 5.571640078319507, 5.627209929474318, 5.8867211249594025, 5.599039052507621, 5.627209929474318, 18.013512481847357, 5.850353480788527, 12.476238023594581, 6.18682571740974, 6.543500661348473, 6.474507789861521, 6.543500661348473, 6.879972897969686, 7.236647841908418, 7.390798521735676))"
7,rec.autos,7,"List(1986, acura, integra, speed, mile, posit, worst, ever, own, prelude, 160k, mile, sold, still, go, strong, religi, attention, maintenance, change, car, driven, exactli, manner, gone, clutch, underr, set, tire, realli, eat, tire, front, even, care, align, strut, start, leak, 2530k, mile, windshield, wiper, motor, burn, service, note, seek, stop, work, radio, mile, time, belt, constant, error, signal, comput)","List(1986, acura, integra, speed, mile, posit, worst, ever, own, prelude, 160k, mile, sold, still, go, strong, religi, attention, maintenance, change, car, driven, exactli, manner, gone, clutch, underr, set, tire, realli, eat, tire, front, even, care, align, strut, start, leak, 2530k, mile, windshield, wiper, motor, burn, service, note, seek, stop, work, radio, mile, time, belt, constant, error, signal, comput)","List(0, 7533, List(4, 5, 18, 34, 47, 51, 58, 82, 169, 178, 203, 204, 214, 268, 281, 313, 377, 407, 408, 427, 575, 591, 611, 652, 731, 864, 868, 876, 1031, 1286, 1391, 1457, 1471, 1572, 1713, 1940, 2132, 2190, 2231, 2337, 2897, 2916, 3174, 3504, 3738, 4016, 4724, 4872, 6103), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(4, 5, 18, 34, 47, 51, 58, 82, 169, 178, 203, 204, 214, 268, 281, 313, 377, 407, 408, 427, 575, 591, 611, 652, 731, 864, 868, 876, 1031, 1286, 1391, 1457, 1471, 1572, 1713, 1940, 2132, 2190, 2231, 2337, 2897, 2916, 3174, 3504, 3738, 4016, 4724, 4872, 6103), List(2.04131286861324, 2.18770800513066, 2.364727419509581, 2.606088421915507, 2.856408517808632, 2.821255513390736, 2.861789696713149, 3.229314656675947, 3.442765078784497, 3.7753862195036123, 3.6373805464841684, 3.614213487202634, 3.7102873172922566, 3.8072795832795663, 4.064564178546976, 4.119962957936765, 4.082691563139533, 4.212744691387731, 4.248084057833039, 4.035063514150278, 18.0389166260073, 4.500426763839512, 4.547829002734095, 4.751741192120417, 4.567437474122472, 4.89209854981534, 4.89209854981534, 9.92610057157525, 4.905891871947676, 5.08821342874163, 5.375895501193411, 5.270534985535585, 5.250732358239405, 5.31135698005584, 5.444888372680363, 5.716822088164005, 5.716822088164005, 5.815262160977257, 6.047063775034581, 6.004504160615785, 6.617608633502194, 6.138035553240308, 6.2381190117972904, 6.349344646907515, 6.40996926872395, 6.543500661348473, 6.697651341175731, 6.879972897969686, 7.390798521735676))"
7,rec.autos,8,"List(article, thunderbird, write, owner, motorhead, guru, know, mr2, engine, sound, noisi, mr2, engine, noisi, best, time, even, nice, nose, ugli, noise, assum, turbo, gruff, characterist, large, inline, doesnt, balance, shaft, guess, toyota, didnt, care, little, detail, like, brag, engine, configuration, flashi, style)","List(thunderbird, owner, motorhead, guru, mr2, engine, sound, noisi, mr2, engine, noisi, best, time, even, nice, nose, ugli, noise, assum, turbo, gruff, characterist, large, inline, doesnt, balance, shaft, guess, toyota, didnt, care, little, detail, brag, engine, configuration, flashi, style)","List(0, 7533, List(4, 18, 68, 79, 91, 108, 146, 203, 247, 248, 256, 420, 535, 548, 633, 1105, 1316, 1365, 1369, 1536, 1816, 2485, 3005, 3029, 3051, 3246, 3292, 5176, 6240), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(4, 18, 68, 79, 91, 108, 146, 203, 247, 248, 256, 420, 535, 548, 633, 1105, 1316, 1365, 1369, 1536, 1816, 2485, 3005, 3029, 3051, 3246, 3292, 5176, 6240), List(2.04131286861324, 2.364727419509581, 2.9383910902999952, 3.053507780903186, 3.107211959875047, 3.216411251840039, 3.3895443825795875, 3.6373805464841684, 3.653128903452308, 3.6452237239451946, 3.7889304446113696, 13.585792922418623, 4.428967799857366, 4.2625770651356065, 4.346276084012253, 5.17522480573126, 5.193573944399457, 5.493678536849795, 5.231314272382304, 5.250732358239405, 5.599039052507621, 6.18682571740974, 12.476238023594581, 6.2381190117972904, 6.138035553240308, 6.349344646907515, 6.292186233067566, 6.985333413627512, 7.103116449283895))"
7,rec.autos,9,"List(article, scott, warren, rosand, write, article, george, write, mani, year, school, final, graduat, get, real, course, tri, make, plan, spend, extra, money, right, accord, good, real, sporti, think, sell, year, drop, around, sport, kind, think, better, idea, keep, accord, drop, alway, want, corvette, stingray, reason, 8000, accord, 8000, corvette, 16000, le, spend, anyway, basic, think, late, earli, around, question, good, year, consid, reliabl, look, horsepow, order, believe, horsepow, main, concern, want, fast, motorcycle, good, price, also, insurance, look, like, male, single, might, wait, lower, insurance, fact, mainli, drive, lower, type, classic, rare, driven, insurance, class, drive, mile, year, vette, classic, insurance, basic, mean, restrict, amount, drive, time, basic, mean, cant, use, everi, probabl, suit, need, limit, mileage, addition, restrict, mileage, mani, classic, insurance, carrier, also, require, vehicle, garag)","List(scott, warren, rosand, george, mani, year, school, final, graduat, get, real, course, tri, plan, spend, extra, money, right, accord, good, real, sporti, sell, year, drop, around, sport, kind, better, idea, keep, accord, drop, alway, want, corvette, stingray, reason, 8000, accord, 8000, corvette, 16000, le, spend, anyway, basic, late, earli, around, question, good, year, consid, reliabl, look, horsepow, order, believe, horsepow, main, concern, want, fast, motorcycle, good, price, insurance, look, male, single, might, wait, lower, insurance, fact, mainli, drive, lower, type, classic, rare, driven, insurance, class, drive, mile, year, vette, classic, insurance, basic, mean, restrict, amount, drive, time, basic, mean, cant, use, everi, probabl, suit, need, limit, mileage, addition, restrict, mileage, mani, classic, insurance, carrier, require, vehicle, garag)","List(0, 7533, List(1, 4, 7, 8, 9, 10, 13, 16, 20, 23, 27, 42, 49, 53, 57, 61, 62, 76, 83, 90, 92, 93, 94, 100, 101, 102, 105, 131, 136, 144, 145, 171, 191, 200, 228, 249, 255, 297, 319, 324, 347, 358, 390, 428, 478, 484, 502, 507, 537, 571, 574, 575, 577, 609, 618, 649, 667, 712, 741, 769, 777, 791, 824, 976, 1260, 1265, 1599, 1713, 1972, 1976, 2021, 2063, 2161, 2627, 2642, 3360, 4240, 4484, 4712, 5832, 6833, 7489), List(1.0, 1.0, 2.0, 3.0, 4.0, 2.0, 1.0, 1.0, 1.0, 3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 3.0, 1.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0))","List(0, 7533, List(1, 4, 7, 8, 9, 10, 13, 16, 20, 23, 27, 42, 49, 53, 57, 61, 62, 76, 83, 90, 92, 93, 94, 100, 101, 102, 105, 131, 136, 144, 145, 171, 191, 200, 228, 249, 255, 297, 319, 324, 347, 358, 390, 428, 478, 484, 502, 507, 537, 571, 574, 575, 577, 609, 618, 649, 667, 712, 741, 769, 777, 791, 824, 976, 1260, 1265, 1599, 1713, 1972, 1976, 2021, 2063, 2161, 2627, 2642, 3360, 4240, 4484, 4712, 5832, 6833, 7489), List(1.8753556761989927, 2.04131286861324, 4.362624737788511, 6.92507848653131, 9.415383677288188, 4.502172370728556, 2.3208466504830008, 2.474473907110662, 2.4781436359996243, 8.952237823414269, 5.173554954004839, 5.577934474026199, 2.799051356079725, 2.8229841222913534, 2.934515116455302, 2.963957871272002, 2.8965598964548667, 3.023462602471798, 3.114132402719621, 3.082239038943667, 3.147076558438975, 6.214423919750094, 3.102624795868141, 3.2423867382432996, 3.183621429017048, 3.2062070816657973, 3.2112961511732685, 3.5442033216299853, 3.336119215906007, 3.468825185454362, 6.803628950342803, 3.4888258521610314, 3.5442033216299853, 3.614213487202634, 3.602828164977509, 3.874290293562526, 3.7060944390322206, 3.8450199112624133, 3.8993542622257427, 4.000774440671646, 11.985516555368928, 4.012073995925579, 4.139132874044485, 4.139132874044485, 12.476032410352365, 4.226730933362471, 24.06555069248355, 4.3383709045051395, 4.370373635591314, 4.322745586602059, 4.330527727044114, 4.509729156501825, 4.528597640806208, 8.77353489073398, 4.618209799495895, 4.567437474122472, 4.639263208693727, 4.482077625171315, 9.15477560995128, 4.660769413914691, 9.365496641266931, 4.55758517767946, 5.105020547058012, 4.751741192120417, 5.08821342874163, 5.139506723129181, 5.3539165944746365, 5.444888372680363, 11.700706961577055, 11.773442249918805, 17.245712359435753, 5.599039052507621, 5.815262160977257, 6.047063775034581, 6.18682571740974, 6.2381190117972904, 14.20623289856779, 14.20623289856779, 6.697651341175731, 14.473295683816836, 7.236647841908418, 7.390798521735676))"
7,rec.autos,10,"List(craig, powderkeg, deforest, write, youre, plan, make, long, drive, 20w50, probabl, fine, summer, 10w40, youre, make, short, drive, stick, 10w40)","List(craig, powderkeg, deforest, youre, plan, long, drive, 20w50, probabl, fine, summer, 10w40, youre, short, drive, stick, 10w40)","List(0, 7533, List(23, 90, 98, 159, 305, 428, 520, 842, 1009, 1255), List(2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","List(0, 7533, List(23, 90, 98, 159, 305, 428, 520, 842, 1009, 1255), List(5.968158548942846, 3.082239038943667, 3.1446870710415933, 6.904916415980313, 3.8402237389989202, 4.139132874044485, 4.2625770651356065, 4.81311013849671, 4.934062748914372, 5.157206300228582))"


In th above cell we have the output of IDF applied to count vectorizer (Term frequency(TF)) which as a result means TFIDF.

So till now we have made features using TFIDF and now we will fit a model (LDA) on this model.<br>
The model will try to find some prominent topics (terms) from the corpus

In [42]:
rescaledData.printSchema()

In [43]:
# We are choosing two columns only as only they are necessary for model training.
# Actually for model training only "features" column is needed, but to
# trace back the numeric features to their text form we will require ID column.

# We are doing caching so that it increases speed when we are running process
# iteratively
corpus = rescaledData.select("id", "features").cache()

In [44]:
corpus.show(5)

LDA (Latent Dirichlet Allocation) is a topic model and is used to classify text in a document to a particular topic.

In [46]:
from pyspark.ml.clustering import LDA

In [47]:
# k - number of toics that we want
# We specifically chose 20 because our news groups
# dataset too has 20  categories
lda = LDA(k=20, maxIter=50, optimizer="em")
model = lda.fit(corpus)

In [48]:
# In the case of topic modelling these measures are just indicative 
# and doesnt tell much. To check the performance of the model you have to go for 
# manual process of pre labelled dataset
ll = model.logLikelihood(corpus)
lp = model.logPerplexity(corpus)

In [49]:
print("The lower bound on the log likelihood of the entire column: ", str(ll))
print("The upper bound on Perplexity: ", str(lp))

In [50]:
# We are trying to know what 20 topics the model predicted
topicwords = 20
topics = model.describeTopics(topicwords)
print("The topics described by their top weighted terms: ")
topics.show(truncate=False)

We will not be able to understand what the above data meant because there is a lot of information. What we will do is convert the numbers back to the words using the vocabulary that we saved

In [52]:
# We are converting the topics model object into an rdd function
tRDD = topics.rdd.map(list)

In [53]:
topics.printSchema()

Explaination of above schema: <br>
* topic: An Integer corresponding to the predicted topics indexed from 0-19
* termIndices: The indices of the words in the vocabulary
* termWeights: Weights corresponding to each word

In [55]:
# This will give the same output but with better visualization
tRDD.collect()

In [56]:
# converting indices to words

def topic_vocab(topic):
  topicNum = topic[0]
  terms = topic[1]
  weights = topic[2]
  result = []
  for i in range(topicwords):
    term = vocab[terms[i]]
    out = str(topicNum) + ", " + term + ", " + str(weights[i])
    result.append(out)
  return result

In [57]:
# Here it is showing most prominent words for each topic with their respective 
# weights. We will process below to make a dataframe and visualize better
tRDD.map(lambda topic: topic_vocab(topic)).collect()

In [58]:
topic = tRDD.map(lambda topic: topic_vocab(topic))

In [59]:
ng_pd = spark.createDataFrame(topic).toPandas()

In [60]:
ng_pd.head()

Unnamed: 0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12,_13,_14,_15,_16,_17,_18,_19,_20
0,"0, game, 0.011325368448791633","0, appear, 0.01080876424342009","0, april, 0.008826802461067353","0, pitch, 0.007728737402167395","0, roger, 0.007071707744826356","0, excel, 0.006901911097531067","0, home, 0.006838862772939392","0, last, 0.006819649976655621","0, basebal, 0.006035450254749769","0, good, 0.0056678035779175896","0, vote, 0.005381888385633295","0, miss, 0.005368310795903575","0, team, 0.005212347756998329","0, score, 0.005205179721519589","0, brave, 0.0051732085033005795","0, year, 0.005144689237371996","0, run, 0.005109685104831799","0, smith, 0.005018402468347846","0, cover, 0.004850221067366024","0, first, 0.004804354367227362"
1,"1, information, 0.010312125922297716","1, radio, 0.008345993683624299","1, office, 0.0074059069118285305","1, station, 0.007353996718443074","1, sport, 0.006981698268507522","1, organization, 0.006523589744169731","1, open, 0.006275105550652445","1, time, 0.006219767797972115","1, local, 0.006147906655092894","1, call, 0.006028440129298091","1, philadelphia, 0.005841240407518014","1, show, 0.005840668312681341","1, listen, 0.005815006179513005","1, search, 0.00577125328527287","1, contact, 0.005471867433676622","1, internet, 0.005325139844659035","1, street, 0.005302842624714736","1, robert, 0.005011747636015506","1, cycle, 0.0049667205024743425","1, citi, 0.004927657286065697"
2,"2, post, 0.016227354268054116","2, research, 0.01028877862936088","2, message, 0.009664328632267643","2, mail, 0.009511409947485151","2, center, 0.007759458610704467","2, copi, 0.007596051930290691","2, subject, 0.007485136604625999","2, product, 0.007137844204934834","2, request, 0.007035849117888377","2, greek, 0.0068714015109429495","2, server, 0.006848545797542039","2, response, 0.006766537216292625","2, interest, 0.00655348651438111","2, share, 0.006285517077096784","2, question, 0.006158533324155794","2, turkey, 0.005881491865183558","2, read, 0.005296042881221564","2, support, 0.005294202884064746","2, respond, 0.005179240244588465","2, user, 0.005010194669277551"
3,"3, govern, 0.016545758141229324","3, state, 0.010374910938079464","3, right, 0.008595869934207368","3, people, 0.00791058790007342","3, clinton, 0.007592327137508553","3, weapon, 0.006821234081079273","3, bill, 0.006707588851964443","3, crime, 0.00641827495705468","3, arm, 0.006268740514478087","3, gun, 0.006266482557870807","3, firearm, 0.006190774808285837","3, legal, 0.0061683808871629865","3, protect, 0.006120983860793481","3, citizen, 0.0060009501232036275","3, court, 0.005390306957701171","3, encryption, 0.005347258083080284","3, police, 0.0053159481590041874","3, feder, 0.00530714515072492","3, public, 0.005303364763165496","3, amend, 0.005128221855183646"
4,"4, moral, 0.010411543319654633","4, isra, 0.010224396072219198","4, value, 0.00887814700560107","4, israel, 0.008581556926932606","4, homosexu, 0.008214244355365058","4, white, 0.008127450680971421","4, objective, 0.007106230812651275","4, human, 0.006462629568735422","4, keith, 0.006430258215927984","4, male, 0.005636838687949224","4, women, 0.005252219948917281","4, population, 0.005177230641121537","4, sexual, 0.005168306872828175","4, frank, 0.005165557426239938","4, palestinian, 0.004995107252156121","4, wave, 0.0048401578097394945","4, child, 0.004834394671599112","4, people, 0.004754126311209977","4, noth, 0.004161081774345173","4, nazi, 0.004156550774811518"


In [61]:
# Here from topic 0-19 you can see the most prominent words and their weightage
ng_pd.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
_1,"0, game, 0.011325368448791633","1, information, 0.010312125922297716","2, post, 0.016227354268054116","3, govern, 0.016545758141229324","4, moral, 0.010411543319654633","5, move, 0.012293104216216611","6, sale, 0.012334425806344826","7, drive, 0.026227826960044127","8, card, 0.024762453824571578","9, space, 0.019433782822874224","10, science, 0.011737226610673683","11, maxaxaxaxaxaxaxaxaxaxaxaxaxaxax, 0.3615535...","12, bike, 0.014195459650417632","13, christian, 0.016728021572092917","14, fire, 0.01063866866200551","15, kill, 0.010572842991795159","16, chip, 0.009986183225199752","17, game, 0.018049362406292138","18, list, 0.0170255395768539","19, window, 0.03209075858808131"
_2,"0, appear, 0.01080876424342009","1, radio, 0.008345993683624299","2, research, 0.01028877862936088","3, state, 0.010374910938079464","4, isra, 0.010224396072219198","5, printer, 0.010814704289043683","6, offer, 0.011240922574939787","7, comput, 0.0149481443927998","8, monitor, 0.015017032465456537","9, launch, 0.006643369409879617","10, realli, 0.007769063243555803","11, 1993, 0.02199215115321189","12, light, 0.010535001941835048","13, jesu, 0.010733078104181022","14, level, 0.010403724097702016","15, food, 0.008498084259216375","16, clipper, 0.008935182713336242","17, team, 0.01757392600659536","18, point, 0.01100391876338351","19, file, 0.022234621052889533"
_3,"0, april, 0.008826802461067353","1, office, 0.0074059069118285305","2, message, 0.009664328632267643","3, right, 0.008595869934207368","4, value, 0.00887814700560107","5, player, 0.008724944546518065","6, wonder, 0.010965358172214723","7, disk, 0.013657353233559273","8, video, 0.012989555054538773","9, develop, 0.006588679886131246","10, rate, 0.007658284775116617","11, previou, 0.013872698631648318","12, engine, 0.008753601155224654","13, religion, 0.007881545657787313","14, build, 0.010309946066913074","15, effect, 0.008380145413327985","16, key, 0.008691545712729775","17, play, 0.012480669031860377","18, answer, 0.008350039848313744","19, program, 0.019782061398803903"
_4,"0, pitch, 0.007728737402167395","1, station, 0.007353996718443074","2, mail, 0.009511409947485151","3, people, 0.00791058790007342","4, israel, 0.008581556926932606","5, better, 0.008320090575311398","6, picture, 0.00995740356260069","7, data, 0.012498096104376154","8, color, 0.010629757674661192","9, cost, 0.006255280982788616","10, theori, 0.00697777744908929","11, cub, 0.011080086995279957","12, car, 0.008729580538931327","13, bible, 0.007629273028231969","14, switch, 0.008395890661297636","15, doctor, 0.007322889294043946","16, house, 0.008644440651460607","17, trade, 0.009958390891745458","18, dont, 0.008235991970313494","19, display, 0.010217485151925991"
_5,"0, roger, 0.007071707744826356","1, sport, 0.006981698268507522","2, center, 0.007759458610704467","3, clinton, 0.007592327137508553","4, homosexu, 0.008214244355365058","5, didnt, 0.008147877288140008","6, black, 0.009713111932790473","7, system, 0.01141957567264433","8, mode, 0.010459086816939711","9, moon, 0.005787496874645134","10, koresh, 0.006142845070681095","11, suck, 0.010584745197272727","12, front, 0.008273434860056456","13, church, 0.007206046160754015","14, work, 0.008236977456945507","15, pain, 0.007312579821757861","16, escrow, 0.006977609015826093","17, hockey, 0.009188400144920665","18, youre, 0.007241330746788356","19, application, 0.009877245807490106"
_6,"0, excel, 0.006901911097531067","1, organization, 0.006523589744169731","2, copi, 0.007596051930290691","3, weapon, 0.006821234081079273","4, white, 0.008127450680971421","5, away, 0.008115515552524303","6, driver, 0.009401819402508865","7, control, 0.011197985776575525","8, board, 0.009153174262519442","9, oper, 0.005543376062920493","10, little, 0.0060324030637526485","11, mb8f, 0.008214340625096262","12, ride, 0.007732434631491569","13, believe, 0.0067170898756197675","14, signal, 0.007316609672535587","15, disease, 0.007293986403293804","16, cool, 0.0069064347065883516","17, michael, 0.009154374690188816","18, people, 0.0071802209889730885","19, version, 0.009471721940252061"
_7,"0, home, 0.006838862772939392","1, open, 0.006275105550652445","2, subject, 0.007485136604625999","3, bill, 0.006707588851964443","4, objective, 0.007106230812651275","5, pick, 0.0077632158981825045","6, tape, 0.009265817978407101","7, hard, 0.010526792993960756","8, batteri, 0.008184503925552486","9, nasa, 0.005456831905815464","10, arent, 0.005585717313361117","11, r186, 0.006894009196671844","12, turn, 0.006796774464903803","13, word, 0.006658415628987955","14, inside, 0.007306848525626883","15, cause, 0.006979931867202534","16, andrew, 0.006773204891474485","17, year, 0.008574078119931373","18, thing, 0.006726354861194455","19, image, 0.009430335404783868"
_8,"0, last, 0.006819649976655621","1, time, 0.006219767797972115","2, product, 0.007137844204934834","3, crime, 0.00641827495705468","4, human, 0.006462629568735422","5, face, 0.007086825789624337","6, price, 0.009124195765988302","7, machine, 0.010117849767064351","8, work, 0.008085101053185372","9, news, 0.005161188200623939","10, understand, 0.005566849681854081","11, ryan, 0.006676541434189249","12, road, 0.006671599838623786","13, christ, 0.00636411094770183","14, design, 0.007232536080814378","15, death, 0.006624497723309861","16, instruction, 0.00661640203443218","17, mike, 0.008063189849383185","18, absolute, 0.0064910360320561845","19, use, 0.008847375944699394"
_9,"0, basebal, 0.006035450254749769","1, local, 0.006147906655092894","2, request, 0.007035849117888377","3, arm, 0.006268740514478087","4, keith, 0.006430258215927984","5, mouse, 0.006926636648660995","6, thank, 0.008890101980643907","7, email, 0.00976544039810096","8, problem, 0.0074042432348572565","9, fund, 0.005095577079546128","10, example, 0.005513540582964493","11, maxaxaxaxaxaxaxaxaxax, 0.006062961412109421","12, mile, 0.005966538469083406","13, life, 0.006009663024549756","14, ground, 0.006981600941553612","15, children, 0.006503774932786468","16, risc, 0.005629300049633296","17, playoff, 0.007627982233599213","18, question, 0.006395576613959762","19, code, 0.008573574730009949"
_10,"0, good, 0.0056678035779175896","1, call, 0.006028440129298091","2, greek, 0.0068714015109429495","3, gun, 0.006266482557870807","4, male, 0.005636838687949224","5, wing, 0.006924873041243963","6, anyone, 0.008513623619574752","7, software, 0.009159156699203001","8, support, 0.007156370265820084","9, shot, 0.004866939433080331","10, look, 0.00527420565770359","11, archivename, 0.006016374778454571","12, motorcycle, 0.005793387449458345","13, atheist, 0.006005447602783588","14, circuit, 0.0068962945652583235","15, medic, 0.005877642424415368","16, trust, 0.00540332224032617","17, season, 0.007563972597552037","18, helmet, 0.006210209220437423","19, run, 0.0083109886485012"


This is the output that we have. Now if you notice then you will find that the similar data words are together. These words will hint at the topic of the words. The 20 prominent topics in the dataset. We have to manually do the performance checking for these type of models.