In [3]:
import datasets as ds
import pandas as pd
import os
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [4]:
CUR_DIR = os.path.abspath(os.curdir)
ROOT_DIR = os.path.dirname(CUR_DIR)
IMAGES_DIR = os.path.join(ROOT_DIR, "images")
DATA_DIR = os.path.join(ROOT_DIR, "data")
DATA_DIR

'/home/jovyan/projects/vector-nlp/data'

In [3]:
#ds.list_datasets()

In [5]:
app_dataset = ds.load_dataset('app_reviews',split='train')
app_df = app_dataset.to_pandas()

Using custom data configuration default
Reusing dataset app_reviews (/home/jovyan/.cache/huggingface/datasets/app_reviews/default/0.0.0/20335b51b604b9bc04b7be253cd8445caa9ba93f15f39a4b0492b9e9102853de)


In [6]:
app_dataset.homepage

'https://giograno.me/assets/pdf/workshop/wama17.pdf'

In [7]:
porter_stemmer = PorterStemmer()
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

app_df['review_stem'] = app_df['review'].apply(stem_sentences)

In [8]:
app_df.head()

Unnamed: 0,package_name,review,date,star,review_stem
0,com.mantz_it.rfanalyzer,Great app! The new version now works on my Bra...,October 12 2016,4,great app! the new version now work on my brav...
1,com.mantz_it.rfanalyzer,Great It's not fully optimised and has some is...,August 23 2016,4,great it' not fulli optimis and ha some issu w...
2,com.mantz_it.rfanalyzer,Works on a Nexus 6p I'm still messing around w...,August 04 2016,5,work on a nexu 6p i'm still mess around with m...
3,com.mantz_it.rfanalyzer,The bandwidth seemed to be limited to maximum ...,July 25 2016,3,the bandwidth seem to be limit to maximum 2 mh...
4,com.mantz_it.rfanalyzer,Works well with my Hackrf Hopefully new update...,July 22 2016,5,work well with my hackrf hope new updat will a...


In [10]:
vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english',ngram_range = (1,2))
doc_term_matrix = vect.fit_transform(app_df.review_stem)

In [18]:
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [19]:
import random

for i in range(10):
    random_id = random.randint(0,len(vect.get_feature_names()))
    print(vect.get_feature_names()[random_id])



easi perfect
emulator howev
tabloid
releas just
ha essenti
look online
closing
plz upload
doe control
ois


In [20]:
first_topic = LDA.components_[0]

In [21]:
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['nice', 'app', 'veri nice', 'easi', 'amaz', 'veri', 'work', 'ok', 'great', 'love']


Top 10 words for topic #1:
['phone', 'hi', 'use', 'wow', 'work', 'best app', 'thi', 'updat', 'app', 'best']


Top 10 words for topic #2:
['happi', 'awesom app', 'slow', 'good good', 'app', 'veri', 'good app', 'veri good', 'awesom', 'good']


Top 10 words for topic #3:
['superb', 'work', 'bad', 'perfect', 'game', 'app', 'nice app', 'use', 'super', 'nice']


Top 10 words for topic #4:
['googl', 'great app', 'use', 'great', 'excel', 'cool', 'thi', 'thank', 'app', 'like']




In [22]:
topic_values = LDA.transform(doc_term_matrix)
topic_values.shape

(288065, 5)

In [23]:
app_df['Topic'] = topic_values.argmax(axis=1)

In [24]:
app_df.to_csv(os.path.join(DATA_DIR, "1.1_LDA-Tfidf.csv"),index = False)