In [38]:
#***********************************************************************
#    Importing all necessary modules to run ML-based QAFE framework 
#***********************************************************************
import emoji
import re
import nltk
import pandas as pd
from six import python_2_unicode_compatible
from nltk.corpus import stopwords
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn import preprocessing
from nltk.corpus import wordnet

#importing modules for Topic Modeling 
from sklearn.feature_extraction.text import CountVectorizer as countVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import GridSearchCV
import numpy as np
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.lang.en import English
import gensim 

In [39]:
#***********************************************************************
# Step 0: Inserting Android Apps User Reviews as input of the framework 
#***********************************************************************


## We have Executed this same codes into 3 different dataset ; uncomment each dataset to test individual results

#originalSource = pd.read_csv("Reviews1_TopFreeAndroidGames.csv") #2265 User Remarks

#originalSource = pd.read_csv("Reviews2_TopGrossingAndroidGames.csv") #3960 User Remarks

originalSource = pd.read_csv("Reviews3_TopPaidAndroidGames.csv") #2198 User Remarks

In [40]:
#******************************************************************
# Step 1 : Pre-Processing User Reviews 
#******************************************************************

def unwantedCharacters_removing(text):
    text = re.sub('[^A-Za-z]+', ' ', text)
    text = re.sub(emoji.get_emoji_regexp(), r"", text)
    return text

def stopwords_removing(sentence):
    tokens = sentence.split(" ")
    text_tokens = word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    tokens_filtered= [word for word in text_tokens if not word in stop_words]
    return (" ").join(tokens_filtered)
    
def noise_removing(text):
    blob = TextBlob(text)
    string = ' '.join([word for (word,tag) in blob.tags if tag == "NN" or tag == "NNS" or tag == "NNP" or tag == "NNPS" or tag == "JJ" or tag == "JJR" or tag == "JJS" or tag == "RB" or tag == "RBR" or tag == "RBS" or tag == "VB" or tag == "VBG" or tag == "VBD" or tag == "VBN" or tag == "VBP" or tag == "VPZ" or tag == "ADV"or tag == "VERB"])
    return string

def words_lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    word_list = nltk.word_tokenize(text)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return lemmatized_output



#initial cleaning of Reviews 
df_initialProcessing= pd.DataFrame(originalSource['Remarks']).dropna()

df_initialProcessing['Cleaned_Reviews'] = df_initialProcessing['Remarks'].apply(unwantedCharacters_removing)
df_initialProcessing['Stopwords_Removed'] = df_initialProcessing['Cleaned_Reviews'].apply(stopwords_removing)
df_initialProcessing['Noise_Removed'] = df_initialProcessing['Stopwords_Removed'].apply(noise_removing)
df_initialProcessing['Words_Lemmatized'] = df_initialProcessing['Noise_Removed'].apply(words_lemmatizing)

df_initialProcessing

Unnamed: 0,Remarks,Cleaned_Reviews,Stopwords_Removed,Noise_Removed,Words_Lemmatized
0,Fun Game would like to know how the CPU is dec...,Fun Game would like to know how the CPU is dec...,Fun Game would like know CPU decided dice roll...,Fun Game like know CPU decided dice roll overa...,Fun Game like know CPU decided dice roll overa...
1,The bots are at an unfair advantage! The game ...,The bots are at an unfair advantage The game i...,The bots unfair advantage The game well made u...,bots unfair advantage game well made unfair ev...,bot unfair advantage game well made unfair eve...
2,"Absolutely LOVE this game, but it has not been...",Absolutely LOVE this game but it has not been ...,Absolutely LOVE game working past couple weeks...,Absolutely LOVE game working past couple weeks...,Absolutely LOVE game working past couple week ...
3,I've been unable to even get into the game for...,I ve been unable to even get into the game for...,I unable even get game weeks I sent reports he...,unable even get game weeks sent reports heard ...,unable even get game week sent report heard no...
4,It's a good game but doesn't have the live vid...,It s a good game but doesn t have the live vid...,It good game live video like advertising says ...,good game live video advertising Live video pl...,good game live video advertising Live video pl...
...,...,...,...,...,...
2193,Its a fun time passer but considering im payin...,Its a fun time passer but considering im payin...,Its fun time passer considering im paying play...,fun time passer considering im paying play id ...,fun time passer considering im paying play id ...
2194,Nothing like getting back to the classic Fruit...,Nothing like getting back to the classic Fruit...,Nothing like getting back classic Fruit Ninja ...,Nothing getting back classic Fruit Ninja alway...,Nothing getting back classic Fruit Ninja alway...
2195,Bought the chainsaw sword and requested a refu...,Bought the chainsaw sword and requested a refu...,Bought chainsaw sword requested refund glitch ...,Bought chainsaw sword requested refund glitch ...,Bought chainsaw sword requested refund glitch ...
2196,This game is better than the free one. I bough...,This game is better than the free one I bought...,This game better free one I bought blades lost...,game better free bought blades lost abilities ...,game better free bought blade lost ability ver...


In [41]:
#**********************************
#Step 2: Apply Sentimental Analysis
#**********************************

# function to calculate subjectivity
def getSubjectivity(review): 
    return TextBlob(review).sentiment.subjectivity

# function to calculate polarity
def getPolarity(review):
        return TextBlob(review).sentiment.polarity 

# function to analyze the reviews
def analysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
    

#applying sentiment Analysis on Cleaned Reviews
df_sentimentAnalysis = pd.DataFrame()
df_sentimentAnalysis['Words_lemmatized'] = df_initialProcessing['Words_Lemmatized']
df_sentimentAnalysis['Subjectivity_score'] = df_initialProcessing['Words_Lemmatized'].apply(getSubjectivity)
df_sentimentAnalysis['Polarity_score'] = df_initialProcessing['Words_Lemmatized'].apply(getPolarity) 
df_sentimentAnalysis['Sentiment_analysis'] = df_sentimentAnalysis['Polarity_score'].apply(analysis)

df_sentimentAnalysis[["Words_lemmatized","Subjectivity_score","Polarity_score","Sentiment_analysis"]]


Unnamed: 0,Words_lemmatized,Subjectivity_score,Polarity_score,Sentiment_analysis
0,Fun Game like know CPU decided dice roll overa...,0.288889,-0.033333,Negative
1,bot unfair advantage game well made unfair eve...,0.765476,-0.117857,Negative
2,Absolutely LOVE game working past couple week ...,0.312500,-0.037500,Negative
3,unable even get game week sent report heard no...,0.540000,-0.540000,Negative
4,good game live video advertising Live video pl...,0.382418,0.067532,Positive
...,...,...,...,...
2193,fun time passer considering im paying play id ...,0.200000,0.250000,Positive
2194,Nothing getting back classic Fruit Ninja alway...,0.436667,0.226667,Positive
2195,Bought chainsaw sword requested refund glitch ...,0.287500,-0.012500,Negative
2196,game better free bought blade lost ability ver...,0.460000,0.180000,Positive


In [42]:
#*********************************************************************************************
#Step 3: Separating Negative Reviews for Analysing what are users Negative Remarks abouts Apps
#*********************************************************************************************

df_NegativeReviews = pd.DataFrame(df_sentimentAnalysis.query("Sentiment_analysis == 'Negative'"))

#Seperating negative Reviews with Subjectivity greater than 0.6 
df_mostExpressedNegativeReviews = pd.DataFrame(df_NegativeReviews.query("Subjectivity_score >= 0.6"))

df_mostExpressedNegativeReviews

Unnamed: 0,Words_lemmatized,Subjectivity_score,Polarity_score,Sentiment_analysis
1,bot unfair advantage game well made unfair eve...,0.765476,-0.117857,Negative
16,put star going jail time row statistically imp...,0.691111,-0.223333,Negative
21,play free late Microtransactions EVERYTHING no...,0.633333,-0.133333,Negative
106,Neat adventure game thing need solve cleverly ...,0.658333,-0.033333,Negative
131,install final fantasy Google Chromecast Androi...,0.640000,-0.060000,Negative
...,...,...,...,...
2110,playing hundred mobile game always favorite mi...,0.634000,-0.016500,Negative
2114,Unplayable zoomed way Would previously given s...,0.633333,-0.066667,Negative
2130,great game classic many released today Unfortu...,0.602778,-0.005556,Negative
2137,game still fantastic however control atrocious...,0.720000,-0.380000,Negative


In [43]:
#*********************************************************************************************
#Step 4: Separating Positive Reviews for Analysing what are users Positive Remarks abouts Apps
#*********************************************************************************************

df_PositiveReviews = pd.DataFrame(df_sentimentAnalysis.query("Sentiment_analysis == 'Positive'"))

#Seperating negative Reviews with Subjectivity greater than 0.6 
df_mostExpressedPositiveReviews = pd.DataFrame(df_PositiveReviews.query("Subjectivity_score >= 0.6"))

df_mostExpressedPositiveReviews

Unnamed: 0,Words_lemmatized,Subjectivity_score,Polarity_score,Sentiment_analysis
8,online part game perfect playing singleplayer ...,0.693750,0.185417,Positive
40,level hard goal work AND make level full versi...,0.605952,0.176190,Positive
43,Not happy app Asks ace medium decline app work...,1.000000,0.300000,Positive
55,Sad say work app longer playable IOS supported...,0.600000,0.100000,Positive
61,Theres lot glitch memory tournament saving cool,0.650000,0.350000,Positive
...,...,...,...,...
2167,Great Graphics Start stop whenever want Don on...,0.916667,0.466667,Positive
2170,Controls bit iffy work made amazing job GIANTS...,0.658974,0.466667,Positive
2185,beautiful game Doesn offer friggin ad Wonderfu...,0.800000,0.462500,Positive
2187,love game much really free version ad feel ori...,0.614286,0.192857,Positive


In [44]:
#*******************************************************************************************************************
#Step 5: Discovering Insights from User Reviews using Topic Modeling technique with LDA
#*******************************************************************************************************************
 
vectorizer = countVectorizer(analyzer='word',       
                             min_df=10,
                             stop_words='english',             
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}') 
data_vectorized = vectorizer.fit_transform(df_mostExpressedNegativeReviews['Words_lemmatized'])

lda_model = LatentDirichletAllocation(n_components=20,
                                      max_iter=10,
                                      learning_method='online',
                                      random_state=100,
                                      batch_size=128,
                                      evaluate_every = -1,
                                      n_jobs = -1)

lda_output = lda_model.fit_transform(data_vectorized)


LatentDirichletAllocation(batch_size=128, 
                          doc_topic_prior=None,
                          evaluate_every=-1, 
                          learning_decay=0.7,
                          learning_method="online",
                          learning_offset=10.0,
                          max_doc_update_iter=1897, 
                          max_iter=10, 
                          mean_change_tol=0.001,
                          n_components=10, 
                          n_jobs=-1, 
                          perp_tol=0.1,
                          random_state=100,
                          topic_word_prior=None,
                          total_samples=1000000.0, 
                          verbose=0)

# Define Search Param
search_params = {'n_components': [10, 20], 'learning_decay': [0.5, 0.9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=10, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)


GridSearchCV(cv=None, 
             error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=1897, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
                                                 verbose=0),
        n_jobs=1,
       param_grid={'n_components': [10, 20], 
                   'learning_decay': [0.5, 0.9]},
             pre_dispatch='2*n_jobs', 
             refit=True, 
             return_train_score='warn',
             scoring=None,
             verbose=0)

best_lda_model = model.best_estimator_

# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
docnames = ["Doc" + str(i) for i in range(len(df_mostExpressedNegativeReviews['Words_lemmatized']))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic["dominant_topic"] = dominant_topic

# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()
#print("***")
#print(vectorizer.get_feature_names())


# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

def tagging(topic_keywords):
    for key in topic_keywords:
        value = list(filter(None, topic_keywords[key])) # Get rid of empty items
        Tagged = nltk.pos_tag(value)
        return Tagged

def labelTopics(text):
    mylist = text
    mylist = unwantedCharacters_removing(mylist)
    mylist = stopwords_removing(mylist)
    mylist = list(mylist.split(" "))
    length = len(mylist)
    topic = []

#******************************************************************************************************************
##Step 6:                                      RBLSALT Automated Labelling Algorthim 
#******************************************************************************************************************

    # A = Noun
    # B = Verb
    # C = Adjective 

    for i in range(length):
        doc = nlp(mylist[i])
      

        ####Rule 1: if A => B => C 
        # Where Noun is followed by either Noun or Verb or Adjective

        if(doc[0].tag_ == 'NN'): # if the elemnet is Noun
            if(i <= (length - 2)):
                doc2 = nlp(mylist[i+1]) # find the adjacent element 

             ##Rule 1.1: if A => A
            if(doc2[0].tag_ == 'NN'): # if adjacent element is also Noun 
                topic.append(mylist[i]) # Then both of the elements are independ elements

             ##Rule 1.2: if A => B    
            if(doc2[0].tag_ == 'VB'): # if adjacent elemnt is verb 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Noun + Verb (Game Stop)
                topic.append(string)

             ##Rule 1.3: if A => C 
            if(doc2[0].tag_ == 'JJ'): # if adjacent elemnt is Adjective 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Noun + Adjective 
                topic.append(string)


        ####Rule 2: if B => A => C
         # Where Verb is followed by either Noun or Verb or Adjective

        elif(doc[0].tag_ == 'VB'): # if the elemnet is Verb
            if(i <= (length - 2)):
                doc2 = nlp(mylist[i+1])  # find the adjacent element 

             ##Rule 2.1: if B => A
            if(doc2[0].tag_ == 'NN'): # if adjacent element is Noun 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Verb + Noun 
                topic.append(string) 

             ##Rule 2.2: if B => B    
            if(doc2[0].tag_ == 'VB'): # if adjacent elemnt is also verb 
                topic.append(mylist[i]) # Then both of the elements are independ element

             ##Rule 2.3: if B => C    
            if(doc2[0].tag_ == 'JJ'): # if adjacent elemnt is Adjective 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Verb + Adjective
                topic.append(string)
                
                

         ####Rule 3: if C => A => B
         # Where Adjective is followed by either Noun or Verb or Adjective

        elif(doc[0].tag_ == 'JJ'): # if the elemnet is Adjective
            if(i <= (length - 2)):
                doc2 = nlp(mylist[i+1]) # find the adjacent element
                    
             ##Rule 3.1: if C => A
            if(doc2[0].tag_ == 'NN'): # if adjacent element is Noun
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Adjective + Noun 
                topic.append(string) 

             ##Rule 3.2: if C => B
            if(doc2[0].tag_ == 'VB'): # if adjacent elemnt is verb 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Adjective + Verb 
                topic.append(string)

             ##Rule 3.3: if C => C
            if(doc2[0].tag_ == 'JJ'): # if adjacent elemnt is also adjective  
                topic.append(mylist[i]) # Then both of the elements are independ element
       
    return topic

#********************************* RBLSALT ends here **********************************************

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)   

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

values = ' '.join([str(i) for i in df_topic_keywords.values.tolist()])

topic1_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic2_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic3_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic4_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic5_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic6_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic7_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic8_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic9_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic10_values = values.split("]",1)[0]


topic_1 = labelTopics(topic1_values)
topic_2 = labelTopics(topic2_values)
topic_3 = labelTopics(topic3_values)
topic_4 = labelTopics(topic4_values)
topic_5 = labelTopics(topic5_values)
topic_6 = labelTopics(topic6_values)
topic_7 = labelTopics(topic7_values)
topic_8 = labelTopics(topic8_values)
topic_9 = labelTopics(topic9_values)
topic_10 = labelTopics(topic10_values)



Topics = [topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10]
    
df_topic_keywords['Potential_keywords_for_labelling_terms']= Topics
print(df_topic_keywords)
print(topic_1)
print(topic_2)
print(topic_3)
print(topic_4)
print(topic_5)
print(topic_6)
print(topic_7)
print(topic_8)
print(topic_9)
print(topic_10)




         Word 0      Word 1      Word 2   Word 3   Word 4     Word 5  \
Topic 0    game       thing      really     star     hard       love   
Topic 1    hard     control  impossible  version     play      phone   
Topic 2    game     playing     control     play   mobile       love   
Topic 3    game      boring        good     star     love    control   
Topic 4    time       phone         far  control  version  difficult   
Topic 5  mobile   difficult     version     play     game        way   
Topic 6     fix  impossible        star     game     good       time   
Topic 7     way        play        game    phone  control  difficult   
Topic 8     far        play     control     game     good    version   
Topic 9  really        love      mobile  playing      fix    control   

             Word 6   Word 7      Word 8   Word 9  \
Topic 0   difficult     time      boring  version   
Topic 1        time      way        love   boring   
Topic 2       phone     good  impossible     tim

In [45]:
# Again performing this step for Positive Remarks
#*******************************************************************************************************************
# n Step 5: Discovering Insights from User Reviews using Topic Modeling technique with LDA
#*******************************************************************************************************************
 
vectorizer = countVectorizer(analyzer='word',       
                             min_df=10,
                             stop_words='english',             
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}') 
data_vectorized = vectorizer.fit_transform(df_mostExpressedPositiveReviews['Words_lemmatized'])

lda_model = LatentDirichletAllocation(n_components=20,
                                      max_iter=10,
                                      learning_method='online',
                                      random_state=100,
                                      batch_size=128,
                                      evaluate_every = -1,
                                      n_jobs = -1)

lda_output = lda_model.fit_transform(data_vectorized)


LatentDirichletAllocation(batch_size=128, 
                          doc_topic_prior=None,
                          evaluate_every=-1, 
                          learning_decay=0.7,
                          learning_method="online",
                          learning_offset=10.0,
                          max_doc_update_iter=1897, 
                          max_iter=10, 
                          mean_change_tol=0.001,
                          n_components=10, 
                          n_jobs=-1, 
                          perp_tol=0.1,
                          random_state=100,
                          topic_word_prior=None,
                          total_samples=1000000.0, 
                          verbose=0)

# Define Search Param
search_params = {'n_components': [10, 20], 'learning_decay': [0.5, 0.9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=10, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)


GridSearchCV(cv=None, 
             error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=1897, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
                                                 verbose=0),
        n_jobs=1,
       param_grid={'n_components': [10, 20], 
                   'learning_decay': [0.5, 0.9]},
             pre_dispatch='2*n_jobs', 
             refit=True, 
             return_train_score='warn',
             scoring=None,
             verbose=0)

best_lda_model = model.best_estimator_

# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
docnames = ["Doc" + str(i) for i in range(len(df_mostExpressedPositiveReviews['Words_lemmatized']))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic["dominant_topic"] = dominant_topic

# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()
#print("***")
#print(vectorizer.get_feature_names())


# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

def tagging(topic_keywords):
    for key in topic_keywords:
        value = list(filter(None, topic_keywords[key])) # Get rid of empty items
        Tagged = nltk.pos_tag(value)
        return Tagged

def labelTopics(text):
    mylist = text
    mylist = unwantedCharacters_removing(mylist)
    mylist = stopwords_removing(mylist)
    mylist = list(mylist.split(" "))
    length = len(mylist)
    topic = []

    
# Again performing this step for Positive Remarks
#******************************************************************************************************************
## Step 6:               RBLSALT Automated Labelling Algorthim 
#******************************************************************************************************************


    # A = Noun
    # B = Verb
    # C = Adjective 

    for i in range(length):
        doc = nlp(mylist[i])
      

        ####Rule 1: if A => B => C 
        # Where Noun is followed by either Noun or Verb or Adjective

        if(doc[0].tag_ == 'NN'): # if the elemnet is Noun
            if(i <= (length - 2)):
                doc2 = nlp(mylist[i+1]) # find the adjacent element 

             ##Rule 1.1: if A => A
            if(doc2[0].tag_ == 'NN'): # if adjacent element is also Noun 
                topic.append(mylist[i]) # Then both of the elements are independ elements

             ##Rule 1.2: if A => B    
            if(doc2[0].tag_ == 'VB'): # if adjacent elemnt is verb 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Noun + Verb (Game Stop)
                topic.append(string)

             ##Rule 1.3: if A => C 
            if(doc2[0].tag_ == 'JJ'): # if adjacent elemnt is Adjective 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Noun + Adjective 
                topic.append(string)


        ####Rule 2: if B => A => C
         # Where Verb is followed by either Noun or Verb or Adjective

        elif(doc[0].tag_ == 'VB'): # if the elemnet is Verb
            if(i <= (length - 2)):
                doc2 = nlp(mylist[i+1])  # find the adjacent element 

             ##Rule 2.1: if B => A
            if(doc2[0].tag_ == 'NN'): # if adjacent element is Noun 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Verb + Noun 
                topic.append(string) 

             ##Rule 2.2: if B => B    
            if(doc2[0].tag_ == 'VB'): # if adjacent elemnt is also verb 
                topic.append(mylist[i]) # Then both of the elements are independ element

             ##Rule 2.3: if B => C    
            if(doc2[0].tag_ == 'JJ'): # if adjacent elemnt is Adjective 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Verb + Adjective
                topic.append(string)
                
                

         ####Rule 3: if C => A => B
         # Where Adjective is followed by either Noun or Verb or Adjective

        elif(doc[0].tag_ == 'JJ'): # if the elemnet is Adjective
            if(i <= (length - 2)):
                doc2 = nlp(mylist[i+1]) # find the adjacent element
                    
             ##Rule 3.1: if C => A
            if(doc2[0].tag_ == 'NN'): # if adjacent element is Noun
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Adjective + Noun 
                topic.append(string) 

             ##Rule 3.2: if C => B
            if(doc2[0].tag_ == 'VB'): # if adjacent elemnt is verb 
                string = mylist[i] + " " + mylist[i+1] # Then concentarte the elements; i.e Adjective + Verb 
                topic.append(string)

             ##Rule 3.3: if C => C
            if(doc2[0].tag_ == 'JJ'): # if adjacent elemnt is also adjective  
                topic.append(mylist[i]) # Then both of the elements are independ element
       
    return topic

#********************************* RBLSALT ends here **********************************************


topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)   

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]

values = ' '.join([str(i) for i in df_topic_keywords.values.tolist()])

topic1_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic2_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic3_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic4_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic5_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic6_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic7_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic8_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic9_values = values.split("]",1)[0]
values = values.split("]",1)[1]
topic10_values = values.split("]",1)[0]


topic_1 = labelTopics(topic1_values)
topic_2 = labelTopics(topic2_values)
topic_3 = labelTopics(topic3_values)
topic_4 = labelTopics(topic4_values)
topic_5 = labelTopics(topic5_values)
topic_6 = labelTopics(topic6_values)
topic_7 = labelTopics(topic7_values)
topic_8 = labelTopics(topic8_values)
topic_9 = labelTopics(topic9_values)
topic_10 = labelTopics(topic10_values)


Topics = [topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10]
    
df_topic_keywords['Potential_keywords_for_labelling_terms']= Topics
print(df_topic_keywords)

print(topic_1)
print(topic_2)
print(topic_3)
print(topic_4)
print(topic_5)
print(topic_6)
print(topic_7)
print(topic_8)
print(topic_9)
print(topic_10)




           Word 0 Word 1      Word 2     Word 3      Word 4     Word 5  \
Topic 0     great    app        want      start         far       play   
Topic 1    played   hour         buy       wish       phone  fantastic   
Topic 2   version   work     amazing        job      mobile        bit   
Topic 3      best    far         got     really         fun    graphic   
Topic 4      nice   game  controller    support     amazing       star   
Topic 5      good   game        love      think    original   favorite   
Topic 6   playing   good       music     puzzle        know        bit   
Topic 7      game  great        time     puzzle        play     played   
Topic 8      game   play        love    awesome  experience      great   
Topic 9  emulator  issue      highly  recommend   excellent        use   

              Word 6     Word 7      Word 8   Word 9  \
Topic 0          fix     change         bug     work   
Topic 1        thing       game        year     free   
Topic 2    excell