In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt 
import seaborn as sns
import demoji
import spacy

# Load DataSet and Data Dictionary

In [2]:
# main_dir = 
Dataset_name = 'TechLabsDataset.csv' 
DataDictionary_name = 'TechLabsDataset_Dictionary.csv'

Dataset = pd.read_csv(os.path.join('./Data/',Dataset_name), index_col = 0)
DataDictionary = pd.read_csv(os.path.join('./Data/',DataDictionary_name), index_col = 0)
Dataset.head()

Unnamed: 0,name,type_company,lat,lon,city,avg_stars_num,n_reviews_num,encoded_user,local_guide,clean_other_review_num,published_date,today_date,stars_num,review_EN,original_lang
0,Caretrex warehousing & logistics,Logistics service,51.593721,5.073492,Tilburg,5.0,2.0,HHpDM,0.0,4.0,2 years ago,2022-09-24,5.0,,
1,Caretrex warehousing & logistics,Logistics service,51.593721,5.073492,Tilburg,5.0,2.0,dYFWx,0.0,1.0,5 years ago,2022-09-24,5.0,,
2,FEFA Logistics,Trucking company,51.480074,5.446764,Eindhoven,,,,,,,,,,
3,Dekkers Transport Holland,Trucking company,51.597249,5.02799,Tilburg,4.6,53.0,Pp1)c,0.0,5.0,a year ago,2022-09-24,5.0,Best employer ever. Worked there for approxima...,ENG
4,Dekkers Transport Holland,Trucking company,51.597249,5.02799,Tilburg,4.6,53.0,*nk1l,0.0,2.0,a year ago,2022-09-24,5.0,better place with very good people everything ...,ENG


In [3]:
review_df = pd.DataFrame(Dataset[~Dataset['review_EN'].isnull()]['review_EN'])
review_df.head()


Unnamed: 0,review_EN
3,Best employer ever. Worked there for approxima...
4,better place with very good people everything ...
5,Nice
6,Good firm. The staff are very welcoming and f...
7,"Top company, and coffee, so recommended\n\n"


In [4]:
# find emo
def find_emo(text):
    # find emoji and add to list
    all_emoji = demoji.findall(text)
    emo = ''
    if len(all_emoji)>0:
        for i,j in enumerate(all_emoji):
            # list emo
            emo = emo+','+all_emoji[j]
    return emo


# replace emo with empty text
def replace_emo(text):
    # find emo and replce with empty text
    all_emoji = demoji.findall(text)

    if len(all_emoji)>0:
        for i,j in enumerate(all_emoji):
            text = text.replace(j,'')
    return text

In [5]:
# add emo
review_df['emo'] = review_df['review_EN'].apply(lambda x: find_emo(x))

In [6]:
# add review with no emo
review_df['review_no_emo'] = review_df['review_EN'].apply(lambda x: replace_emo(x))
# review_df[review_df['emo']!=''][['review_no_emo','emo']]

# NLP pre-processing on review_df['review_no_emo']


In [7]:
# get clean lemmas
def get_clean_lemma(text):  
    text = [token.lemma_ for token in nlp(text.lower()) if
                  not token.is_punct
                  and not token.is_currency
                  and not token.is_digit
                  and not token.is_punct
    #               and not token.is_oov# is out of vocabulary
                  and not token.is_space
                  and not token.is_stop
                  and not token.like_num
                  and not token.pos_== 'PUNCT'
                          ]
    return text


# # get tags per token
# def get_tag_lemma(text): 
#     tag_list = [token.pos_ for token in nlp(text.lower()) if
#                   not token.is_punct
#                   and not token.is_currency
#                   and not token.is_digit
#                   and not token.is_punct
#     #               and not token.is_oov# is out of vocabulary
#                   and not token.is_space
#                   and not token.is_stop
#                   and not token.like_num
#                           ]
#     # ['NN','NNS','NNP','NNPS'] nouns
#     # ['JJ','JJR','JJS'] adjectives
#     # ['RB','RBR','RBS'] adverbs
#     # ['VB','VBD','VBG','VBN','VBP','VBZ'] verbs
#     return tag_list


In [8]:

# check attribute of token in spacy: https://spacy.io/api/token

# list of stop words in spacy
# spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
# print('Number of stop words: %d' % len(spacy_stopwords))
# print('First ten stop words: %s' % list(spacy_stopwords))

nlp = spacy.load("en_core_web_sm")

review_df['text_lemmas'] = review_df['review_no_emo'].apply(lambda x: get_clean_lemma(x))
# review_df['lemmas_tags'] = review_df['review_no_emo'].apply(lambda x: get_tag_lemma(x))



In [9]:

concatenate_all_tokens = sum(review_df['text_lemmas'].tolist(),[])
from collections import Counter


word_counts = Counter(concatenate_all_tokens)


In [10]:
word_counts_df = pd.DataFrame.from_dict(word_counts, orient='index').reset_index()
word_counts_df.columns = ['word','#']
word_counts_df.sort_values('#',ascending = False, inplace = True)

In [11]:
word_counts_df['tag'] = word_counts_df['word'].apply(lambda x: nlp(x)[0].pos_)

In [12]:
np.unique(word_counts_df['tag'])

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
       'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SPACE', 'SYM', 'VERB',
       'X'], dtype=object)

In [13]:

word_counts_df[word_counts_df['tag']=='X']

Unnamed: 0,word,#,tag
551,dutch,22,X
1617,e,13,X
707,counter,12,X
325,>,11,X
406,km,10,X
233,etc,9,X
1979,k,8,X
1091,de,8,X
1444,scammer,8,X
1347,+,7,X


In [14]:
word_counts_df['tag_short'] = word_counts_df['tag'].apply(lambda x: tag_short(x))

NameError: name 'tag_short' is not defined

In [None]:
word_counts_df[word_counts_df['tag_short']=='adverbs']['word'].tolist()
['ea','p.m.','not','a.m.','ear','har','eurostar','tattoo','semi','ex-','right.only',
 'there.fast','gtv','bezeike','alphen','off.a','cbk']

In [None]:
word_counts_df[word_counts_df['tag_short']=='verbs']['word'].tolist()
# ['ea','p.m.','not','a.m.','ear','har','eurostar','tattoo','semi','ex-','right.only',
#  'there.fast','gtv','bezeike','alphen','off.a','cbk']
['netherlands','cmr','ref','den','fl','tci','lug','b.v','kamado','luxembourg','cm','bo','bah','bio','m','nico',
'logistics&move','ist','lorentaweg','sweden','co','amir',
 'rabote','fajn','om','cove',
 'leadl''patrick',
 'toplogistiek','toitoi','swerve','lkw','aceo',
 'melden','ce',
 'satay',
 'manfre',]

In [None]:
word_counts_df[word_counts_df['tag_short']=='nouns']