In [69]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import html
import unicodedata
from gensim.models.fasttext import FastText
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import string
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('omw-1.4')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [25]:
# Only the first 100,000 rows are kept by the slicing cell further down, so let pandas
# stop parsing after that many lines instead of loading the whole JSON-lines file first.
# (`nrows` is only accepted together with `lines=True`.)
df=pd.read_json("../input/yelp-dataset/yelp_academic_dataset_tip.json",lines=True,nrows=100000)


In [26]:
df.head()

In [27]:
# Keep only the first 100,000 rows (positional slice); presumably this bounds the cost
# of the preprocessing and training steps below.
df=df[:100000]

In [28]:
df

In [29]:
df.shape

In [30]:
df.columns

In [31]:
df.isnull().sum()

# **Preprocessing on the text feature**

* Remove extra white space from the text
* Remove all special characters from the text
* Remove all single characters from the text
* Convert the text to lower case
* Perform word tokenization
* Lemmatize the tokens
* Remove stop words from the text
* Remove words shorter than 3 characters from the text


In [32]:
def remove_spaces(text):
    """Collapse every run of consecutive space characters in ``text`` into one space.

    Only the literal space character is matched; tabs and newlines are left as they are.
    """
    space_run = re.compile(' +')
    squeezed = space_run.sub(' ', text)
    return squeezed

# Quick sanity check on a string with irregular spacing.
t= 'The     quick  brown    fox'
remove_spaces(t)

In [33]:
def remove_special_char(text):
    """Delete every character that is neither an ASCII letter nor whitespace.

    Digits, punctuation and symbols are removed. Whitespace is kept so that
    neighbouring words are not glued together: the previous pattern
    ``[^A-Za-z]`` also deleted spaces, turning "good food!" into "goodfood".

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    str
        ``text`` containing only ASCII letters and the original whitespace.
    """
    return re.sub(r"[^A-Za-z\s]", "", text)

# Quick sanity check: symbols and digits disappear, word boundaries survive.
s="Hello$@ Python3$#^&*fbdsw@*42 56 "
remove_special_char(s)


In [34]:
# Work on a copy so the loaded frame `df` is not modified by the text cleaning below.
df_text=df.copy()
# Collapse repeated spaces in every entry of the "text" column.
df_text["text"]=df_text["text"].apply(remove_spaces)

In [35]:
df_text["text"]

In [36]:
def remove_special_chars(text):
    r"""Lower-case ``text`` and normalise HTML entities and escape leftovers.

    Steps, in order:

    1. Decode well-formed HTML entities (``&amp;`` -> ``&``, ``&#39;`` -> ``'``)
       and lower-case the result. Decoding has to happen *before* the fragment
       fix-ups below; otherwise a real ``&amp;`` is rewritten to ``&&`` and a
       real ``&#39;`` to ``&'`` because only the tail of the entity is replaced.
    2. Repair entity fragments whose leading ``&`` is missing (``#39;``,
       ``amp;``, ``nbsp;`` ...), literal backslash escapes (backslash + ``n``,
       backslash + ``"``), ``<br />`` tags and the ``<unk>`` / ``@.@`` / ``@-@``
       markers. The order of the table matters: backslash + ``n`` must be
       handled before the generic backslash rule.
    3. Decode any entity that step 2 may have produced and collapse runs of
       two or more spaces into a single space.

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    str
        The lower-cased, normalised string.
    """
    fixups = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    cleaned = html.unescape(text).lower()
    for fragment, replacement in fixups:
        cleaned = cleaned.replace(fragment, replacement)
    return re.sub(r'  +', ' ', html.unescape(cleaned))

In [37]:
# Lower-case each text and normalise HTML entities / escape leftovers, then show the column.
df_text["text"]=df_text["text"].apply(remove_special_chars)
df_text["text"]

In [38]:
def remove_non_ascii(text):
    """Return the string ``text`` with every non-ASCII character stripped.

    The input is NFKD-normalised first, so an accented letter is decomposed into
    its base letter plus combining marks; encoding to ASCII with ``'ignore'``
    then drops whatever cannot be represented (the marks, emoji, ...).
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [39]:
# Strip non-ASCII characters from each text, then show the column.
df_text["text"]=df_text["text"].apply(remove_non_ascii)
df_text["text"]

In [40]:
def to_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered



In [41]:
def tokenize(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


# Replace each text string with its token list.
# NOTE: not re-runnable on its own — a second run would hand the token lists (not strings)
# to word_tokenize; rebuild `df_text` from `df` before running this cell again.
df_text["text"]=df_text["text"].apply(tokenize)

In [42]:
df_text["text"]

In [43]:
# Built once: membership tests against a set are O(1), unlike scanning the NLTK list per token.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(words, stop_words=_ENGLISH_STOP_WORDS, min_length=3):
    """Drop stop words and tokens shorter than ``min_length`` characters.

    The preprocessing plan for this notebook says to remove words whose length
    is *less than 3*, i.e. three-letter words such as "bad" or "hot" must be
    kept. The previous condition ``len(word) > 3`` discarded them as well.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    stop_words : collection of str, optional
        Words to discard; defaults to NLTK's English stop-word list.
    min_length : int, default 3
        Minimum number of characters a token needs in order to be kept.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Callers may still pass a plain list/tuple; convert it once per call for fast lookups.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)
    return [word for word in words if len(word) >= min_length and word not in stop_words]

In [44]:
# Filter stop words and very short tokens out of each token list.
df_text["text"]=df_text["text"].apply(remove_stopwords)

In [45]:
df_text["text"]

In [46]:
def remove_punct(words,punc=string.punctuation):
    """Drop every token that consists solely of punctuation characters.

    ``punc`` is a string by default, so the former test ``word not in punc``
    was a *substring* check: a token was removed only if it happened to appear
    verbatim inside ``string.punctuation``. Multi-character tokens such as
    "...", "!!" or the tokenizer's quote marks slipped through. Testing each
    character instead removes all punctuation-only tokens, while tokens that
    merely contain punctuation ("don't", "e-mail") are kept.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    punc : container of str, optional
        Characters regarded as punctuation; defaults to ``string.punctuation``.

    Returns
    -------
    list of str
        Tokens that contain at least one non-punctuation character.
    """
    return [word for word in words if not all(ch in punc for ch in word)]

In [47]:
x=['love', 'cubans', '!' ,'!']
remove_punct(x)

In [48]:
# Remove punctuation tokens from each token list.
df_text["text"]=df_text["text"].apply(remove_punct)

In [49]:
df_text["text"]

In [50]:
def lemmtize_words(words,lemmtizer=WordNetLemmatizer()):
    """Lemmatize every token as a verb (``pos='v'``) and return the results as a new list.

    The default lemmatizer instance is created once, when the function is defined,
    and is shared by all calls that do not pass their own.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmtizer.lemmatize(token, pos='v'))
    return lemmas

In [51]:
# Lemmatize the tokens of each document (every token is treated as a verb).
df_text["text"]=df_text["text"].apply(lemmtize_words)

In [52]:
# One token list per row, in row order — the corpus the FastText model is trained on.
word_tokens=df_text["text"].tolist()
# Preview only the first few documents: echoing the whole list would print every
# document into the cell output, since plain Python lists are not truncated on display.
word_tokens[:5]

In [53]:
# Hyper-parameters of the FastText model
embedding_size = 300     # dimensionality of the word vectors
window_size = 5          # context window size
min_word = 5             # passed as min_count: rarer words are ignored
down_sampling = 1e-2     # passed as sample: down-sampling threshold for frequent words

# Train on the token lists; sg=1 selects the skip-gram architecture.
fast_Text_model = FastText(
    word_tokens,
    vector_size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    sg=1,
    epochs=100,
)

In [55]:
# Save the trained FastText model to the working directory.
fast_Text_model.save("./ft_model_yelp")
# Reload it through the class that created it. The model is a FastText instance, so
# FastText.load (not Word2Vec.load) is the matching loader; FastText is already
# imported in the first cell, so no extra import is needed here.
fast_Text_model = FastText.load("./ft_model_yelp")

In [60]:
fast_Text_model.wv['great']

In [63]:
# 10 most similar words to "awesome"
fast_Text_model.wv.most_similar("awesome", topn=10)

In [79]:
def tsne_plot(for_word, w2v_model, topn=10, perplexity=15):
    """Draw a 2-D t-SNE map of ``for_word`` and its most similar words.

    The query word is drawn in red, its neighbours in green. The word vectors
    are first reduced with PCA and then projected to two dimensions with t-SNE.

    Parameters
    ----------
    for_word : str
        Query word.
    w2v_model : gensim model
        Trained model exposing its vectors as ``.wv`` (indexing by word and
        ``most_similar`` are used).
    topn : int, default 10
        Number of nearest neighbours to plot next to the query word.
    perplexity : float, default 15
        Requested t-SNE perplexity. scikit-learn requires the perplexity to be
        smaller than the number of samples, so the value is capped at
        ``n_points - 1`` (with the defaults only 11 points are plotted, and an
        uncapped 15 raises a ValueError in current scikit-learn).

    Returns
    -------
    None
        The figure is drawn on the current matplotlib axes.

    Raises
    ------
    ValueError
        If fewer than two words are available to plot.
    """
    keyed_vectors = w2v_model.wv

    # Query word first, then its neighbours in order of decreasing similarity.
    neighbours = [word for word, _score in keyed_vectors.most_similar(for_word, topn=topn)]
    word_labels = [for_word] + neighbours
    color_list = ['red'] + ['green'] * len(neighbours)

    # One row per word; built in a single step instead of repeated np.append copies.
    arrays = np.vstack([keyed_vectors[word] for word in word_labels])
    n_points, dim_size = arrays.shape
    if n_points < 2:
        raise ValueError("need the query word and at least one neighbour to build a t-SNE plot")

    # PCA pre-reduction; n_components cannot exceed min(n_samples, n_features).
    n_pca_components = min(10, n_points, dim_size)
    model_pca = PCA(n_components=n_pca_components).fit_transform(arrays)

    # NOTE: this changes NumPy's print options globally (kept from the original code).
    np.set_printoptions(suppress=True)

    # Project to 2-D; cap the perplexity below the sample count (see docstring).
    effective_perplexity = min(perplexity, n_points - 1)
    Y = TSNE(n_components=2, random_state=0,
             perplexity=effective_perplexity).fit_transform(model_pca)

    # Collect everything needed for plotting in one frame.
    df_plot = pd.DataFrame({'x': Y[:, 0],
                            'y': Y[:, 1],
                            'words_name': word_labels,
                            'words_color': color_list})

    # Scatter only (no regression line), one colour per point.
    plot_dot = sns.regplot(data=df_plot,
                           x="x",
                           y="y",
                           fit_reg=False,
                           marker="o",
                           scatter_kws={'s': 40,
                                        'facecolors': df_plot['words_color']})

    # Label every point with its word, using the point's colour.
    for row in df_plot.itertuples(index=False):
        plot_dot.text(row.x,
                      row.y,
                      '  ' + row.words_name.title(),
                      horizontalalignment='left',
                      verticalalignment='bottom',
                      size=15,
                      color=row.words_color,
                      weight='normal')

    # Pad the axes so the labels of the outermost points stay inside the frame.
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    plt.title('t-SNE visualization for word "{}"'.format(for_word.title()))


In [80]:

# tsne plot for top 10 similar word to 'awesome'
tsne_plot(for_word='awesome', w2v_model=fast_Text_model)