In [1]:
import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import nltk                                         #Natural language processing tool-kit

from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec                                   #For Word2Vec

In [2]:
import re

In [3]:
#TODO: spaCy could also be used for the cleaning steps -- explore

In [9]:
# Earlier Windows location of the same file, kept for reference:
#data_path = "C:\\Users\\KANUKMA2\\Projects\\mentor\\Reviews.csv"
# Relative path to the reviews CSV; a relative path is resolved against the
# kernel's current working directory, so it must be adjusted per machine.
data_path='../../Code/NLP/nlp codes-1/nlpdatasets/Reviews.csv'
# Load the whole CSV into a DataFrame.
data = pd.read_csv(data_path)

FileNotFoundError: [Errno 2] No such file or directory: '../../Code/NLP/nlp codes-1/nlpdatasets/Reviews.csv'

In [None]:
data.shape

In [None]:
data.head(2)

In [None]:
#find missing values
data.isnull().sum()

In [None]:
#work with the first 100 reviews only (head(100), not a random sample)
data_sel = data.head(100) 

In [None]:
# Column names of our data
data_sel.columns

In [None]:
data_sel.head(2)

In [None]:
#The Score column has values 1,2,3,4,5. Scores 1 and 2 are considered negative reviews and 4 and 5 positive reviews.
#Score = 3 is considered a neutral review and is dropped below.

In [None]:
data_score_removed = data_sel[data_sel['Score']!=3]

In [None]:
data_score_removed.head()

In [None]:
#Converting Score values into a class label, either positive or negative.
def partition(x):
    """Map a numeric review score to a sentiment label.

    Scores below 3 become 'negative'; every other score becomes 'positive'.
    """
    return 'negative' if x < 3 else 'positive'

In [None]:
# Replace the numeric Score with its sentiment label.
# data_score_removed is a filtered selection of data_sel, and assigning a
# column into such a selection triggers pandas' SettingWithCopyWarning, so the
# relabelled frame is built with assign(), which returns a new DataFrame.
score_upd = data_score_removed['Score']
t = score_upd.map(partition)
data_score_removed = data_score_removed.assign(Score=t)

In [None]:
data_score_removed.head()

In [None]:
# Drop repeated reviews: rows that agree on user id, profile name, timestamp
# and text are treated as duplicates and only the first is kept.
# subset is given as a list because drop_duplicates documents it as a column
# label or a sequence of labels, and a set is not a sequence.
final_data = data_score_removed.drop_duplicates(subset=["UserId", "ProfileName", "Time", "Text"])

In [None]:
#HelpfulnessNumerator is the number of people who found the review useful,
#and HelpfulnessDenominator is the number who found it useful plus the number who did not.

In [None]:
# Keep only rows where the helpful-vote count does not exceed the total vote count.
helpfulness_is_valid = final_data['HelpfulnessNumerator'] <= final_data['HelpfulnessDenominator']
final = final_data[helpfulness_is_valid]

In [None]:
#final_X = final['Text']
#final_y = final['Score']

In [None]:
final_X =  final[['ProductId','Text']].copy()

In [None]:
def _tokenize_review(text):
    """Replace each run of non-alphanumeric characters with a space, lowercase, then split on whitespace."""
    return re.sub('[^a-zA-Z0-9]+', ' ', text).lower().split()


final_X['tokenized_sents'] = final_X['Text'].map(_tokenize_review)

In [None]:
final_X.head()

In [None]:
stop = set(stopwords.words('english')) 
print(stop)

In [None]:
# Drop stopwords and tokens shorter than three characters.
final_X['tokens'] = final_X['tokenized_sents'].apply(
    lambda words: [word for word in words if len(word) >= 3 and word not in stop]
)

In [None]:
final_X.head()

In [None]:
final_X['tokens'][1]

In [None]:
#Stemming is a method of normalization of words in Natural Language Processing.
#Stemming algorithm works by cutting the suffix from the word. 
#In a broader sense cuts either the beginning or end of the word.

In [None]:
ps =PorterStemmer()

In [None]:
# Print the Porter stem of every token in the row selected by final_X['tokens'][1].
for token in final_X['tokens'][1]:
    print(ps.stem(token))

In [None]:
#In linguistics, morphology is the study of words, how they are formed, and their relationship to 
#other words in the same language.
#It analyzes the structure of words and parts of words such as stems, root words, prefixes, and suffixes.

In [None]:
#Lemmatization usually refers to the morphological analysis of words, which aims to remove inflectional endings. 
#It helps in returning the base or dictionary form of a word known as the lemma.

In [None]:
lemma = WordNetLemmatizer()

In [None]:
for w in final_X['tokens'][1]:
    print("Lemma for {} is {}".format(w, lemma.lemmatize(w))) 

In [None]:
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, lemma.lemmatize(w)))  

In [None]:
#Tag each word with corresponding Part of Speech
tagged = nltk.pos_tag(final_X['tokens'][1])
print('POS tagged words= ',tagged)

In [None]:
#Chunk definition : one or more verbs followed by one or more nouns.
#In an NLTK tag pattern '.' stands for exactly one character, so <VB.> and <NN.>
#would skip the bare VB and NN tags; <VB.*> and <NN.*> match every tag that
#starts with VB / NN.
chunkGram = r"""chunk: {<VB.*>+<NN.*>+}"""
#Passing the Chunk to a regex parser
chunkParser = nltk.RegexpParser(chunkGram)
#Parsing the POS-tagged tokens into a tree that contains the matched chunks
chunked = chunkParser.parse(tagged)
print(chunked)

In [None]:
# Accessing the Chunk
# Collect every subtree labelled 'chunk' first so we know whether any matched.
chunk_subtrees = list(chunked.subtrees(filter=lambda t: t.label() == 'chunk'))
for subtree in chunk_subtrees:
    print('Filtered chunks= ',subtree)
    # Words of the chunk joined into one phrase (POS tags dropped).
    chunked_output = ' '.join([w for w, t in subtree.leaves()])
#Visualize the output
# draw() renders the whole `chunked` tree, not a single subtree, so it is
# called once after the loop instead of once per chunk, and only when at
# least one chunk was found.
if chunk_subtrees:
    chunked.draw()

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a treebank-style POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag yields ''.
    """
    # Map the tag's first letter to the name of the wordnet attribute; the
    # attribute itself is only looked up when a mapping exists.
    attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_initial.get(treebank_tag[:1])
    if attr_name is None:
        return ''
    return getattr(wordnet, attr_name)

def generate_ngrams(words_list, n):
    """Return the n-grams of words_list, each joined with underscores.

    Every window of n consecutive items becomes one 'w1_w2_..._wn' string,
    in order of appearance. A list shorter than n yields an empty list.
    """
    total = len(words_list)
    # Windows starting at or after this index would hold fewer than n items.
    first_invalid_start = total - n + 1
    return [
        '_'.join(words_list[start:start + n])
        for start in range(total)
        if start < first_invalid_start
    ]

In [None]:
# Tag every token list with part-of-speech labels.
final_X['POS_tokens'] = final_X['tokens'].apply(pos_tag)


def _lemmatize_tagged(tagged_tokens):
    """Lemmatize (word, tag) pairs, leaving words whose tag has no WordNet POS unchanged."""
    lemmas = []
    for word, treebank_tag in tagged_tokens:
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos != '':
            lemmas.append(lemma.lemmatize(word, wordnet_pos))
        else:
            lemmas.append(word)
    return lemmas


final_X['tokens_lemmatized'] = final_X['POS_tokens'].apply(_lemmatize_tagged)

In [None]:
final_X['tokens_2g'] = final_X['tokens_lemmatized'].apply(lambda x: generate_ngrams(x,2))

In [None]:
final_X['tokenized_clean'] = final_X['tokens_lemmatized']+final_X['tokens_2g']

In [None]:
final_X.head()

In [None]:
# Join each row's unigrams and bigrams into one space-separated string.
final_X['clean_sent'] = final_X['tokenized_clean'].apply(
    lambda tokens: ' '.join(map(str, tokens))
)

In [None]:
final_X.head()

In [None]:
#final_X.drop(['tokenized_sents','tokens_2g','POS_tokens','tokens_lemmatized','tokens','Text','tokenized_clean'], axis=1, inplace=True)

In [None]:
final_clean = final_X['clean_sent']

In [None]:
print(final_clean[1])

In [None]:
#Techniques for Encoding

In [None]:
#BAG OF WORDS (CountVectorizer stores term counts by default; pass binary=True for a binary bag of words)

In [None]:
# Bag-of-words vectorizer whose vocabulary is capped at 5000 features.
count_vect = CountVectorizer(max_features=5000)
# Learn the vocabulary from the cleaned sentences and build the document-term matrix.
bow_data = count_vect.fit_transform(final_clean)
# Show the entries of the matrix row at position 1.
print(bow_data[1])

In [None]:
# Column labels for the dense bag-of-words table.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    bow_columns = count_vect.get_feature_names_out()
else:
    bow_columns = count_vect.get_feature_names()
bow_dataframe=pd.DataFrame(bow_data.toarray(),columns=bow_columns)

In [None]:
bow_dataframe.shape

In [None]:
bow_dataframe.head()

In [None]:
#final_B_X =final_clean
#count_vect = CountVectorizer(ngram_range=(1,2))
#Bigram_data = count_vect.fit_transform(final_B_X)
#print(Bigram_data[1])

In [None]:
#bigram_dataframe=pd.DataFrame(Bigram_data.toarray(),columns=count_vect.get_feature_names())

In [None]:
#bigram_dataframe.shape

In [None]:
#bigram_dataframe.head()

In [None]:
#TF-IDF

In [None]:
final_tf = final_clean
tf_idf = TfidfVectorizer(max_features=5000)
tf_data = tf_idf.fit_transform(final_tf)
print(tf_data[1])

In [None]:
# Column labels for the dense TF-IDF table.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf_idf, 'get_feature_names_out'):
    tfidf_columns = tf_idf.get_feature_names_out()
else:
    tfidf_columns = tf_idf.get_feature_names()
tfidf_dataframe=pd.DataFrame(tf_data.toarray(),columns=tfidf_columns)

In [None]:
tfidf_dataframe.shape

In [None]:
tfidf_dataframe.head()

In [None]:
#The TF-IDF weight of a word increases proportionally to the number of times the word appears in a document,
#but is offset by the number of documents that contain the word. So, words that are common in every document,
#such as this, what, and if, rank low even though they may appear many times, since they don’t mean much 
#to that document in particular.

In [None]:
#1. Information retrieval:
#Ex: Word "Curiosity"
#Imagine you have a search engine and somebody looks for Curiosity. 
#The results will be displayed in order of relevance. 
#That’s to say the most relevant articles will be ranked higher because TF-IDF gives
#the word Curiosity a higher score.

In [None]:
#2. Keyword:
#The highest scoring words of a document are the most relevant to that document, and 
#therefore they can be considered keywords for that document. 

In [None]:
#Word2Vec

In [None]:
w2v_data = final_clean

In [None]:
#list of list of words
# One token list per cleaned sentence, split on whitespace.
splitted = [row.split() for row in w2v_data]

In [None]:
#Word2Vec parameters*********************************
#size: The number of dimensions of the embeddings and the default is 100 (this argument is named vector_size from gensim 4.0 onwards).
#window: The maximum distance between a target word and words around the target word. The default window is 5.
#min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.
#workers: The number of worker threads used during training and the default workers is 3.
#sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.
#****************************************************

In [None]:
# Train 50-dimensional embeddings on the tokenized reviews, keeping every word
# (min_count=1). gensim 4.0 renamed the dimension argument from `size` to
# `vector_size`, so try the current name first and fall back for gensim 3.x,
# where the unknown keyword raises TypeError.
try:
    model = Word2Vec(splitted, min_count=1, vector_size=50, workers=4)
except TypeError:
    model = Word2Vec(splitted, min_count=1, size=50, workers=4)

In [None]:
# Query through the KeyedVectors in model.wv: Word2Vec.most_similar was
# deprecated in gensim 3.x and removed in 4.0. topn=5 returns the five
# nearest neighbours directly.
model.wv.most_similar('jumbo', topn=5)

In [None]:
#Play with the above parameters in Word2Vec

In [None]:
#GloVe - Explore