In [None]:
# -*- coding: utf-8 -*-
"""
Created on Monday Dec 23 2020
@author: Aruna Devi R
"""

'\nCreated on Monday Dec 22 2020\n@author: Aruna Devi R\n'

In [None]:
import nltk
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords

In [None]:
# Sample text that every later section of this notebook starts from
# (tokenization, stemming, lemmatization, Bag of Words, TF-IDF and Word2Vec).
paragraph = """Neuro-linguistic programming (NLP) is a psychological approach that involves analyzing strategies used by successful individuals and applying them to reach a personal goal. It relates thoughts, language, and patterns of behavior learned through experience to specific outcomes."""

### Tokenization

In [None]:
# Sentence tokenization: split the paragraph into a list of sentence strings,
# then report how many were found together with the sentences themselves.
sentences = nltk.sent_tokenize(paragraph)
print(f"{len(sentences)} {sentences}")

2 ['Neuro-linguistic programming (NLP) is a psychological approach that involves analyzing strategies used by successful individuals and applying them to reach a personal goal.', 'It relates thoughts, language, and patterns of behavior learned through experience to specific outcomes.']


In [None]:
# Word tokenization: split the whole paragraph into word and punctuation
# tokens, then report the token count together with the tokens themselves.
words = nltk.word_tokenize(paragraph)
print(f"{len(words)} {words}")

43 ['Neuro-linguistic', 'programming', '(', 'NLP', ')', 'is', 'a', 'psychological', 'approach', 'that', 'involves', 'analyzing', 'strategies', 'used', 'by', 'successful', 'individuals', 'and', 'applying', 'them', 'to', 'reach', 'a', 'personal', 'goal', '.', 'It', 'relates', 'thoughts', ',', 'language', ',', 'and', 'patterns', 'of', 'behavior', 'learned', 'through', 'experience', 'to', 'specific', 'outcomes', '.']


### Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Display the sentence list as it stands before the stemming cell rewrites it.
sentences

['Neuro-linguistic programming (NLP) is a psychological approach that involves analyzing strategies used by successful individuals and applying them to reach a personal goal.',
 'It relates thoughts, language, and patterns of behavior learned through experience to specific outcomes.']

In [None]:
stemmer = PorterStemmer()
# Build the stop-word set once; rebuilding it inside the comprehension re-reads
# the stop-word corpus for every single token.
stop_words = set(stopwords.words('english'))

# Stemming logic: tokenize each sentence, drop stop words, stem the remaining
# tokens and write the re-joined text back into `sentences`.
# NOTE: `sentences` is overwritten in place, so re-create it with
# nltk.sent_tokenize(paragraph) before running this cell a second time.
for i, sentence in enumerate(sentences):
    words = nltk.word_tokenize(sentence)
    # Compare in lower case: the stop-word list is lower-case, so a capitalised
    # token such as 'It' at the start of a sentence would otherwise be kept.
    words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
    sentences[i] = ' '.join(words)

In [None]:
# Display the sentences after stop-word removal and Porter stemming.
sentences

['neuro-linguist program ( nlp ) psycholog approach involv analyz strategi use success individu appli reach person goal .',
 'It relat thought , languag , pattern behavior learn experi specif outcom .']

### Lemmatization

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Re-tokenize from the original paragraph: the stemming cell overwrote the
# entries of `sentences` in place, so lemmatization needs a fresh copy.
sentences = nltk.sent_tokenize(paragraph)

In [None]:
lemmatizer = WordNetLemmatizer()
# Build the stop-word set once; rebuilding it inside the comprehension re-reads
# the stop-word corpus for every single token.
stop_words = set(stopwords.words('english'))

# Lemmatization: tokenize each sentence, drop stop words, lemmatize the
# remaining tokens and write the re-joined text back into `sentences`.
# NOTE: `sentences` is overwritten in place, so re-create it with
# nltk.sent_tokenize(paragraph) before running this cell a second time.
for i, sentence in enumerate(sentences):
    words = nltk.word_tokenize(sentence)  # word tokenization in each sentence
    # Compare in lower case: the stop-word list is lower-case, so a capitalised
    # token such as 'It' at the start of a sentence would otherwise be kept.
    words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
    sentences[i] = ' '.join(words)

In [None]:
# Display the sentences after stop-word removal and WordNet lemmatization.
sentences

['Neuro-linguistic programming ( NLP ) psychological approach involves analyzing strategy used successful individual applying reach personal goal .',
 'It relates thought , language , pattern behavior learned experience specific outcome .']

### Bag of Words with Stemmer

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

pstem = PorterStemmer()
# Start again from the raw paragraph (earlier cells rewrote `sentences`).
sentences = nltk.sent_tokenize(paragraph)
# Build the stop-word set once instead of once per token.
stop_words = set(stopwords.words('english'))
corpus = []

# Basic text pre-processing: one cleaned, stemmed string per sentence.
for sentence in sentences:
    # Replace every character that is not an ASCII letter with a space. This
    # already removes digits, brackets and punctuation, and split() below
    # collapses the resulting runs of whitespace, so no further regex passes
    # are needed.
    corp = re.sub('[^a-zA-Z]', ' ', sentence)
    corp = corp.lower().split()
    corp = [pstem.stem(word) for word in corp if word not in stop_words]
    corp = ' '.join(corp)
    corpus.append(corp)
corpus

['neuro linguist program nlp psycholog approach involv analyz strategi use success individu appli reach person goal',
 'relat thought languag pattern behavior learn experi specif outcom']

In [None]:
# Bag-of-Words model: vectorize each pre-processed sentence in `corpus`,
# limiting the vectorizer to at most 500 features.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)
X = cv.fit_transform(corpus)
# Convert the result to a dense NumPy array so it can be displayed directly.
X = X.toarray()

In [None]:
# Display the matrix most recently assigned to `X` (the Bag-of-Words result
# when the cells are run in order).
X

array([[1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 0, 1],
       [0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0]])

### Term Frequency-Inverse Document Frequency (TF-IDF)

In [None]:
# TF-IDF model: vectorize each pre-processed sentence in `corpus`.
# Note that this rebinds `cv` and `X`, replacing the Bag-of-Words objects
# created in the previous section.
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus)
# Convert the result to a dense NumPy array so it can be displayed directly.
X = X.toarray()

In [None]:
# Display the matrix most recently assigned to `X` (the TF-IDF result when the
# cells are run in order).
X

array([[0.25      , 0.25      , 0.25      , 0.        , 0.        ,
        0.25      , 0.25      , 0.25      , 0.        , 0.        ,
        0.25      , 0.25      , 0.25      , 0.        , 0.        ,
        0.25      , 0.25      , 0.25      , 0.25      , 0.        ,
        0.        , 0.25      , 0.25      , 0.        , 0.25      ],
       [0.        , 0.        , 0.        , 0.33333333, 0.33333333,
        0.        , 0.        , 0.        , 0.33333333, 0.33333333,
        0.        , 0.        , 0.        , 0.33333333, 0.33333333,
        0.        , 0.        , 0.        , 0.        , 0.33333333,
        0.33333333, 0.        , 0.        , 0.33333333, 0.        ]])

### Word2Vec (Word Embedding)

In [None]:
lemmatizer = WordNetLemmatizer()  # NOTE(review): created here but not used in this cell

# Preprocessing the data
text = re.sub(r'\[[0-9]*\]', ' ', paragraph)  # drop bracketed numeric markers such as "[12]"
text = text.lower()
text = re.sub(r'\d', ' ', text)               # drop any remaining digits
text = re.sub(r'\s+', ' ', text)              # collapse whitespace once, after all substitutions

# Build the stop-word set once: calling stopwords.words() per token re-reads
# the corpus each time and does a linear scan of the returned list.
stop_words = set(stopwords.words('english'))

# Preparing the dataset: one list of tokens per sentence.
# Tokens without any letter (e.g. '.', ',', '(', ')') are dropped as well so
# that punctuation is not trained as vocabulary by Word2Vec.
sentences = [
    [
        word
        for word in nltk.word_tokenize(sentence)
        if word not in stop_words and any(ch.isalpha() for ch in word)
    ]
    for sentence in nltk.sent_tokenize(text)
]

In [None]:
# Display the tokenized sentences that will be fed to Word2Vec.
sentences

In [None]:
from gensim.models import Word2Vec
# Training the Word2Vec model; min_count=1 keeps every token, which is needed
# here because the corpus is tiny and most tokens occur only once.
model = Word2Vec(sentences, min_count=1)
# Vocabulary lookup keyed by token. gensim 4.x removed `KeyedVectors.vocab`
# (accessing it raises AttributeError) in favour of `key_to_index`; fall back
# to `vocab` only on older gensim releases that lack the new attribute.
words = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab

In [None]:
# Look up the embedding learned for the token 'approach' (lower-case, because
# the training text was lower-cased during preprocessing).
vector = model.wv['approach']

In [None]:
# Most similar words
# NOTE(review): the model was trained only on the two sentences of
# `paragraph`, so these similarity scores are presumably not meaningful;
# treat this cell as a demonstration of the API rather than a result.
model.wv.most_similar('approach')

[('neuro-linguistic', 0.2270651012659073),
 ('strategies', 0.10953246802091599),
 ('patterns', 0.09466398507356644),
 ('.', 0.09436996281147003),
 ('behavior', 0.07975082099437714),
 ('relates', 0.07506164908409119),
 ('reach', 0.06882995367050171),
 ('personal', 0.03917991369962692),
 ('analyzing', 0.033424291759729385),
 ('language', 0.0220712348818779)]