In [1]:
# One-time setup; uncomment and run if NLTK or its data is missing.
#!pip install nltk
#nltk.download("punkt")
# NOTE(review): later cells also use stopwords.words("english") and WordNetLemmatizer, which
# presumably need the "stopwords" and "wordnet" NLTK resources downloaded too — confirm.

In [2]:
# Sample text used as the corpus for the rest of the notebook.
# NOTE(review): three sentence boundaries have no space after the period
# ("unsupervised.Deep-learning", "performance.Artificial", "analogue.The"), and the
# tokenized output further down shows those sentences staying merged into one document.
paragraph="Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.Artificial neural networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analogue.The adjective 'deep' in deep learning refers to the use of multiple layers in the network. Early work showed that a linear perceptron cannot be a universal classifier, but that a network with a nonpolynomial activation function with one hidden layer of unbounded width can. Deep learning is a modern variation which is concerned with an unbounded number of layers of bounded size, which permits practical application and optimized implementation, while retaining theoretical universality under mild conditions. In deep learning the layers are also permitted to be heterogeneous and to deviate widely from biologically informed connectionist models, for the sake of efficiency, trainability and understandability, hence the 'structured' part."

In [3]:
import re
import nltk
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
# Break the paragraph down into documents: each sentence returned by
# nltk.sent_tokenize is treated as one document
documents=nltk.sent_tokenize(paragraph)

In [5]:
# Show the sentence-level documents produced by the tokenizer
print(documents)

['Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning.', 'Learning can be supervised, semi-supervised or unsupervised.Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.Artificial neural networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems.', 'ANNs have various differences from biological brains.', "Specifically, artificial neural networks 

In [6]:
# Stemming (faster, but need not give useful root words):
# the output below shows "history" being reduced to 'histori'
stemmer=PorterStemmer()
stemmer.stem("history")

'histori'

In [7]:
# Lemmatization (comparatively slow, but gives correct root words):
# the output below shows "history" kept as 'history'
lemmatizer=WordNetLemmatizer()
lemmatizer.lemmatize("history")

'history'

### Preprocessing
- Remove punctuation
- Remove stopwords
- Stemming / Lemmatization


In [8]:
# Number of documents (sentences) obtained from the paragraph
len(documents)

7

In [9]:
# Build the cleaned corpus: for every document, produce one space-joined string of
# lower-cased, lemmatized tokens with English stopwords removed.
# The stopword set is built once here; rebuilding it for every token (as the loop
# previously did) is loop-invariant work that gives the same result.
english_stopwords = set(stopwords.words("english"))
corpus = []
for document in documents:
    # Replace any non-alphabetic character with a space, lowercase, split on whitespace
    tokens = re.sub('[^a-zA-Z]', ' ', document).lower().split()
    # Keep only non-stopword tokens and lemmatize each of them
    kept_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(kept_tokens))
            

In [10]:
# Inspect the cleaned documents
corpus

['deep learning also known deep structured learning part broader family machine learning method based artificial neural network representation learning',
 'learning supervised semi supervised unsupervised deep learning architecture deep neural network deep belief network deep reinforcement learning recurrent neural network convolutional neural network transformer applied field including computer vision speech recognition natural language processing machine translation bioinformatics drug design medical image analysis climate science material inspection board game program produced result comparable case surpassing human expert performance artificial neural network anns inspired information processing distributed communication node biological system',
 'anns various difference biological brain',
 'specifically artificial neural network tend static symbolic biological brain living organism dynamic plastic analogue adjective deep deep learning refers use multiple layer network',
 'early wo

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with its default settings (no arguments passed)
vectorizer = TfidfVectorizer()

In [15]:
# Fit the vectorizer on the cleaned corpus and get the TF-IDF matrix for it
X = vectorizer.fit_transform(corpus)

In [22]:
# Terms the vectorizer learned from the corpus (the output lists them alphabetically)
vectorizer.get_feature_names_out()

array(['activation', 'adjective', 'also', 'analogue', 'analysis', 'anns',
       'application', 'applied', 'architecture', 'artificial', 'based',
       'belief', 'bioinformatics', 'biological', 'biologically', 'board',
       'bounded', 'brain', 'broader', 'cannot', 'case', 'classifier',
       'climate', 'communication', 'comparable', 'computer', 'concerned',
       'condition', 'connectionist', 'convolutional', 'deep', 'design',
       'deviate', 'difference', 'distributed', 'drug', 'dynamic', 'early',
       'efficiency', 'expert', 'family', 'field', 'function', 'game',
       'hence', 'heterogeneous', 'hidden', 'human', 'image',
       'implementation', 'including', 'information', 'informed',
       'inspection', 'inspired', 'known', 'language', 'layer', 'learning',
       'linear', 'living', 'machine', 'material', 'medical', 'method',
       'mild', 'model', 'modern', 'multiple', 'natural', 'network',
       'neural', 'node', 'nonpolynomial', 'number', 'one', 'optimized',
       

In [30]:
# Dense view of the TF-IDF matrix: one row per document, one column per vocabulary term
X.toarray()

array([[0.        , 0.        , 0.20774089, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.17757002,
        0.25026433, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.25026433, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.27009316, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.25026433, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.25026433, 0.        , 0.        , 0.54018632, 0.        ,
        0.        , 0.20774089, 0.        , 0.        , 0.25026433,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.15416768, 0.17757002, 0.        , 0.  

In [23]:
# (number of documents, number of vocabulary terms)
print(X.shape)

(7, 125)


##### There are 7 sentences in the corpus and each document will be represented by a vector of length 125

In [16]:
vectorizer.vocabulary_  # Mapping from each term to its feature (column) index

{'deep': 30,
 'learning': 58,
 'also': 2,
 'known': 55,
 'structured': 103,
 'part': 78,
 'broader': 18,
 'family': 40,
 'machine': 61,
 'method': 64,
 'based': 10,
 'artificial': 9,
 'neural': 71,
 'network': 70,
 'representation': 92,
 'supervised': 104,
 'semi': 97,
 'unsupervised': 117,
 'architecture': 8,
 'belief': 11,
 'reinforcement': 91,
 'recurrent': 89,
 'convolutional': 29,
 'transformer': 111,
 'applied': 7,
 'field': 41,
 'including': 50,
 'computer': 25,
 'vision': 121,
 'speech': 101,
 'recognition': 88,
 'natural': 69,
 'language': 56,
 'processing': 85,
 'translation': 112,
 'bioinformatics': 12,
 'drug': 35,
 'design': 31,
 'medical': 63,
 'image': 48,
 'analysis': 4,
 'climate': 22,
 'science': 96,
 'material': 62,
 'inspection': 53,
 'board': 15,
 'game': 43,
 'program': 87,
 'produced': 86,
 'result': 93,
 'comparable': 24,
 'case': 20,
 'surpassing': 105,
 'human': 47,
 'expert': 39,
 'performance': 80,
 'anns': 5,
 'inspired': 54,
 'information': 51,
 'distribut

In [17]:
# Print the term -> index vocabulary with the terms in alphabetical order
import collections
print(
    dict(
        collections.OrderedDict(
            sorted(vectorizer.vocabulary_.items(), key=lambda term_index: term_index[0])
        )
    )
)

{'activation': 0, 'adjective': 1, 'also': 2, 'analogue': 3, 'analysis': 4, 'anns': 5, 'application': 6, 'applied': 7, 'architecture': 8, 'artificial': 9, 'based': 10, 'belief': 11, 'bioinformatics': 12, 'biological': 13, 'biologically': 14, 'board': 15, 'bounded': 16, 'brain': 17, 'broader': 18, 'cannot': 19, 'case': 20, 'classifier': 21, 'climate': 22, 'communication': 23, 'comparable': 24, 'computer': 25, 'concerned': 26, 'condition': 27, 'connectionist': 28, 'convolutional': 29, 'deep': 30, 'design': 31, 'deviate': 32, 'difference': 33, 'distributed': 34, 'drug': 35, 'dynamic': 36, 'early': 37, 'efficiency': 38, 'expert': 39, 'family': 40, 'field': 41, 'function': 42, 'game': 43, 'hence': 44, 'heterogeneous': 45, 'hidden': 46, 'human': 47, 'image': 48, 'implementation': 49, 'including': 50, 'information': 51, 'informed': 52, 'inspection': 53, 'inspired': 54, 'known': 55, 'language': 56, 'layer': 57, 'learning': 58, 'linear': 59, 'living': 60, 'machine': 61, 'material': 62, 'medical'

##### We see there are 125 words in our vocabulary (feature indices 0–124), and the indices are assigned in alphabetical order of the words

In [18]:
# Keep the first cleaned document for a closer look
first_document=corpus[0]

In [19]:
# Display the first cleaned document
first_document

'deep learning also known deep structured learning part broader family machine learning method based artificial neural network representation learning'

In [20]:
X[0].toarray() # TF-IDF row of the first document; feature 0 is "activation", which does not occur in this document, so its value is 0

array([[0.        , 0.        , 0.20774089, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.17757002,
        0.25026433, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.25026433, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.27009316, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.25026433, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.25026433, 0.        , 0.        , 0.54018632, 0.        ,
        0.        , 0.20774089, 0.        , 0.        , 0.25026433,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.15416768, 0.17757002, 0.        , 0.  

In [21]:
X[0].toarray().shape
# Hence every document will be represented by a vector of length 125

(1, 125)

### n-gram TF-IDF

In [31]:
# TfidfVectorizer is the class instantiated below, so it is imported here as well;
# the CountVectorizer import is kept as-is although this cell does not use it.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,3)) #From unigrams to trigrams

In [32]:
# Fit the n-gram vectorizer on the corpus, then print its term -> index
# vocabulary with the terms in alphabetical order
import collections
X = vectorizer.fit_transform(corpus)
print(
    dict(
        collections.OrderedDict(
            sorted(vectorizer.vocabulary_.items(), key=lambda term_index: term_index[0])
        )
    )
)

{'activation': 0, 'activation function': 1, 'activation function one': 2, 'adjective': 3, 'adjective deep': 4, 'adjective deep deep': 5, 'also': 6, 'also known': 7, 'also known deep': 8, 'also permitted': 9, 'also permitted heterogeneous': 10, 'analogue': 11, 'analogue adjective': 12, 'analogue adjective deep': 13, 'analysis': 14, 'analysis climate': 15, 'analysis climate science': 16, 'anns': 17, 'anns inspired': 18, 'anns inspired information': 19, 'anns various': 20, 'anns various difference': 21, 'application': 22, 'application optimized': 23, 'application optimized implementation': 24, 'applied': 25, 'applied field': 26, 'applied field including': 27, 'architecture': 28, 'architecture deep': 29, 'architecture deep neural': 30, 'artificial': 31, 'artificial neural': 32, 'artificial neural network': 33, 'based': 34, 'based artificial': 35, 'based artificial neural': 36, 'belief': 37, 'belief network': 38, 'belief network deep': 39, 'bioinformatics': 40, 'bioinformatics drug': 41, 'b

In [35]:
# (number of documents, number of n-gram features)
X.toarray().shape

(7, 433)

##### There are 7 sentences in the corpus and each document will be represented by a vector of length 433

In [33]:
# Keep the first cleaned document for a closer look
first_document=corpus[0]

In [34]:
X[0].toarray() # n-gram TF-IDF row of the first document; feature 0 is "activation", which does not occur in this document, so its value is 0

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.11886579, 0.14319698, 0.14319698, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.10160254, 0.10160254, 0.10160254, 0.14319698,
        0.14319698, 0.14319698, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.14319698,
        0.14319698, 0.14319698, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [36]:
X[0].toarray().shape
# Hence every document is now represented by a vector of length 433 (unigram to trigram features)

(1, 433)