In [1]:
import goose3
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import spacy
from spacy.lang.en.examples import sentences 

In [2]:
goose= goose3.Goose()
data=goose.extract("https://en.wikipedia.org/wiki/Natural_language_processing")

In [3]:
data=data.cleaned_text

In [4]:
data

'Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.\n\nNatural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that involves the automated interpre

In [5]:
data[:16]

'Natural language'

In [6]:
sentences = [sentence for sentence in nltk.sent_tokenize(data)]

In [7]:
sentences

['Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.',
 'The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.',
 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.',
 'Natural language processing has its roots in the 1950s.',
 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence.',
 'The proposed test includes a task that involves the 

In [8]:
sentences[0]

'Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.'

In [17]:
processed_sentences

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [18]:
English_Model = spacy.load("en_core_web_sm", disable = ["ner", "parser"])

In [19]:
def Process_Data(sentence):
    sen = English_Model(sentence)
    
    temp = []
    
    for token in sen: 
        if not token.is_stop and not token.is_punct and not token.like_num and not token.is_space:
            temp_word = token.lemma_
            temp.append(temp_word.lower())
            
            
    return " ".join(temp)

In [20]:
processed_sentences = []

for sentence in sentences:
    processed_sentences.append(Process_Data(sentence))

In [21]:
processed_sentences

['natural language processing nlp interdisciplinary subfield linguistic computer science artificial intelligence concern interaction computer human language particular program computer process analyze large amount natural language datum',
 'goal computer capable understand content document include contextual nuance language',
 'technology accurately extract information insight contain document categorize organize document',
 'natural language processing root 1950s',
 'alan turing publish article title computing machinery intelligence propose call ture test criterion intelligence time articulate problem separate artificial intelligence',
 'propose test include task involve automate interpretation generation natural language',
 'premise symbolic nlp summarize john searle chinese room experiment give collection rule e.g. chinese phrasebook question matching answer computer emulate natural language understanding nlp task apply rule datum confront',
 '1950 georgetown experiment involve full

In [23]:
len(processed_sentences)

88

# Bag of Words/Count Vectorization Creation!

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
vectorizer = CountVectorizer()

In [26]:
vectorized_sentences = vectorizer.fit_transform(processed_sentences)

In [41]:

print(vectorized_sentences)

  (0, 377)	2
  (0, 318)	3
  (0, 443)	1
  (0, 383)	1
  (0, 302)	1
  (0, 547)	1
  (0, 336)	1
  (0, 124)	3
  (0, 499)	1
  (0, 69)	1
  (0, 298)	1
  (0, 127)	1
  (0, 301)	1
  (0, 274)	1
  (0, 410)	1
  (0, 445)	1
  (0, 442)	1
  (0, 53)	1
  (0, 319)	1
  (0, 51)	1
  (0, 151)	1
  (1, 318)	1
  (1, 124)	1
  (1, 248)	1
  (1, 91)	1
  :	:
  (84, 345)	1
  (84, 436)	1
  (84, 129)	1
  (84, 26)	1
  (84, 34)	1
  (85, 383)	1
  (85, 63)	1
  (85, 33)	1
  (85, 111)	2
  (85, 278)	1
  (85, 465)	1
  (85, 487)	1
  (85, 212)	1
  (85, 389)	1
  (85, 43)	1
  (86, 383)	2
  (86, 380)	1
  (86, 368)	1
  (86, 111)	1
  (86, 278)	1
  (86, 331)	1
  (86, 291)	1
  (86, 374)	1
  (86, 461)	1
  (86, 213)	1


In [39]:
vectorized_sentences

<88x629 sparse matrix of type '<class 'numpy.int64'>'
	with 1227 stored elements in Compressed Sparse Row format>

In [28]:
6
vectorized_sentences.toarray().shape

(88, 629)

In [46]:
vectorized_sentences.toarray()[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [31]:
# Getting the unique features/words
len(vectorizer.get_feature_names())



629

In [32]:
import numpy as np

In [33]:
np.unique(vectorized_sentences.toarray()[0], return_counts = True)


(array([0, 1, 2, 3], dtype=int64), array([608,  18,   1,   2], dtype=int64))

In [34]:
# length bata rah hia neche wala
for i in range(len(vectorized_sentences.toarray()[0])):
    if vectorized_sentences.toarray()[0][i] == 3:
        print(i)
        
for i in range(len(vectorized_sentences.toarray()[0])):
    if vectorized_sentences.toarray()[0][i] == 2:
        print(i)

124
318
377


In [36]:
vectorizer.get_feature_names()[123], vectorizer.get_feature_names()[316], vectorizer.get_feature_names()[374]


('computational', 'knowledge', 'multimodal')

In [None]:
#sentiment analaysis kar sakte hai
