In [2]:
import nltk
import pandas as pd
import torch

from gensim import models
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import torchtext.vocab as vocab
import gensim.downloader as api



In [3]:
nltk.download('punkt') #Descargamos el tokenizador llamado punkt
nltk.download("stopwords") # Descargamos las stop words
nltk.download('snowball_data') # Algoritmo Porter para stemming version 2
nltk.download('wordnet') # Necesaria para la Lematization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package snowball_data to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
text_example ="""
Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.
It is primarily concerned with giving computers the ability to support and manipulate speech.
It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches.
The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.
The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves."""
text_example

'\nNatural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.\nIt is primarily concerned with giving computers the ability to support and manipulate speech.\nIt involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches.\nThe goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.\nThe technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.'

## Tokenizacion

In [5]:
sentences_lst = sent_tokenize(text_example)
sentences_lst

['\nNatural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.',
 'It is primarily concerned with giving computers the ability to support and manipulate speech.',
 'It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e.',
 'statistical and, most recently, neural network-based) machine learning approaches.',
 'The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.',
 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.']

In [6]:
words_lst = word_tokenize(text_example)
words_lst

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'an',
 'interdisciplinary',
 'subfield',
 'of',
 'computer',
 'science',
 'and',
 'linguistics',
 '.',
 'It',
 'is',
 'primarily',
 'concerned',
 'with',
 'giving',
 'computers',
 'the',
 'ability',
 'to',
 'support',
 'and',
 'manipulate',
 'speech',
 '.',
 'It',
 'involves',
 'processing',
 'natural',
 'language',
 'datasets',
 ',',
 'such',
 'as',
 'text',
 'corpora',
 'or',
 'speech',
 'corpora',
 ',',
 'using',
 'either',
 'rule-based',
 'or',
 'probabilistic',
 '(',
 'i.e',
 '.',
 'statistical',
 'and',
 ',',
 'most',
 'recently',
 ',',
 'neural',
 'network-based',
 ')',
 'machine',
 'learning',
 'approaches',
 '.',
 'The',
 'goal',
 'is',
 'a',
 'computer',
 'capable',
 'of',
 '``',
 'understanding',
 "''",
 'the',
 'contents',
 'of',
 'documents',
 ',',
 'including',
 'the',
 'contextual',
 'nuances',
 'of',
 'the',
 'language',
 'within',
 'them',
 '.',
 'The',
 'technology',
 'can',
 'then',
 'accurately',
 '

## Stop words

In [7]:
stop_words_english = set(stopwords.words("english"))
print(f"Cantidad de stop words en ingles: {len(stop_words_english)}")
print(f"Algunos ejemplos: {list(stop_words_english)[:20]}")

Cantidad de stop words en ingles: 179
Algunos ejemplos: ['am', 've', 'aren', 'than', "didn't", 'so', 'above', 'few', 'from', "you've", 'ma', 'under', 'over', "aren't", "weren't", 'out', 'by', 'y', "isn't", 'after']


In [8]:
stop_words_spanish = set(stopwords.words("spanish"))
print(f"Cantidad de stop words en español: {len(stop_words_spanish)}")
print(f"Algunos ejemplos: {list(stop_words_spanish)[:20]}")

Cantidad de stop words en español: 313
Algunos ejemplos: ['habría', 'estuviesen', 'habrían', 'fuerais', 'hubiésemos', 'tendríamos', 'tanto', 'fueseis', 'eso', 'estaríamos', 'o', 'erais', 'fuéramos', 'con', 'sin', 'hayan', 'porque', 'tenida', 'tuyo', 'otro']


In [9]:
filtered_list_eng = []
for word in words_lst:
  # casefold es una manera de trabajar indistintamente mayusculas y minusculas
  # ya que las stop words están todas en minúscula
   if word.casefold() not in stop_words_english:
        filtered_list_eng.append(word)

print(f"Cantidad de palabras iniciales: {len(words_lst)}")
print(words_lst)
print('\n')
print(f"Cantidad de palabras luego de remover stop words: {len(filtered_list_eng)}")
print(filtered_list_eng)

Cantidad de palabras iniciales: 115
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'an', 'interdisciplinary', 'subfield', 'of', 'computer', 'science', 'and', 'linguistics', '.', 'It', 'is', 'primarily', 'concerned', 'with', 'giving', 'computers', 'the', 'ability', 'to', 'support', 'and', 'manipulate', 'speech', '.', 'It', 'involves', 'processing', 'natural', 'language', 'datasets', ',', 'such', 'as', 'text', 'corpora', 'or', 'speech', 'corpora', ',', 'using', 'either', 'rule-based', 'or', 'probabilistic', '(', 'i.e', '.', 'statistical', 'and', ',', 'most', 'recently', ',', 'neural', 'network-based', ')', 'machine', 'learning', 'approaches', '.', 'The', 'goal', 'is', 'a', 'computer', 'capable', 'of', '``', 'understanding', "''", 'the', 'contents', 'of', 'documents', ',', 'including', 'the', 'contextual', 'nuances', 'of', 'the', 'language', 'within', 'them', '.', 'The', 'technology', 'can', 'then', 'accurately', 'extract', 'information', 'and', 'insights', 'contained', 'in'

**Ejercicio:** Hacer el mismo proceso partiendo de un texto ejemplo en español.

## Stemming

In [10]:
stemmer = PorterStemmer()

In [12]:
words_lst[:10]

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'an',
 'interdisciplinary',
 'subfield']

In [13]:
stemmed_words = [stemmer.stem(word) for word in words_lst]
stemmed_words[:15]

['natur',
 'languag',
 'process',
 '(',
 'nlp',
 ')',
 'is',
 'an',
 'interdisciplinari',
 'subfield',
 'of',
 'comput',
 'scienc',
 'and',
 'linguist']

In [14]:
examples_words = ['Discovery'	,'discovered'	,'discoveries'	,'Discovering']
#examples_words = ['play'	,'played'	,'plays'	,'playing']
for word in examples_words:
  print(f"Original word: {word}")
  print(f"Stemmed word: {stemmer.stem(word)}")
  print('--------')


Original word: Discovery
Stemmed word: discoveri
--------
Original word: discovered
Stemmed word: discov
--------
Original word: discoveries
Stemmed word: discoveri
--------
Original word: Discovering
Stemmed word: discov
--------


Veamos si hay algún cambio en el resultado cambiando el algoritmo de stemming...

In [15]:
stemmer = SnowballStemmer("english")
examples_words = ['Discovery'	,'discovered'	,'discoveries'	,'Discovering']
for word in examples_words:
  print(f"Original word: {word}")
  print(f"Stemmed word: {stemmer.stem(word)}")
  print('--------')


Original word: Discovery
Stemmed word: discoveri
--------
Original word: discovered
Stemmed word: discov
--------
Original word: discoveries
Stemmed word: discoveri
--------
Original word: Discovering
Stemmed word: discov
--------


## Lemmatizing

In [16]:
lemmatizer = WordNetLemmatizer()
examples_words = ['discovery'	,'discovered'	,'discoveries'	,'discovering']
for word in examples_words:
  print(f"Original word: {word}")
  print(f"Stemmed word: {lemmatizer.lemmatize(word)}")
  print('--------')

Original word: discovery
Stemmed word: discovery
--------
Original word: discovered
Stemmed word: discovered
--------
Original word: discoveries
Stemmed word: discovery
--------
Original word: discovering
Stemmed word: discovering
--------


In [17]:
lemmatizer.lemmatize("worst") #Por defecto toma considera "nouns"

'worst'

El parametro `pos` es por defecto `'n'` donde `n` está asociado a `noun`. Sin embargo, para asegurarnos que `"worst"` sea tratado como un adjetivo  necesitamos cambiar el parámetro por defecto a `pos="a"`.

In [18]:
lemmatizer.lemmatize("worst", pos="a")

'bad'

## Bag of words

In [19]:
new_text_example = """
In 2003, word n-gram model, at the time the best statistical algorithm, was overperformed by a multi-layer perceptron (with a single hidden layer and context length of several words trained on up to 14 million of words with a CPU cluster in language modelling) by Yoshua Bengio with co-authors.[8]

In 2010, Tomáš Mikolov (then a PhD student at Brno University of Technology) with co-authors applied a simple recurrent neural network with a single hidden layer to language modelling,=[9] and in the following years he went on to develop Word2vec. In the 2010s, representation learning and deep neural network-style (featuring many hidden layers) machine learning methods became widespread in natural language processing."""

In [20]:
sentences = sent_tokenize(new_text_example)
sentences

['\nIn 2003, word n-gram model, at the time the best statistical algorithm, was overperformed by a multi-layer perceptron (with a single hidden layer and context length of several words trained on up to 14 million of words with a CPU cluster in language modelling) by Yoshua Bengio with co-authors.',
 '[8]\n\nIn 2010, Tomáš Mikolov (then a PhD student at Brno University of Technology) with co-authors applied a simple recurrent neural network with a single hidden layer to language modelling,=[9] and in the following years he went on to develop Word2vec.',
 'In the 2010s, representation learning and deep neural network-style (featuring many hidden layers) machine learning methods became widespread in natural language processing.']

In [21]:
len(sentences)

3

In [22]:
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(sentences)
X = X.toarray()
X

array([[1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 2, 1, 1, 1, 1, 0, 0, 0, 0,
        1, 0, 1, 2, 1, 2, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 2, 0, 1, 1, 0, 1, 0, 1, 1,
        0, 0, 3, 1, 0, 2, 0, 1],
       [0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 1, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
        0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 2, 1, 0, 1, 0, 0,
        1, 0, 2, 0, 1, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 2, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0]])

In [23]:
X.shape

(3, 74)

In [24]:
count_vectorizer.vocabulary_.keys()


dict_keys(['in', '2003', 'word', 'gram', 'model', 'at', 'the', 'time', 'best', 'statistical', 'algorithm', 'was', 'overperformed', 'by', 'multi', 'layer', 'perceptron', 'with', 'single', 'hidden', 'and', 'context', 'length', 'of', 'several', 'words', 'trained', 'on', 'up', 'to', '14', 'million', 'cpu', 'cluster', 'language', 'modelling', 'yoshua', 'bengio', 'co', 'authors', '2010', 'tomáš', 'mikolov', 'then', 'phd', 'student', 'brno', 'university', 'technology', 'applied', 'simple', 'recurrent', 'neural', 'network', 'following', 'years', 'he', 'went', 'develop', 'word2vec', '2010s', 'representation', 'learning', 'deep', 'style', 'featuring', 'many', 'layers', 'machine', 'methods', 'became', 'widespread', 'natural', 'processing'])

In [25]:
count_vectorizer2 = CountVectorizer(ngram_range=(2,2))
X2 = count_vectorizer2.fit_transform(sentences)
X2 = X2.toarray()
X2

array([[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
        0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0,

In [26]:
X2.shape

(3, 98)

In [27]:
sorted(count_vectorizer2.vocabulary_.keys())


['14 million',
 '2003 word',
 '2010 tomáš',
 '2010s representation',
 'algorithm was',
 'and context',
 'and deep',
 'and in',
 'applied simple',
 'at brno',
 'at the',
 'authors applied',
 'became widespread',
 'bengio with',
 'best statistical',
 'brno university',
 'by multi',
 'by yoshua',
 'cluster in',
 'co authors',
 'context length',
 'cpu cluster',
 'deep neural',
 'develop word2vec',
 'featuring many',
 'following years',
 'gram model',
 'he went',
 'hidden layer',
 'hidden layers',
 'in 2003',
 'in 2010',
 'in language',
 'in natural',
 'in the',
 'language modelling',
 'language processing',
 'layer and',
 'layer perceptron',
 'layer to',
 'layers machine',
 'learning and',
 'learning methods',
 'length of',
 'machine learning',
 'many hidden',
 'methods became',
 'mikolov then',
 'million of',
 'model at',
 'modelling and',
 'modelling by',
 'multi layer',
 'natural language',
 'network style',
 'network with',
 'neural network',
 'of several',
 'of technology',
 'of words

## TF- IDF

La intuición detrás de esto es que cuanto más común es una palabra en todos los documentos, menor es su importancia para el documento actual.

In [28]:
sentences

['\nIn 2003, word n-gram model, at the time the best statistical algorithm, was overperformed by a multi-layer perceptron (with a single hidden layer and context length of several words trained on up to 14 million of words with a CPU cluster in language modelling) by Yoshua Bengio with co-authors.',
 '[8]\n\nIn 2010, Tomáš Mikolov (then a PhD student at Brno University of Technology) with co-authors applied a simple recurrent neural network with a single hidden layer to language modelling,=[9] and in the following years he went on to develop Word2vec.',
 'In the 2010s, representation learning and deep neural network-style (featuring many hidden layers) machine learning methods became widespread in natural language processing.']

In [35]:
tfidf = TfidfVectorizer()
transformed = tfidf.fit_transform(sentences)

In [38]:
transformed[0]

<1x74 sparse matrix of type '<class 'numpy.float64'>'
	with 40 stored elements in Compressed Sparse Row format>

In [30]:
df = pd.DataFrame(transformed[0].T.todense(),
    	index=tfidf.get_feature_names_out(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
df.shape

(74, 1)

In [34]:
df.sort_values(by=['TF-IDF'], ascending=False)

Unnamed: 0,TF-IDF
with,0.326879
words,0.286538
by,0.286538
layer,0.217920
of,0.217920
...,...
tomáš,0.000000
university,0.000000
word2vec,0.000000
he,0.000000


## Embeddings

Algunos links interesantes:

- https://ronxin.github.io/wevi/

### Word2Vec

In [None]:
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [None]:
w2v = models.KeyedVectors.load_word2vec_format('/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', binary=True)

In [None]:
vect = w2v['worst']
print(vect.shape)
print(vect)

(300,)
[ 1.66015625e-01 -8.20312500e-02  4.51171875e-01  1.22558594e-01
  1.34765625e-01  1.33789062e-01 -3.88671875e-01 -3.66210938e-02
  5.15625000e-01  1.04003906e-01  1.82617188e-01 -1.66015625e-01
  2.00195312e-01  2.12890625e-01 -1.90429688e-02  8.74023438e-02
 -7.75146484e-03 -2.87109375e-01 -1.54296875e-01 -3.86718750e-01
  2.15820312e-01  2.04101562e-01 -3.14453125e-01 -4.00390625e-01
  2.57812500e-01  3.17382812e-02  5.73730469e-03  1.16699219e-01
  2.20703125e-01  1.01074219e-01 -1.53320312e-01 -3.83300781e-02
  1.38549805e-02  9.22851562e-02 -1.68457031e-02  2.27539062e-01
  1.07421875e-01 -2.43164062e-01 -9.81445312e-02  3.49609375e-01
  1.97265625e-01 -1.80664062e-01 -4.27246094e-02 -2.09960938e-01
  1.14257812e-01  4.37500000e-01  1.00585938e-01  5.22460938e-02
 -1.41601562e-02  1.16210938e-01  1.32446289e-02  2.73437500e-01
 -1.41906738e-03  3.30078125e-01 -8.34960938e-02 -2.49023438e-01
 -1.33789062e-01 -1.84570312e-01  8.59375000e-02  2.48046875e-01
 -1.74804688e-01  

In [None]:
w2v.most_similar('worst')


[('Worst', 0.6146091818809509),
 ('weakest', 0.6143776774406433),
 ('scariest', 0.5957257747650146),
 ('ugliest', 0.5931181311607361),
 ('best', 0.5835111141204834),
 ('bleakest', 0.5718506574630737),
 ('strongest', 0.567145586013794),
 ('nastiest', 0.5644308924674988),
 ('lousiest', 0.563145101070404),
 ('toughest', 0.5624396204948425)]

In [None]:
sentences = [sent for sent in sentences]
custom_model = models.Word2Vec(sentences, min_count=1,workers=4)
custom_model




<gensim.models.word2vec.Word2Vec at 0x79540dfad990>

In [None]:

# Supongamos que tienes una lista de oraciones donde cada oración es una lista de palabras.
sentences = [["esto", "es", "una", "oración"], ["esto", "es", "otra", "oración"]]

# Entrenar un modelo Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)

In [None]:
# Obtener el embedding de una palabra
embedding = model.wv["esto"]
embedding

array([ 9.4563962e-05,  3.0773198e-03, -6.8126451e-03, -1.3754654e-03,
        7.6685809e-03,  7.3464094e-03, -3.6732971e-03,  2.6427018e-03,
       -8.3171297e-03,  6.2054861e-03, -4.6373224e-03, -3.1641065e-03,
        9.3113566e-03,  8.7338570e-04,  7.4907029e-03, -6.0740625e-03,
        5.1605068e-03,  9.9228229e-03, -8.4573915e-03, -5.1356913e-03,
       -7.0648370e-03, -4.8626517e-03, -3.7785638e-03, -8.5361991e-03,
        7.9556061e-03, -4.8439382e-03,  8.4236134e-03,  5.2625705e-03,
       -6.5500261e-03,  3.9578713e-03,  5.4701497e-03, -7.4265362e-03,
       -7.4057197e-03, -2.4752307e-03, -8.6257253e-03, -1.5815723e-03,
       -4.0343284e-04,  3.2996845e-03,  1.4418805e-03, -8.8142155e-04,
       -5.5940580e-03,  1.7303658e-03, -8.9737179e-04,  6.7936908e-03,
        3.9735902e-03,  4.5294715e-03,  1.4343059e-03, -2.6998555e-03,
       -4.3668128e-03, -1.0320747e-03,  1.4370275e-03, -2.6460087e-03,
       -7.0737829e-03, -7.8053069e-03, -9.1217868e-03, -5.9351693e-03,
      

### Glove

Algunos links útiles:


- https://notebook.community/spro/practical-pytorch/glove-word-vectors/glove-word-vectors

In [None]:
glove = vocab.GloVe(name='6B', dim=100)

print('Loaded {} words'.format(len(glove.itos)))

.vector_cache/glove.6B.zip: 862MB [21:41, 663kB/s]                            
100%|█████████▉| 399999/400000 [00:24<00:00, 16058.96it/s]


Loaded 400000 words


In [None]:
def get_word(word):
    return glove.vectors[glove.stoi[word]]

In [None]:
def closest(vec, n=10):
    """
    Find the closest words for a given vector
    """
    all_dists = [(w, torch.dist(vec, get_word(w))) for w in glove.itos]
    return sorted(all_dists, key=lambda t: t[1])[:n]

In [None]:
def print_tuples(tuples):
    for tuple in tuples:
        print('(%.4f) %s' % (tuple[1], tuple[0]))

In [39]:
get_word('google')

NameError: ignored

In [None]:
print_tuples(closest(get_word('google')))


(0.0000) google
(3.0772) yahoo
(3.8836) microsoft
(4.1048) web
(4.1082) aol
(4.1165) facebook
(4.3917) ebay
(4.4122) msn
(4.4540) internet
(4.4651) netscape


In [None]:
print_tuples(closest(get_word('science')))


(0.0000) science
(3.9004) sciences
(4.0609) physics
(4.1800) mathematics
(4.2983) studies
(4.3231) institute
(4.4395) biology
(4.4422) psychology
(4.4600) research
(4.4724) scientific
