In [83]:
#!/usr/bin/env python3
"""Task 0:  Bag of Words"""
import string
from sklearn.feature_extraction.text import CountVectorizer


def bag_of_words(sentences, vocab=None):
    """Creates a bag of words embedding matrix.

    Every sentence is lowercased, every "'s" substring is dropped and all
    characters in string.punctuation are removed before counting. Words
    are then the whitespace-separated tokens of the cleaned sentences.

    Args:
        sentences: list of sentences to analyze
        vocab: list of the vocabulary words to use for the analysis;
            when None, every distinct word found in sentences is used,
            sorted alphabetically
    Returns:
        embeddings, features
        embeddings: the embeddings matrix; one row per sentence and one
            column per feature, holding the occurrence counts
        features: the feature names, in column order (vocab itself when
            vocab is given)
    """
    translator = str.maketrans('', '', string.punctuation)

    # Normalise: lowercase, strip "'s", then remove punctuation.
    # NOTE: replace() drops every "'s" substring, not only possessives.
    corpus = [
        sentence.lower().replace("'s", '').translate(translator)
        for sentence in sentences
    ]

    if vocab is not None:
        # Caller-supplied vocabulary fixes both the columns and their order.
        features = vocab
    else:
        features = sorted({word for doc in corpus for word in doc.split()})

    # Count with the same whitespace tokenisation used to build `features`.
    # CountVectorizer's default token pattern skips one-character tokens
    # (e.g. "a", "i"), which left those feature columns permanently zero.
    cv = CountVectorizer(vocabulary=features, analyzer=str.split)
    embedding = cv.fit_transform(corpus).toarray()

    return embedding, features

In [84]:
# Demo: bag-of-words counts for eight sample sentences, restricted to a
# caller-supplied vocabulary.
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]
# Passing vocab fixes the returned feature list (and its order) to these words.
vocab = ["children", "is", "awesome", "cake", "are", "our", "future"]
E, F = bag_of_words(sentences, vocab)
print(E)  # embeddings matrix
print(F)  # feature names (the vocab passed in)

[[0 1 1 0 0 0 0]
 [0 1 1 0 0 0 0]
 [0 1 0 0 0 0 1]
 [1 0 0 0 1 1 1]
 [2 0 0 0 1 2 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 1 0 0 0 0 0]]
['children', 'is', 'awesome', 'cake', 'are', 'our', 'future']


In [8]:
#!/usr/bin/env python3
"""Task 1: TF-IDF"""
import string
from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf(sentences, vocab=None):
    """Creates a TF-IDF embedding matrix.

    Every sentence is lowercased, every "'s" substring is dropped and all
    characters in string.punctuation are removed before weighting. Words
    are then the whitespace-separated tokens of the cleaned sentences.

    Args:
        sentences: list of sentences to analyze
        vocab: list of the vocabulary words to use for the analysis;
            when None, every distinct word found in sentences is used,
            sorted alphabetically
    Returns:
        embeddings, features
        embeddings: the embeddings matrix; one row per sentence and one
            column per feature, holding the TF-IDF weights
        features: the feature names, in column order (vocab itself when
            vocab is given)
    """
    translator = str.maketrans('', '', string.punctuation)

    # Normalise: lowercase, strip "'s", then remove punctuation.
    # NOTE: replace() drops every "'s" substring, not only possessives.
    corpus = [
        sentence.lower().replace("'s", '').translate(translator)
        for sentence in sentences
    ]

    if vocab is not None:
        # Caller-supplied vocabulary fixes both the columns and their order.
        features = vocab
    else:
        features = sorted({word for doc in corpus for word in doc.split()})

    # Weight with the same whitespace tokenisation used to build `features`.
    # TfidfVectorizer's default token pattern skips one-character tokens
    # (e.g. "a", "i"), which left those feature columns permanently zero.
    cv = TfidfVectorizer(vocabulary=features, analyzer=str.split)
    embedding = cv.fit_transform(corpus).toarray()

    return embedding, features

In [9]:
# Demo: TF-IDF weights for the same eight sample sentences, restricted to a
# caller-supplied vocabulary.
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]
# "none" does not occur in any sentence above.
vocab = ["awesome", "learning", "children", "cake", "good", "none", "machine"]
E, F = tf_idf(sentences, vocab)
print(E)  # embeddings matrix
print(F)  # feature names (the vocab passed in)

[[1.         0.         0.         0.         0.         0.
  0.        ]
 [0.5098139  0.60831315 0.         0.         0.         0.
  0.60831315]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         1.         0.         0.         0.
  0.        ]
 [0.         0.         1.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.        ]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]]
['awesome', 'learning', 'children', 'cake', 'good', 'none', 'machine']


In [2]:
"""Task 2: Train Word2Vec"""
from gensim.models import Word2Vec


def word2vec_model(sentences, size=100, min_count=5, window=5, negative=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """Builds a gensim Word2Vec model from the given sentences
    Args:
        sentences: list of sentences to be trained on
        size: dimensionality of the embedding layer
        min_count: minimum number of occurrences of a word for use in training
        window: maximum distance between the current and predicted word
                within a sentence
        negative: size of negative sampling
        cbow: boolean selecting the training type; True means CBOW,
              False means Skip-gram
        iterations: number of iterations to train over
        seed: seed for the random number generator
        workers: number of worker threads to train the model
    Returns:
        the trained model"""

    # Word2Vec takes the training type as `sg`: 0 for CBOW, 1 for Skip-gram
    skip_gram_flag = 0 if cbow else 1

    return Word2Vec(
        sentences,
        sg=skip_gram_flag,
        vector_size=size,
        window=window,
        min_count=min_count,
        negative=negative,
        epochs=iterations,
        seed=seed,
        workers=workers,
    )


In [3]:
# Demo: train Word2Vec on gensim's bundled sample corpus and show one vector.
from gensim.test.utils import common_texts

print(common_texts[:2])  # first two tokenized documents
# min_count=1 overrides the default of 5 for the minimum word occurrences.
w2v = word2vec_model(common_texts, min_count=1)
print(w2v.wv["computer"])  # learned vector for the word "computer"

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
[-9.17425146e-04  4.23241127e-03  5.63164940e-03  6.88221911e-03
 -6.18189573e-03  3.55597492e-03 -4.59551578e-03 -2.62356992e-03
 -2.58884183e-03  1.51444075e-03  1.76495546e-03  1.26824854e-03
 -8.70202854e-03  8.73132143e-03  7.04515446e-03 -2.24651699e-03
  1.43263815e-03 -6.70434721e-03  2.69516581e-03  7.53865717e-03
  8.56675580e-03  7.89457001e-03 -8.89756717e-03 -9.03468858e-03
  4.73744608e-03 -6.03551976e-03 -6.21854421e-03  2.72567268e-03
 -8.80681351e-03  5.77690266e-03 -6.42151944e-03  2.13384978e-03
  2.60995259e-03 -6.16821647e-03 -1.97864044e-03 -7.64716882e-03
  9.61878430e-03  1.19452474e-04 -7.03770155e-03  6.31020777e-03
  4.79384791e-03 -5.65865776e-03 -3.22094793e-03 -8.49734619e-03
  1.13402959e-03  1.02089881e-03 -8.94187670e-03 -6.16365811e-03
 -9.08331887e-04 -8.65152571e-03  4.83665941e-03  5.46529191e-03
  4.19504056e-03  6.42453181e-03  6.02116482e-03 -2.0332

In [6]:
"""Task 3: Extract Word2Vec"""
from tensorflow.keras.layers import Embedding

def gensim_to_keras(model):
    """Converts a gensim word2vec model to a keras Embedding layer
    Args:
        model: trained gensim word2vec model
    Returns:
        the trainable keras Embedding"""

    # The learned vectors themselves: a 2D array whose two dimensions give
    # the layer's input_dim and output_dim.
    weights = model.wv.vectors
    vocab_size, embedding_dim = weights.shape

    # trainable=True: the documented contract is a *trainable* layer; the
    # gensim vectors only provide the initial weights.
    return Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[weights],
        trainable=True,
    )

In [7]:
# Demo: convert a freshly trained Word2Vec model into a keras Embedding layer.
# Uses common_texts and word2vec_model, neither of which is defined in this
# cell.
print(common_texts[:2])
w2v = word2vec_model(common_texts, min_count=1)
print(gensim_to_keras(w2v))

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
<keras.layers.core.embedding.Embedding object at 0x7fa50d14ef40>


In [10]:
"""Task 4: FastText"""
from gensim.models import FastText


def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """Builds a gensim FastText model from the given sentences
    Args:
        sentences: list of sentences to be trained on
        size: dimensionality of the embedding layer
        min_count: minimum number of occurrences of a word for use in training
        window: maximum distance between the current and predicted word
                within a sentence
        negative: size of negative sampling
        cbow: boolean selecting the training type; True means CBOW,
              False means Skip-gram
        iterations: number of iterations to train over
        seed: seed for the random number generator
        workers: number of worker threads to train the model
    Returns:
        the trained model"""

    # FastText takes the training type as `sg`: 0 for CBOW, 1 for Skip-gram
    skip_gram_flag = 0 if cbow else 1

    return FastText(
        sentences,
        sg=skip_gram_flag,
        vector_size=size,
        window=window,
        min_count=min_count,
        negative=negative,
        epochs=iterations,
        seed=seed,
        workers=workers,
    )

In [11]:
# Demo: train FastText on the same sample corpus and show one vector.
# Uses common_texts, which is not defined in this cell.
print(common_texts[:2])
ft = fasttext_model(common_texts, min_count=1)
print(ft.wv["computer"])  # learned vector for the word "computer"

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
[-4.4518875e-04  1.9057443e-04  7.1344204e-04  1.5088863e-04
  7.3785416e-04  2.0828047e-03 -1.4264339e-03 -6.6978252e-04
 -3.9446630e-04  6.1643129e-04  3.7035978e-04 -1.7527672e-03
  2.0829479e-05  1.0929988e-03 -6.6954875e-04  7.9767447e-04
 -9.0742309e-04  1.9187949e-03 -6.9725298e-04  3.7622583e-04
 -5.0849823e-05  1.6160590e-04 -8.3575735e-04 -1.4309353e-03
  1.8365250e-04 -1.1365860e-03 -2.1796341e-03  3.3816829e-04
 -1.0266158e-03  1.9360909e-03  9.3765622e-05 -1.2577525e-03
  1.7052694e-04 -1.0470246e-03  9.1582153e-04 -1.1945128e-03
  1.2874184e-03 -3.1551000e-04 -1.1084992e-03  2.2345960e-04
  5.9021922e-04 -5.7232735e-04  1.6017178e-04 -1.0333696e-03
 -2.6842864e-04 -1.2489735e-03 -3.4248878e-05  2.0717620e-03
  1.0997808e-03  4.9419136e-04 -4.3252495e-04  7.6816598e-04
  3.0231036e-04  6.4548600e-04  2.5580439e-03 -1.2883682e-04
 -3.8391326e-04 -2.1800243e-04  6.5950496e-04 -