## Loading data and libraries
For this demo I take a few rows from a tweet sentiment dataset.

In [None]:
import pandas as pd
import tensorflow as tf
import nltk
import numpy as np
import gensim
import gensim.downloader as api
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
tqdm.pandas()

In [None]:
# Fetch only the NLTK resources this notebook uses (stop words, the punkt
# tokenizer models and WordNet) instead of the whole 'all' collection, which
# is slow to download and floods the cell output with log lines.
# 'punkt_tab' and 'omw-1.4' are what newer NLTK releases look up for the same
# functionality; nltk.download returns False for an id it cannot fetch.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
  nltk.download(resource, quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [None]:
# Load the tweet sentiment CSV into a DataFrame.
# NOTE(review): absolute path, presumably a Google Drive folder mounted in
# Colab; the cell fails anywhere that path does not exist.
df = pd.read_csv('/content/drive/My Drive/NLP/input/tweet_sentiment.csv')

In [None]:
# Preview the first two rows to check which columns are available.
df.head(2)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [None]:
# Randomly take 100 tweets. Rows with a missing text are dropped first because
# every later cell calls string methods on each entry, and the fixed
# random_state makes "Restart & Run All" pick the same tweets every time.
text = df['text'].dropna().sample(100, random_state=42)

## Stop Word Removal

In [None]:
# `stop_words` stays a list because the CountVectorizer / TfidfVectorizer
# cells pass it on unchanged; a set copy is used here for fast lookups.
stop_words = nltk.corpus.stopwords.words('english')
stop_word_set = set(stop_words)

# One list of remaining tokens per tweet. The NLTK stop-word list is
# lower-case, so each token is lower-cased for the comparison only; otherwise
# "The" or "I" would never be recognised as stop words. The original casing of
# the kept tokens is preserved.
cleaned_text = [
  [word for word in tweet.split() if word.lower() not in stop_word_set]
  for tweet in text
]

## Tokenization 

In [None]:
# NLTK tokenizers: the general-purpose word tokenizer and the tweet-aware one.
# The TweetTokenizer is built once instead of once per tweet.
tweet_tokenizer = nltk.tokenize.TweetTokenizer()

word_tokenized = [nltk.tokenize.word_tokenize(tweet) for tweet in text]
tweet_tokenized = [tweet_tokenizer.tokenize(tweet) for tweet in text]

# Keras tokenizer: learn the vocabulary, map every tweet to a sequence of word
# ids, then map the ids back to text to see what the tokenizer kept.
tensorflow_tokenizer = tf.keras.preprocessing.text.Tokenizer()
tensorflow_tokenizer.fit_on_texts(text)
sequence = tensorflow_tokenizer.texts_to_sequences(text)
tf_tokenized = tensorflow_tokenizer.sequences_to_texts(sequence)

## Stemming and Lemmatization

In [None]:
# Both tools work on single words, so every tweet is tokenized first and the
# processed tokens are joined back into one string per tweet. Passing the whole
# tweet as one "word" (as before) left the text effectively unprocessed.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.SnowballStemmer('english', ignore_stopwords=True)

lemmatized = []
stemmed = []

for tweet in text:
  tokens = nltk.tokenize.word_tokenize(tweet)
  # lemmatize() is called without a part-of-speech tag, i.e. with its default.
  lemmatized.append(' '.join(lemmatizer.lemmatize(token) for token in tokens))
  stemmed.append(' '.join(stemmer.stem(token) for token in tokens))

## Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag of words over single words (unigrams) only.
bow_uni_gram = CountVectorizer(stop_words=stop_words)
bow_uni_gram.fit(text)

# Bag of words over unigrams and bigrams together (n-gram range 1..2).
bow_bi_gram = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
bow_bi_gram.fit(text);

In [None]:
# Turn the tweets into count features with each fitted vectorizer:
# one row per tweet, one column per vocabulary entry.
bow_uni_text = bow_uni_gram.transform(text)
bow_bi_text = bow_bi_gram.transform(text)

## Tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Tf-idf over single words (unigrams) only.
tfidf_uni_gram = TfidfVectorizer(stop_words=stop_words)
tfidf_uni_gram.fit(text)

# Tf-idf over unigrams and bigrams together (n-gram range 1..2).
tfidf_bi_gram = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words)
tfidf_bi_gram.fit(text);

In [None]:
# Turn the tweets into tf-idf features with each fitted vectorizer:
# one row per tweet, one column per vocabulary entry.
tfidf_uni_text = tfidf_uni_gram.transform(text)
tfidf_bi_text = tfidf_bi_gram.transform(text)

## Word2Vec

In [None]:
''' For creating a word2vec we need a large dataset and we have only
 taken a few rows so we will create a word2vec using another dataset and use it
 for feature extraction , for the sake of knowing we will create word2vec using our own data as well'''

dataset = api.load('text8')

# There are several other parameters such as "sg": 0 for CBOW (the default) and 1 for skip-gram
# Details can be seen after uncommenting and running the next cell
# NOTE(review): `size` is the gensim 3.x name of the embedding-dimension
# argument; gensim 4 renamed it to `vector_size` - confirm the installed version.
word2vec_model = gensim.models.Word2Vec(sentences=dataset, size=100)



In [None]:
#?gensim.models.Word2Vec

In [None]:
# Creating the embeddings for the text:
# each tweet is represented by the sum of the word2vec vectors of its words.
word_vectors = word2vec_model.wv  # look words up on the KeyedVectors, not on the model object
embedding_dim = word_vectors.vector_size

# Sized from the data and the model instead of a hard-coded 100 x 100.
embedded_text = np.zeros((len(text), embedding_dim))

for index, value in enumerate(text):
  sent_vec = np.zeros(embedding_dim)

  # text8 contains lower-case words only, so tokens are lower-cased before the
  # lookup; otherwise every capitalised word would be out of vocabulary.
  for j in value.lower().split():
    # Out-of-vocabulary words contribute nothing (same as adding zeros).
    if j in word_vectors:
      sent_vec += word_vectors[j]

  embedded_text[index] = sent_vec

In [None]:
# Creating Word2Vec using our data.
# Word2Vec expects an iterable of token lists. Passing the raw strings (as
# before) makes gensim iterate over characters and learn character embeddings.
tokenized_tweets = [tweet.split() for tweet in text]

# Limiting the embedding dimension to 20. min_count=1 keeps every word: with
# only 100 tweets the default frequency cut-off would discard most of them.
our_word2vec_model = gensim.models.Word2Vec(sentences=tokenized_tweets, size=20, min_count=1)

In [None]:
# Show the sampled tweets (pandas prints a truncated view of the Series).
text

7829                                  that makes me sad...
17624    is having a jam session in her room and then s...
1787     Hey  wow cheers for the insight ppl  looks FUN...
16081     Actually, by the time i get there, the train ...
13797    Scratch that. Now we`re watching `marley and m...
                               ...                        
16876           Im @ the dentist  ....scary people here...
10102    On the airport in Philadelphia at the moment, ...
12294                                   Painting my room =
207        Grabbing coffee from  then making mom breakfast
19259    Happy Mother`s Day to all the moms! If you`re ...
Name: text, Length: 100, dtype: object

## Average_word2Vec

In [None]:
# Creating the average embeddings for the text:
# each tweet is the sum of its word vectors divided by its number of tokens.
word_vectors = word2vec_model.wv  # look words up on the KeyedVectors, not on the model object
embedding_dim = word_vectors.vector_size

# Sized from the data and the model instead of a hard-coded 100 x 100.
embedded_text = np.zeros((len(text), embedding_dim))

for index, value in enumerate(text):
  # text8 contains lower-case words only, so tokens are lower-cased first.
  tokens = value.lower().split()

  if not tokens:
    # Empty / whitespace-only tweet: keep the all-zero row instead of
    # dividing by zero.
    continue

  sent_vec = np.zeros(embedding_dim)

  for j in tokens:
    # Out-of-vocabulary words add nothing but still count in the denominator,
    # as in the original implementation.
    if j in word_vectors:
      sent_vec += word_vectors[j]

  embedded_text[index] = sent_vec / len(tokens)

## Tf-idf weighted Word2Vec

In [None]:
# Creating the tfidf weighted embeddings for the text:
# each word vector is scaled by the word's idf weight before summing.

# dict key: word, value: idf weight learned by the unigram TfidfVectorizer.
# Built from vocabulary_ (term -> column index) so it does not depend on
# get_feature_names(), which newer scikit-learn releases no longer provide.
word2tfidf = {
  term: tfidf_uni_gram.idf_[column]
  for term, column in tfidf_uni_gram.vocabulary_.items()
}

word_vectors = word2vec_model.wv  # look words up on the KeyedVectors, not on the model object
embedding_dim = word_vectors.vector_size

# Sized from the data and the model instead of a hard-coded 100 x 100.
embedded_text = np.zeros((len(text), embedding_dim))

for index, value in enumerate(text):
  sent_vec = np.zeros(embedding_dim)

  # Lower-cased so tokens match both the vectorizer's lower-cased vocabulary
  # and the lower-case text8 vocabulary.
  for j in value.lower().split():
    if j in word_vectors:
      # Words the vectorizer did not keep (e.g. stop words) get weight 0.
      sent_vec += word_vectors[j] * word2tfidf.get(j, 0)

  embedded_text[index] = sent_vec

## Glove Embeddings

In [None]:
# List the names of the pretrained models offered by gensim's downloader.
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [None]:
# For this we will use the 'glove-wiki-gigaword-100'
# (one of the model names listed by api.info() above), loaded through
# gensim's downloader API.

glove_model = api.load('glove-wiki-gigaword-100')



In [None]:
# Creating the embeddings for the text:
# each tweet is represented by the sum of the GloVe vectors of its words.
# api.load returns the word vectors themselves, so they are indexed directly
# rather than through the deprecated `.wv` alias.
embedding_dim = glove_model.vector_size

# Sized from the data and the model instead of a hard-coded 100 x 100.
embedded_text = np.zeros((len(text), embedding_dim))

for index, value in enumerate(text):
  sent_vec = np.zeros(embedding_dim)

  # The wiki-gigaword GloVe vocabulary is uncased, so tokens are lower-cased
  # before the lookup.
  for j in value.lower().split():
    # Out-of-vocabulary words contribute nothing (same as adding zeros).
    if j in glove_model:
      sent_vec += glove_model[j]

  embedded_text[index] = sent_vec

## Fast_text Embeddings

In [None]:
# For this we will use 'fasttext-wiki-news-subwords-300'
# (one of the model names listed by api.info() earlier), loaded through
# gensim's downloader API.

fast_text_model = api.load('fasttext-wiki-news-subwords-300')

In [None]:
# Creating the embeddings for the text:
# each tweet is represented by the sum of the fastText vectors of its words.
# api.load returns the word vectors themselves, so they are indexed directly
# rather than through the deprecated `.wv` alias.
embedding_dim = fast_text_model.vector_size

# Sized from the data and the model instead of a hard-coded 100 x 300.
embedded_text = np.zeros((len(text), embedding_dim))

for index, value in enumerate(text):
  sent_vec = np.zeros(embedding_dim)

  # Tokens keep their original casing for this model.
  for j in value.split():
    # Out-of-vocabulary words contribute nothing (same as adding zeros).
    if j in fast_text_model:
      sent_vec += fast_text_model[j]

  embedded_text[index] = sent_vec

## Word2Vec and Glove ensembled Embedding

In [None]:
# Load the ConceptNet Numberbatch vectors through gensim's downloader API.
ensembled_model = api.load('conceptnet-numberbatch-17-06-300') # this has embeddings for many languages



In [None]:
# Creating the embeddings for the text:
# each tweet is represented by the sum of the ConceptNet Numberbatch vectors
# of its words. api.load returns the word vectors themselves, so they are
# indexed directly rather than through the deprecated `.wv` alias.
embedding_dim = ensembled_model.vector_size

# Sized from the data and the model instead of a hard-coded 100 x 300.
embedded_text = np.zeros((len(text), embedding_dim))

for index, value in enumerate(text):
  sent_vec = np.zeros(embedding_dim)

  # Tokens are lower-cased because the English keys are stored in lower case.
  for j in value.lower().split():

    # The model is multilingual; English entries are keyed as '/c/en/<word>'.
    word = '/c/en/' + j

    # Out-of-vocabulary words contribute nothing (same as adding zeros).
    if word in ensembled_model:
      sent_vec += ensembled_model[word]

  embedded_text[index] = sent_vec

## Dynamic Embeddings

In [None]:
import tensorflow_hub as hub

In [None]:
# Wrap the ELMo module from TensorFlow Hub as a Keras layer
# (the module is fetched from the URL below).
elmo_model = hub.KerasLayer("https://tfhub.dev/google/elmo/2")

In [None]:
# Run the ELMo layer on the raw tweet strings; per the shape shown in the next
# cell this yields one 1024-dimensional vector per tweet.
elmo_embeddings = elmo_model(np.array(text))

In [None]:
# Check the shape of the result: (number of tweets, embedding size).
elmo_embeddings.shape

TensorShape([100, 1024])