<a href="https://www.kaggle.com/code/aleksandrmorozov123/deep-learning-for-nlp?scriptVersionId=186726953" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Checking statistics of the Corpus**

In [None]:
# import required libraries
import pandas as pd

# Load only the first 10,000 rows so the notebook runs quickly.
# `nrows` stops parsing early instead of reading the whole CSV and slicing it
# afterwards, and it yields a standalone DataFrame (not a slice of a larger
# one), so later cells can add columns to it directly.
ratings = pd.read_csv(
    '/kaggle/input/massive-stock-news-analysis-db-for-nlpbacktests/raw_analyst_ratings.csv',
    nrows=10000,
)
ratings.info()

In [None]:
# Print the repr of (at most) the first 300 characters of two sample headlines
# so that whitespace and escape characters are visible when comparing them.
for row_position in (3399, 5487):
    headline = ratings.iloc[row_position]['headline']
    print(repr(headline[:300]))

In [None]:
# Split each headline into paragraphs wherever sentence-ending punctuation
# (., ? or !) is followed by optional whitespace and a line break.
# The matched punctuation and line break are consumed by the split.
import re
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
ratings["paragraphs"] = ratings["headline"].map(lambda text: re.split(r'[.?!]\s*\n', text))
# Number of pieces each headline was split into.
ratings['number_of_paragraphs'] = ratings["paragraphs"].map(len)

**Preparations**

In [None]:
# import required libraries
import sklearn
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): the stock-news headlines are presumably English, so the English
# stop-word list is used here (the German list would filter almost nothing).
from spacy.lang.en.stop_words import STOP_WORDS

# Build the TF-IDF document-term matrix over the complete headlines.
tfidf_text_vectorizer = TfidfVectorizer(stop_words=list(STOP_WORDS))
vectors_text = tfidf_text_vectorizer.fit_transform(ratings['headline'])
vectors_text.shape

In [None]:
# Flatten the per-headline paragraph lists into one row per non-empty
# paragraph, carrying along the publisher of the headline it came from.
paragraph_records = []
for paragraphs, publisher in zip(ratings['paragraphs'], ratings['publisher']):
    for paragraph in paragraphs:
        if paragraph:
            paragraph_records.append({'headline': paragraph, 'publisher': publisher})
paragraph_df = pd.DataFrame(paragraph_records)

# TF-IDF matrix at paragraph granularity, using the same stop-word list.
tfidf_para_vectorizer = TfidfVectorizer(stop_words=list(STOP_WORDS))
tfidf_para_vectors = tfidf_para_vectorizer.fit_transform(paragraph_df['headline'])
tfidf_para_vectors.shape

**Nonnegative matrix factorization** - $ V \approx W \cdot H $

In [None]:
# import required library
from sklearn.decomposition import NMF

# Factorize the headline TF-IDF matrix into 10 components (V ~ W * H), with a
# fixed random_state so repeated runs use the same seed.
nmf_text_model = NMF (n_components = 10, random_state = 42)
# W: the matrix returned by fit_transform for the input documents
W_text_matrix = nmf_text_model.fit_transform (vectors_text)
# H: the fitted components_, iterated below as one word-weight vector per topic
H_text_matrix = nmf_text_model.components_

# define a function for outputting a summary
def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted words of every topic of a fitted model.

    Args:
        model: fitted decomposition model exposing ``components_``; each row
            is treated as the word-weight vector of one topic.
        features: sequence mapping a column index of ``components_`` to its
            word (e.g. the vectorizer's ``get_feature_names_out()``).
        no_top_words: how many words to print per topic. If the vocabulary is
            smaller, all available words are printed.

    For each topic a header ``topic NN`` is printed, followed by one line per
    word with the word's weight as a percentage of the topic's total weight.
    """
    # Use the model that was passed in, not a particular global model.
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        largest = word_vector.argsort()[::-1]  # indices sorted by descending weight
        print("\ntopic %02d" % topic)
        # Slicing (instead of range-indexing) cannot run past the vocabulary.
        for idx in largest[:no_top_words]:
            print("  %s (%2.2f)" % (features[idx],
                                    word_vector[idx] * 100.0 / total))
            
# Print the top words per topic for the headline-level NMF model, labelling
# columns with the vocabulary of the vectorizer that produced vectors_text.
display_topics (nmf_text_model, tfidf_text_vectorizer.get_feature_names_out())

In [None]:
# Share of each topic in the total weight of W, in percent:
# column sums of W divided by the sum of all entries of W.
W_text_matrix.sum (axis=0)/W_text_matrix.sum()*100.0

**Create a topic model for paragraphs using NMF**

In [None]:
# Same 10-component NMF as for the headlines, but fitted on the
# paragraph-level TF-IDF matrix (same random_state).
nmf_para_model = NMF (n_components = 10, random_state = 42)
W_para_matrix = nmf_para_model.fit_transform (tfidf_para_vectors)
H_para_matrix = nmf_para_model.components_

# Summarize the paragraph topics using the paragraph vectorizer's vocabulary.
display_topics (nmf_para_model, tfidf_para_vectorizer.get_feature_names_out ())

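**Latent semantic analysis with singular value decomposition** - any $ m \times n $ matrix $V$ can be decomposed as follows:
$V = U \cdot \Sigma \cdot Z^* $, where $U$ and $Z$ are unitary matrices and $\Sigma$ is a diagonal matrix holding the singular values (the right factor is written $Z$ here to avoid reusing the symbol $V$).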

In [None]:
# import required module
from sklearn.decomposition import TruncatedSVD

# Truncated SVD with 10 components on the paragraph-level TF-IDF matrix.
svd_para_model = TruncatedSVD (n_components = 10, random_state = 42)
W_svd_para_matrix = svd_para_model.fit_transform (tfidf_para_vectors)
H_svd_para_matrix = svd_para_model.components_

# NOTE(review): SVD components may contain negative weights, so percentage
# figures derived from them are presumably harder to interpret than for NMF - confirm.
display_topics (svd_para_model, tfidf_para_vectorizer.get_feature_names_out ())

**Latent Dirichlet Allocation**

In [None]:
# import required modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# LDA is fitted on raw term counts, so the paragraphs are vectorized with a
# CountVectorizer (same stop-word list) instead of TF-IDF.
count_para_vectorizer = CountVectorizer(stop_words=list(STOP_WORDS))
count_para_vectors = count_para_vectorizer.fit_transform(paragraph_df['headline'])

# 10 topics with a fixed seed, matching the NMF and SVD models.
lda_para_model = LatentDirichletAllocation(n_components=10, random_state=42)
W_lda_para_matrix = lda_para_model.fit_transform(count_para_vectors)
H_lda_para_matrix = lda_para_model.components_

# Label the topics with the vocabulary of the vectorizer the model was
# actually fitted on (the count vectorizer, not the TF-IDF one).
display_topics(lda_para_model, count_para_vectorizer.get_feature_names_out())

**Create Word Clouds to display and compare topic models**

In [None]:
# import required libraries
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def wordcloud_topics(model, features, no_top_words=40):
    """Draw one word cloud per topic of a fitted topic model.

    Args:
        model: fitted model exposing ``components_``; each row is treated as
            the word-weight vector of one topic.
        features: sequence mapping a column index of ``components_`` to its word.
        no_top_words: number of highest-weighted words drawn per topic. If the
            vocabulary is smaller, all available words are used.

    A new 12x12 matplotlib figure is created for every topic.
    """
    for words in model.components_:
        largest = words.argsort()[::-1]  # indices sorted by descending weight
        # Word -> size; abs() keeps sizes non-negative for models whose
        # components can be negative. Slicing cannot run past the vocabulary.
        size = {features[idx]: abs(words[idx]) for idx in largest[:no_top_words]}
        wc = WordCloud(background_color="white", max_words=100,
                       width=960, height=540)
        wc.generate_from_frequencies(size)
        plt.figure(figsize=(12, 12))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        
# compare NMF and LDA model: each model is paired with the vocabulary of the
# vectorizer it was fitted on (TF-IDF for NMF, raw counts for LDA)
wordcloud_topics (nmf_para_model, tfidf_para_vectorizer.get_feature_names_out())
wordcloud_topics (lda_para_model, count_para_vectorizer.get_feature_names_out ())

**Building a neural network using PyTorch**

In [None]:
# import required libraries
import torch
import torch.nn as nn

# Toy regression data: four samples with two features each and one target each.
x = [[2, 5], [7, 9], [4, 8], [6, 9]]
y = [[4], [9], [12], [17]]

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build float32 tensors and place them on the selected device.
X = torch.tensor(x, dtype=torch.float32).to(device)
Y = torch.tensor(y, dtype=torch.float32).to(device)

class MyNeuralNet(nn.Module):
    """Small fully connected network: Linear(2 -> 8), ReLU, Linear(8 -> 1)."""

    def __init__(self):
        super(MyNeuralNet, self).__init__()
        self.input_to_hidden_layer = nn.Linear(in_features=2, out_features=8)
        self.hidden_layer_activation = nn.ReLU()
        self.hidden_to_output_layer = nn.Linear(in_features=8, out_features=1)

    def forward(self, x):
        """Apply the hidden layer, its activation and the output layer to ``x``."""
        hidden = self.hidden_layer_activation(self.input_to_hidden_layer(x))
        return self.hidden_to_output_layer(hidden)
    
# Instantiate the network on the same device as the data.
mynet = MyNeuralNet().to(device)
# Mean squared error between predictions and targets.
loss_func = nn.MSELoss()

# Forward pass with the untrained network and the resulting loss value.
_Y = mynet(X)
loss_value = loss_func (_Y, Y)
print (loss_value)

In [None]:
from torch.optim import SGD
import matplotlib.pyplot as plt
%matplotlib inline

# Plain stochastic gradient descent over all network parameters.
opt = SGD (mynet.parameters(), lr = 0.001)

# Train for 50 iterations on the full data set, recording the loss each time.
loss_history = []
for _ in range(50):
    opt.zero_grad()  # clear gradients left over from the previous iteration
    loss_value = loss_func (mynet (X), Y)
    loss_value.backward ()  # compute gradients of the loss
    opt.step ()  # update the parameters using those gradients
    loss_history.append (loss_value.item())  # store the loss as a Python float
    
# Plot the recorded loss values against the iteration index.
plt.plot(loss_history)
plt.title ('Loss variation over increasing epochs')
plt.xlabel ('epochs')
plt.ylabel ('loss value')

**Resnet block architecture**

In [None]:
import torch
from torch import nn

class ResLayer(nn.Module):
    """Convolution + ReLU block with an identity skip connection.

    ``forward`` returns ``conv(x) + x``, so the convolution output must have
    the same shape as its input: use ``ni == no``, ``stride == 1`` and an odd
    ``kernel_size``.

    Args:
        ni: number of input channels.
        no: number of output channels.
        kernel_size: size of the (square) convolution kernel.
        stride: convolution stride (default 1).
    """

    def __init__(self, ni, no, kernel_size, stride=1):
        super(ResLayer, self).__init__()
        # "Same" padding for odd kernels: kernel_size // 2 keeps the spatial
        # size at stride 1 for any odd kernel (it equals 1 for kernel_size 3).
        padding = kernel_size // 2
        # Must be stored on self: forward() reads self.conv, and registering
        # the submodule is what exposes its parameters to optimizers.
        self.conv = nn.Sequential(
            nn.Conv2d(ni, no, kernel_size, stride,
                      padding=padding),
            nn.ReLU())

    def forward(self, x):
        """Return the convolved-and-activated input plus the input itself."""
        return self.conv(x) + x

In [None]:
import torchvision
import torch.nn as nn
import torch
import torch.nn.functional as F
from torchvision import transforms,models,datasets
# The pip package is named torch_summary; it is imported as torchsummary.
# NOTE(review): the install is unpinned and runs after the other imports -
# consider a pinned `%pip install` in its own cell at the top of the notebook.
!pip install torch_summary
from torchsummary import summary
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load VGG16 with pretrained weights and move it to the selected device.
# NOTE(review): `pretrained=True` is presumably deprecated in newer torchvision
# releases in favour of a `weights=` argument - confirm against the installed version.
model = models.vgg16(pretrained=True).to(device)
# Summarize the model using a dummy input of shape (1, 3, 224, 224).
summary(model, torch.zeros(1,3,224,224))

In [None]:
# Display the repr of the loaded network as the cell output.
model

**RNN with TensorFlow**

In [5]:
import tensorflow as tf
import numpy as np
import re

def download_and_read(urls):
    """Download text files and return their cleaned contents as a flat list of characters.

    Each URL is fetched with ``tf.keras.utils.get_file`` under the file name
    ``ex1-<index>.txt`` (with ``cache_dir="."``). The byte order mark is
    removed and every run of whitespace, including line breaks, is collapsed
    into a single space.

    Args:
        urls: iterable of URLs pointing to plain-text files.

    Returns:
        A list of single-character strings: the cleaned texts of all URLs, in
        order, concatenated character by character (``list.extend`` with a
        string appends its individual characters).
    """
    texts = []
    for i, url in enumerate(urls):
        p = tf.keras.utils.get_file("ex1-{:d}.txt".format(i), url, cache_dir=".")
        # Close the file deterministically and decode explicitly as UTF-8
        # instead of relying on the platform's default encoding.
        with open(p, "r", encoding="utf-8") as f:
            text = f.read()
        # remove byte order mark
        text = text.replace("\ufeff", "")
        # collapse all whitespace runs (this also covers new lines)
        text = re.sub(r'\s+', " ", text)
        # add the characters of this text to the list
        texts.extend(text)
    return texts

# Download both books over HTTPS (consistent scheme for both URLs) and read
# them into a single flat list of characters.
texts = download_and_read(["https://www.gutenberg.org/cache/epub/28885/pg28885.txt",
                           "https://www.gutenberg.org/files/12/12-0.txt"])

Downloading data from https://www.gutenberg.org/files/12/12-0.txt
