<a href="https://colab.research.google.com/github/FahimS45/Python_mini_projects/blob/master/DataTextAnalyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Analysing Entropy**

In [3]:
# Importing all necessary libraries

import numpy as np
import plotly.graph_objects as go
import requests
import re
import string


In [4]:
# Getting raw text from internet
# A timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails loudly on an HTTP error instead of letting the
# rest of the notebook analyse an error page.

book = requests.get('https://www.gutenberg.org/files/35/35-0.txt', timeout=30)
book.raise_for_status()

# Extracting just the text
# NOTE(review): the recorded preview starts with 'ï»¿', which looks like a
# UTF-8 byte-order mark decoded as Latin-1. The cleaning cell matches on the
# same mis-decoded sequences ('â\x80\x9c', ...), so the decoding is
# deliberately left unchanged here.

text = book.text
print(len(text))
print(text[:10])



204357
ï»¿The Pro


In [5]:
# Special character sequences to strip from the text.
# The three-character 'â\x80\x..' entries are the bytes of UTF-8 punctuation
# as they appear when read as Latin-1 (E2 80 9C is U+201C, and so on).

spe_char = [
    '\r\n\r\nâ\x80\x9c',  # blank line followed by an opening double quote (new paragraph)
    'â\x80\x9c',          # opening double quote (U+201C)
    'â\x80\x9d',          # closing double quote (U+201D)
    '\r\n',               # line break
    'â\x80\x94',          # em dash (U+2014)
    'â\x80\x99',          # right single quote / apostrophe (U+2019)
    'â\x80\x98',          # left single quote (U+2018)
    '_',                  # underscore, used for stressing
]
print(spe_char)

['\r\n\r\nâ\x80\x9c', 'â\x80\x9c', 'â\x80\x9d', '\r\n', 'â\x80\x94', 'â\x80\x99', 'â\x80\x98', '_']


In [17]:
# Replacing special characters or above strings with a space using Regular Expression(re)
# Every entry is passed through re.escape() so it is always matched literally;
# without it, adding an entry that contains a regex metacharacter
# (e.g. '.', '*', '(') would mis-match or raise re.error.
# The entries are applied in list order, one substitution pass per entry.

for each in spe_char:
  text = re.sub(re.escape(each), ' ', text)

# Cleaned text

print(text[:200])

ï»¿The Project Gutenberg eBook of The Time Machine, by H. G. Wells  This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no re


In [7]:
# Distribution of Word Length

# Step 1: break the cleaned text into words.
# str.split() with no arguments splits on any run of whitespace.

words = text.split()

# Show the number of words, then the first fifty of them.
print(len(words), words[:50], sep='\n')

35811
['ï»¿The', 'Project', 'Gutenberg', 'eBook', 'of', 'The', 'Time', 'Machine,', 'by', 'H.', 'G.', 'Wells', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'United', 'States', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever.', 'You', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or', 're-use']


In [8]:
# Length of every word, kept as a float array (same dtype as before)

word_len = np.array([len(each_word) for each_word in words], dtype = float)

print(word_len)

# Plotting using plotly

# Histogram
word_len_hist_fig = go.Figure(data = go.Histogram(x = word_len))
word_len_hist_fig.update_layout(title = 'Word Length Analysis(Histogram)', xaxis_title = 'Each Word length', yaxis_title = 'Frequencies')
word_len_hist_fig.show()

# lineplot
# word_freq[k] is the number of words of length k.
# np.bincount sizes the result to fit the longest word, so a token of 40 or
# more characters (a URL, for instance) no longer raises IndexError the way
# the previous fixed 40-slot array did. minlength keeps at least 40 bins so
# the x-axis of the plot is never shorter than before.
word_freq = np.bincount(word_len.astype(int), minlength = 40).astype(float)

word_len_fig = go.Figure(data = go.Scatter(x = list(range(len(word_freq))), y = word_freq))
word_len_fig.update_layout(title = 'Word Length Analysis(Line Plot)', xaxis_title = 'Each Word length', yaxis_title = 'Frequencies')
word_len_fig.show()

[6. 7. 9. ... 5. 3. 7.]


In [9]:
# Letter Frequency
# Counts how often each of the 26 ASCII letters occurs in the text,
# ignoring case.

letters = string.ascii_lowercase
letters_len = len(letters)
letters_freq = np.zeros(letters_len)
print(letters_freq)

# Lower-case the text once, outside the loop: the result does not depend on
# the letter being counted, so recomputing it for every letter was wasted work.
text_lower = text.lower()
for n, letter in enumerate(letters):
  letters_freq[n] = text_lower.count(letter)
print(letters_freq)

# Plotting using plotly
# Bar Chart
letters_freq_fig = go.Figure(data = go.Bar(x = list(letters), y = letters_freq))
letters_freq_fig.update_layout(title = 'Letter Frequency', xaxis_title = 'Letters', yaxis_title = 'Frequency')
letters_freq_fig.show()



[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
[12752.  2171.  4049.  6868. 19781.  3749.  3512.  8841. 11312.   187.
  1231.  6658.  4420. 11012. 11157.  2867.   106.  8895.  9313. 15155.
  4328.  1435.  3524.   282.  3024.   108.]


In [10]:
# Letter Entropy

# Normalise the counts so they describe a probability distribution.
total_letters = sum(letters_freq)
letters_prob = letters_freq / total_letters
print(letters_prob)

# Sanity check: the probabilities should add up to (approximately) 1.
lett_prob = sum(letters_prob)
print(lett_prob)

# Shannon entropy in bits: H = - Σ (P(i) * log2(P(i)))
# Machine epsilon is added inside the logarithm so that a probability of
# exactly zero yields a finite value instead of -inf.
tiny = np.finfo(float).eps
log_prob = np.log2(letters_prob + tiny)
letter_entropy = -sum(letters_prob * log_prob)

# Bar chart of the probabilities, with the entropy shown in a centred title.
entropy_title = {
    'text' : 'Entropy = %0.3f bits' % letter_entropy,
    'x' : 0.5,
    'y' : 0.95,
    'xanchor' : 'center',
    'yanchor' : 'top',
}
lett_prob_fig = go.Figure(data = go.Bar(x = list(letters), y = letters_prob))
lett_prob_fig.update_layout(
    title = entropy_title,
    xaxis_title = 'Letters',
    yaxis_title = 'Probability',
)
lett_prob_fig.show()



[0.08135922 0.01385123 0.02583308 0.04381863 0.12620504 0.02391905
 0.02240696 0.05640659 0.07217185 0.00119308 0.00785392 0.0424788
 0.02820011 0.07025782 0.07118294 0.01829179 0.00067629 0.05675112
 0.05941801 0.09669063 0.02761314 0.00915546 0.02248352 0.00179919
 0.01929347 0.00068905]
0.9999999999999997


In [11]:
# Conditional (Sequence) Entropy

# Letter-pair count matrix: probmat[r, c] is the number of times letter r is
# immediately followed by letter c in the text (case-insensitive). Despite
# the name it holds raw counts, not probabilities.
probmat = np.zeros((letters_len, letters_len))

# Letter -> row/column position. A dict lookup replaces the repeated
# letters.index() scans and doubles as the "is this a letter?" test.
letter_pos = {letter: pos for pos, letter in enumerate(letters)}

for curr_char, next_char in zip(text, text[1:]):
  row = letter_pos.get(curr_char.lower())
  col = letter_pos.get(next_char.lower())

  # Proceed only if both the current and the next character are letters
  if row is not None and col is not None:
    probmat[row, col] += 1

# Plotting the matrix using plotly
# NOTE(review): per the Plotly Heatmap reference, zmax requires zmin to be
# set as well, so zmax alone did not cap the scale; confirm against the
# installed Plotly version. Counts are never negative, so zmin = 0.

matrix_fig = go.Figure(data = go.Heatmap(z = probmat, zmin = 0, zmax = 500))

matrix_fig.update_layout(
    xaxis = dict(title = 'Next Letter', tickmode = 'array', tickvals = list(range(letters_len)), ticktext = list(letters)),
    yaxis = dict(title = 'Current Letter', tickmode = 'array', tickvals = list(range(letters_len)), ticktext = list(letters)),
    width = 700,
    height = 700,)

matrix_fig.show()




In [12]:
# Computing Conditional Entropy
# For every current letter, the entropy (in bits) of the distribution of the
# letter that follows it, taken from the corresponding row of probmat.

cond_entr = np.zeros(letters_len)
eps = np.finfo(float).eps  # keeps log2 finite where a probability is exactly zero

for i in range(letters_len):
  row_counts = probmat[i,:]
  row_total = row_counts.sum()

  # A letter that is never followed by another letter has an all-zero row.
  # Normalising it would divide by zero and put NaN into the chart, so its
  # entropy is left at 0.
  if row_total == 0:
    continue

  prob = row_counts/row_total
  cond_entr[i] = -np.sum(prob * np.log2(prob + eps))

# Plotting Conditional Entropy using plotly
cond_entr_fig = go.Figure(data = go.Bar(x = list(letters), y = list(cond_entr)))
cond_entr_fig.update_layout(title =
 {'text' : 'Conditional Entropy per letter',
  'x' : 0.5, 'y' : 0.95,
  'xanchor' : 'center', 'yanchor' : 'top'},
                            xaxis_title = 'Letters',
                            yaxis_title = 'Entropy')
cond_entr_fig.show()

# **Analysing The Sentiment**

In [13]:
# Sentiment analysis using a pre-trained model
# Importing necessary libraries

import nltk
from textblob import TextBlob

# Sentence splitting needs NLTK's Punkt tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' package, older ones from 'punkt', so both are
# fetched. nltk.download() reports a failure and returns False rather than
# raising if a package is unavailable.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Tokenizing the whole text-data
sentences = TextBlob(text).sentences

# Sentiment Scoring: one polarity value per sentence
sentiment_scores = [each.sentiment.polarity for each in sentences]

# Aggregation: mean polarity over all sentences.
# With no sentences at all (empty text) the mean is undefined; fall back to
# 0.0, which is labelled "Neutral" below, instead of raising ZeroDivisionError.
if sentiment_scores:
    text_sentiment = sum(sentiment_scores) / len(sentiment_scores)
else:
    text_sentiment = 0.0

# Interpretation: the sign of the mean polarity decides the label
if text_sentiment > 0:
    sentiment_label = "Positive"
elif text_sentiment < 0:
    sentiment_label = "Negative"
else:
    sentiment_label = "Neutral"

print("Sentiment Score:", text_sentiment)
print("Text Sentiment Label:", sentiment_label)

Sentiment Score: 0.03453752666940809
Text Sentiment Label: Positive


# **Wordcloud**

In [15]:
# Creating a WordCloud

from wordcloud import WordCloud

# Build the cloud from the cleaned text using the library's default settings,
# then take its pixel array so plotly can display it as an image.
wordcloud = WordCloud().generate(text)
cloud_pixels = wordcloud.to_array()

fig = go.Figure(data = go.Image(z = cloud_pixels))

# Both axes are purely decorative for an image: hide grid, zero line and ticks.
hidden_axis = dict(showgrid = False, zeroline = False, showticklabels = False)

fig.update_layout(
    xaxis = dict(hidden_axis),
    yaxis = dict(hidden_axis),
    width = 900,
    height = 900,
)
fig.show()
