<a href="https://colab.research.google.com/github/AnimeshKulshrestha/Text-Processing-on-textbook/blob/main/Contingent_zeroes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing Libraries and downloading required packages

In [None]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.probability import FreqDist
from wordcloud import WordCloud
from collections import defaultdict
import operator

# Fetch the NLTK data used later in this notebook: the 'punkt' tokenizer
# models (word_tokenize), the stop-word lists (stopwords.words) and the
# perceptron tagger model (nltk.pos_tag).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

#Defining necessary functions

###Defining a function for basic text preprocessing

In [11]:
def text_preprocessing(raw):
  """Normalise raw book text into lower-case words separated by single spaces.

  Steps, in order: lower-case the text, drop "chapter <word>" headings,
  drop digits, drop punctuation, collapse every run of whitespace into one
  space and trim both ends.

  Args:
    raw: the text to clean.

  Returns:
    The cleaned string.
  """
  raw = raw.lower()
  # Remove chapter headings while the chapter number is still in the text.
  # If the digits are stripped first, the pattern swallows the word that
  # follows the heading instead of the number.
  raw = re.sub(r'chapter\s+\w*', '', raw)
  # remove numbers
  raw = re.sub(r'\d+', '', raw)
  # remove punctuation (anything that is neither a word char nor whitespace)
  raw = re.sub(r'[^\w\s]', '', raw)
  # Only word characters and whitespace remain, so collapsing whitespace
  # runs is the last substitution needed.
  raw = re.sub(r'\s+', ' ', raw)
  # Trim last so that gaps left by the removals above never reach the caller.
  return raw.strip()

###Function to calculate frequency 

In [12]:
def freq(tokens):
  """Count how many times each token occurs.

  Args:
    tokens: iterable of hashable tokens.

  Returns:
    A plain dict mapping token -> occurrence count, keyed in first-seen order.
  """
  counts = {}
  for token in tokens:
    counts[token] = counts.get(token, 0) + 1
  return counts

###Function to plot frequency graph

In [13]:
def freq_graph(tokens, top_n=60):
  """Draw a bar chart of the most frequent tokens.

  Args:
    tokens: iterable of tokens to count.
    top_n: how many of the most common tokens to plot (default 60).

  Returns:
    The matplotlib Axes that holds the bar chart, so callers that store
    the result get something usable.
  """
  # Build the frequency distribution once and keep only the top entries.
  top_counts = pd.Series(dict(FreqDist(tokens).most_common(top_n)))

  _, ax = plt.subplots(figsize=(10, 10))
  sns.barplot(x=top_counts.index, y=top_counts.values, ax=ax)
  # Tokens are the x labels; rotate them so they do not overlap.
  plt.xticks(rotation=90)
  return ax

###Function to generate and display word cloud

In [14]:
def word_cloud(freq):
  """Build a word cloud from a word -> frequency mapping.

  The cloud is 1500x1000 on a white background and shows at most 1000 words.

  Returns:
    The generated WordCloud object.
  """
  cloud = WordCloud(
      width=1500,
      height=1000,
      background_color='white',
      max_words=1000,
  )
  # generate_from_frequencies returns the WordCloud instance itself.
  return cloud.generate_from_frequencies(freq)

In [15]:
def showCloud(cloud):
    """Show a generated word cloud on a new 10x10 figure with the axes hidden."""
    side = 10
    plt.figure(figsize=(side, side))
    plt.imshow(cloud)
    # The axes carry no meaning for an image of words.
    plt.axis('off')

###Function to remove stop words

In [16]:
def stopwords_text(text):
  """Tokenize text and drop English stop words and single ASCII letters.

  Args:
    text: the string to tokenize.

  Returns:
    List of the remaining tokens, in their original order.
  """
  excluded = set(nltk.corpus.stopwords.words('english'))
  # Every single letter A-Z and a-z is treated as a stop word as well.
  excluded.update(string.ascii_letters)
  return [token for token in word_tokenize(text) if token not in excluded]

###Function to count words

In [17]:
# Shared result table: word length -> number of words of that length.
words = {}
def word_counter(text):
  """Refill the module-level `words` dict with word-length counts for text.

  The dict is emptied first, so it only ever describes the most recent call.
  Words are the whitespace-separated pieces of text. Nothing is returned.
  """
  words.clear()
  for length in map(len, text.split()):
    words[length] = words.get(length, 0) + 1

#Loading and Frequency Analysis of a text book

###Opening a txt file

####Text T1

In [18]:
# Read the whole of text T1 into memory; the with-block closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('/content/Stinson-Paterson_CryptographyTheoryAndPractice.txt') as f:
  text = f.read()

####Text T2

In [19]:
# Read the whole of text T2 into memory; the with-block closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('/content/Software-Engineering-9th-Edition-by-Ian-Sommerville.txt') as f2:
  text2 = f2.read()

###Text Preprocessing

####for T1

In [20]:
# Cleaned version of T1, produced by text_preprocessing.
t1 = text_preprocessing(text)

####for T2

In [21]:
# Cleaned version of T2, produced by text_preprocessing.
t2 = text_preprocessing(text2)

###Tokenisation

####T1

In [None]:
# Split the cleaned T1 text into tokens (stop words are still included).
t1tokens = nltk.word_tokenize(t1)
# Bare name as the last line: the notebook displays the token list.
t1tokens

In [None]:
# Label and count go on separate lines, hence the newline separator.
print("No. of tokens in T1 :", len(t1tokens), sep="\n")

####T2

In [None]:
# Split the cleaned T2 text into tokens (stop words are still included).
t2tokens = nltk.word_tokenize(t2)
# Bare name as the last line: the notebook displays the token list.
t2tokens

In [None]:
# Label and count go on separate lines, hence the newline separator.
print("No. of tokens in T2 :", len(t2tokens), sep="\n")

###Frequency analysis

####Calculating frequencies

#####for T1

In [None]:
# Token -> count dict for every T1 token, stop words included.
t1freq = freq(t1tokens)
print(t1freq)

#####for T2

In [None]:
# Token -> count dict for every T2 token, stop words included.
t2freq = freq(t2tokens)
print(t2freq)

####Frequency graph of most frequent 60 words 

#####for T1

In [None]:
# Draw the bar chart of the 60 most common T1 tokens.
t1freq_graph = freq_graph(t1tokens)

#####for T2

In [None]:
# Draw the bar chart of the 60 most common T2 tokens.
t2freq_graph = freq_graph(t2tokens)

###Generating word cloud without removing stopwords

####T1

In [None]:
# Word cloud built from the T1 counts that still include stop words.
t1wordcloud = word_cloud(t1freq)
showCloud(t1wordcloud)

####T2

In [None]:
# Word cloud built from the T2 counts that still include stop words.
t2wordcloud = word_cloud(t2freq)
showCloud(t2wordcloud)

###Generating wordcloud without stopwords

####Removing stopwords

#####T1

In [32]:
# Tokens of the cleaned T1 text with stop words and single letters removed.
t1nostoptokens = stopwords_text(t1)

#####T2

In [33]:
# Tokens of the cleaned T2 text with stop words and single letters removed.
t2nostoptokens = stopwords_text(t2)

####Frequency graph of most frequent 60 words after removing stopwords

---



#####T1

In [None]:
# Token counts for T1 after stop-word removal, then the bar chart of the
# 60 most common remaining tokens.
t1nostopfreq = freq(t1nostoptokens)
t1nostopfreq_graph = freq_graph(t1nostoptokens)

#####T2

In [None]:
# Token counts for T2 after stop-word removal, then the bar chart of the
# 60 most common remaining tokens.
t2nostopfreq = freq(t2nostoptokens)
t2nostopfreq_graph = freq_graph(t2nostoptokens)

####Wordcloud after removing stopwords

#####T1

In [None]:
# Word cloud built from the T1 counts with stop words removed.
t1nostopwordcloud = word_cloud(t1nostopfreq)
showCloud(t1nostopwordcloud)

#####T2

In [None]:
# Word cloud built from the T2 counts with stop words removed.
t2nostopwordcloud = word_cloud(t2nostopfreq)
showCloud(t2nostopwordcloud)

#Frequency relation with length of words

####T1

In [None]:
# Set the default figure size before plotting: rcParams only applies to
# figures created afterwards, so setting it after plt.plot leaves this
# figure at the previous size.
plt.rcParams["figure.figsize"] = (15, 10)

# word_counter fills the module-level `words` dict with
# {word length: number of words} for T1.
word_counter(t1)

# Sort by word length so the line runs left to right along the x axis.
list_count_t1 = sorted(words.items())
x1, y1 = zip(*list_count_t1)
plt.plot(x1, y1)
plt.xticks(range(0, 30))
plt.xlabel("Wordlength")
plt.ylabel("Frequency")
plt.show()

####T2

In [None]:
# Set the default figure size before plotting: rcParams only applies to
# figures created afterwards.
plt.rcParams["figure.figsize"] = (15, 10)

# Recount for T2. Without this call `words` still holds the counts of the
# last text passed to word_counter, and the plot would not describe T2.
word_counter(t2)

# Sort by word length so the line runs left to right along the x axis.
list_count_t2 = sorted(words.items())
x2, y2 = zip(*list_count_t2)
plt.plot(x2, y2)
plt.xticks(range(0, 30))
plt.xlabel("Wordlength")
plt.ylabel("Frequency")
plt.show()

#POS Tagging the contents of textbook

###Tagging words

####T1

In [None]:
# Part-of-speech tag every T1 token; the loop further down unpacks each
# entry as a (token, tag) pair.
tagged1 = nltk.pos_tag(t1tokens)
# Bare name as the last line: the notebook displays the tagged list.
tagged1

####T2

In [None]:
# Part-of-speech tag every T2 token; the loop further down unpacks each
# entry as a (token, tag) pair.
tagged2 = nltk.pos_tag(t2tokens)
# Bare name as the last line: the notebook displays the tagged list.
tagged2

###Plotting the frequency bar graph for POS tags

####T1

In [None]:
# Count how often each POS tag occurs in T1.
dict1 = {}
for a, b in tagged1:
    dict1[b] = dict1.get(b, 0) + 1

# Same counts, ordered from the most to the least frequent tag.
sorted_d1 = dict(
    sorted(dict1.items(), key=operator.itemgetter(1), reverse=True)
)

# Keep only the N most frequent tags for the chart.
N = 20
out1 = dict(list(sorted_d1.items())[:N])

plt.bar(out1.keys(), out1.values())
plt.xlabel('TAGS')
plt.ylabel('Count')
plt.show()

####T2

In [None]:
# Count how often each POS tag occurs in T2.
dict2 = {}
for a, b in tagged2:
    dict2[b] = dict2.get(b, 0) + 1

# Same counts, ordered from the most to the least frequent tag.
sorted_d2 = dict(sorted(dict2.items(), key=operator.itemgetter(1), reverse=True))

# Keep only the N most frequent tags for the chart. The result is stored
# under its own name so it does not overwrite the T1 result in `out1`.
N = 20
out2 = dict(list(sorted_d2.items())[:N])

plt.bar(out2.keys(), out2.values())
plt.xlabel('TAGS')
plt.ylabel('Count')
plt.show()