In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import syllables
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import chardet
import os

In [None]:
# Fetch NLTK's 'punkt' resource into the local nltk_data directory (no-op when
# it is already up to date).
# NOTE(review): presumably this is the model behind the word_tokenize /
# sent_tokenize functions imported above; newer NLTK releases may also expect
# the 'punkt_tab' resource — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def detect_encoding(file_path):
    """Guess the text encoding of a file.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
        guess = chardet.detect(raw_bytes)
    return guess['encoding']

In [None]:
# Directory that holds the lexicon files; every entry whose name contains
# "StopWords" is treated as a stop-word list.
path = "/content/"
dir_list = os.listdir(path)
newList = [entry for entry in dir_list if "StopWords" in entry]

# Accumulator for the raw stop-word text, filled by the next cell.
stops = ''

In [None]:
# Pool the upper-cased contents of every stop-word file into one string.
# Reset the accumulator first so that re-running this cell never appends the
# same files twice (or appends to the list a later cell turns `stops` into).
stops = ''
for file in newList:
  # os.listdir() yields bare names, so rebuild the full path; otherwise the
  # files are only found when the working directory happens to be `path`.
  stop_file_path = os.path.join(path, file)
  file_encoding = detect_encoding(stop_file_path)
  # Plain read mode: nothing is written, so 'r+' would only add a pointless
  # write-permission requirement.
  with open(stop_file_path, 'r', encoding=file_encoding) as content:
    data1 = content.read().upper().strip(' ')
  # The explicit newline keeps the last entry of this file from fusing with
  # the first entry of the next one when a file lacks a trailing newline.
  stops += data1 + '\n'

In [None]:
# Turn the pooled stop-word text into a list: newlines and '|' both act as
# separators, and surrounding spaces are trimmed from every entry.
stops = [entry.strip(' ') for entry in stops.replace('\n', '|').split('|')]

In [None]:
# Load the negative-word lexicon as a list of upper-cased entries.
file_encoding_n = detect_encoding("/content/negative-words.txt")
# Read-only mode is enough; 'r+' would fail on a read-only file.
with open("/content/negative-words.txt", 'r', encoding=file_encoding_n) as content_n:
    negatives = content_n.read().upper()
# Trim stray whitespace around each entry and drop blank lines (including the
# empty string produced by a trailing newline) so that only real words remain.
negative_tokens = [entry.strip() for entry in negatives.split('\n') if entry.strip()]

In [None]:
# Load the positive-word lexicon as a list of upper-cased entries.
file_encoding_p = detect_encoding("/content/positive-words.txt")
# Read-only mode is enough; 'r+' would fail on a read-only file.
with open("/content/positive-words.txt", 'r', encoding=file_encoding_p) as content_p:
    positives = content_p.read().upper()
# Trim stray whitespace around each entry and drop blank lines (including the
# empty string produced by a trailing newline) so that only real words remain.
positive_tokens = [entry.strip() for entry in positives.split('\n') if entry.strip()]

In [None]:
# Load the input workbook into the frame that later receives the metric
# columns and is exported at the end of the notebook.
df = pd.read_excel('/content/updated_input.xlsx')

In [None]:
# Tokens that count as punctuation and are excluded from word-level metrics.
# The apostrophe appears twice; the duplicate is kept so the list is unchanged.
punctuations = [
    "'", ".", ",", "?", "/",
    "(", ")", "#", "@", "!",
    "'", ";", "-", ":",
    "’", '”', "$", '“', "%",
]

In [None]:
# Upper-cased personal pronouns looked for when counting pronoun usage.
pronouns = 'I WE MY OURS US'.split()

In [None]:
class AnalyseText:
    """Per-document sentiment and readability metrics for a table of texts.

    Every row of the table is one document whose body lives in the ``text``
    column. Each public metric method returns a list with one entry per row,
    in row order. Rows whose text is missing are treated as empty documents.

    Tokenizing relies on the module-level ``word_tokenize`` / ``sent_tokenize``
    names and syllable counting on the module-level ``syllables`` package.
    """

    def __init__(self, file_path, stops, punctuations, positive_tokens, negative_tokens, pronouns, df=None):
        """Store the lexicons and load the documents.

        Args:
            file_path: Excel workbook to read when ``df`` is not supplied.
            stops: Iterable of stop words (matched case-insensitively).
            punctuations: Iterable of punctuation tokens (matched exactly).
            positive_tokens: Iterable of positive words (case-insensitive).
            negative_tokens: Iterable of negative words (case-insensitive).
            pronouns: Iterable of personal pronouns (case-insensitive).
            df: Optional already-loaded DataFrame with a ``text`` column; when
                given, ``file_path`` is not read.
        """
        self.file_path = file_path
        self.stops = stops
        self.punctuations = punctuations
        self.pronouns = pronouns
        self.df = pd.read_excel(file_path) if df is None else df
        self.positive_tokens = positive_tokens
        self.negative_tokens = negative_tokens

        # Normalized lookup sets, built once. Word lists are upper-cased so the
        # comparisons below are case-insensitive whatever case the caller used,
        # and set membership replaces a linear scan per token.
        self._stop_set = self._upper_set(stops)
        self._punctuation_set = frozenset(punctuations)
        self._positive_set = self._upper_set(positive_tokens)
        self._negative_set = self._upper_set(negative_tokens)
        self._pronoun_set = self._upper_set(pronouns)

    @staticmethod
    def _upper_set(values):
        """Return the non-blank entries of ``values``, stripped and upper-cased."""
        cleaned = (str(value).strip().upper() for value in values)
        return frozenset(entry for entry in cleaned if entry)

    @staticmethod
    def _clean_text(text):
        """Return ``text`` as a string, mapping missing cells (None/NaN) to ''."""
        if isinstance(text, str):
            return text
        if text is None or pd.isna(text):
            return ''
        return str(text)

    def _texts(self):
        """Return the ``text`` column as a plain list, one entry per row."""
        return self.df['text'].tolist()

    def _counted_words(self, text):
        """Return the tokens of ``text`` that count as words.

        Stop words and punctuation are removed first. Two-character tokens are
        then skipped, exactly as the word count has always done.
        NOTE(review): presumably this discards tokenizer artefacts such as
        quote-mark pairs, but it also drops genuine two-letter words — confirm.
        """
        return [word for word in self.return_wo_stopwords(text) if len(word) != 2]

    def _is_pronoun(self, token):
        """Tell whether ``token`` is one of the configured personal pronouns.

        The all-caps token "US" is rejected: upper-casing would otherwise make
        the country abbreviation indistinguishable from the pronoun "us".
        """
        return token != 'US' and token.upper() in self._pronoun_set

    def _count_lexicon_hits(self, lexicon):
        """Count, per row, the tokens whose upper-cased form is in ``lexicon``."""
        return [
            sum(1 for token in self.return_tokens(text) if token.upper() in lexicon)
            for text in self._texts()
        ]

    @staticmethod
    def _sentence_lengths(word_counts, sentence_counts):
        """Divide word counts by sentence counts row-wise (0 when no sentences)."""
        return [
            words / sentences if sentences > 0 else 0
            for words, sentences in zip(word_counts, sentence_counts)
        ]

    def num_pronouns(self):
        """Return the number of personal pronouns in each row.

        Counting runs on the raw tokens rather than the stop-word-filtered
        ones, because a stop-word list may itself contain the pronouns.
        """
        return [
            sum(1 for token in self.return_tokens(text) if self._is_pronoun(token))
            for text in self._texts()
        ]

    def return_wo_stopwords(self, text):
        """Tokenize ``text`` and drop stop words and punctuation tokens.

        Stop words are matched case-insensitively; punctuation is matched
        exactly.
        """
        return [
            token for token in self.return_tokens(text)
            if token.upper() not in self._stop_set and token not in self._punctuation_set
        ]

    def return_tokens(self, text):
        """Return the word tokens of ``text`` ([] for missing or empty text)."""
        text = self._clean_text(text)
        return word_tokenize(text) if text else []

    def return_sent_tokens(self, text):
        """Return the sentences of ``text`` ([] for missing or empty text)."""
        text = self._clean_text(text)
        return sent_tokenize(text) if text else []

    def num_words(self):
        """Return the cleaned word count of each row."""
        return [len(self._counted_words(text)) for text in self._texts()]

    def num_sentences(self):
        """Return the sentence count of each row."""
        return [len(self.return_sent_tokens(text)) for text in self._texts()]

    # complex words
    def analyse_readability(self):
        """Return ``(fog, percent_complex, num_complex)`` lists, one entry per row.

        ``num_complex`` is the number of tokens with more than two estimated
        syllables, ``percent_complex`` is that count divided by the cleaned
        word count (a fraction), and ``fog`` is
        ``0.4 * (average sentence length + 100 * percent_complex)``.
        Rows without any counted word get ``''`` for ``fog`` and
        ``percent_complex``.
        """
        fog = []
        percent_complex = []
        num_complex = []
        word_counts = self.num_words()
        sentence_lengths = self._sentence_lengths(word_counts, self.num_sentences())
        for text, word_count, sentence_length in zip(self._texts(), word_counts, sentence_lengths):
            # Estimate syllables once per row instead of once per use.
            complex_count = self.count_complex_words(self.return_tokens(text))
            num_complex.append(complex_count)
            if word_count > 0:
                fog.append(0.4 * (sentence_length + (complex_count / word_count) * 100))
                percent_complex.append(complex_count / word_count)
            else:
                fog.append('')
                percent_complex.append('')
        return fog, percent_complex, num_complex

    def avg_sent_len(self):
        """Return the cleaned words per sentence of each row (0 if no sentences)."""
        return self._sentence_lengths(self.num_words(), self.num_sentences())

    def avg_word_length(self):
        """Return the mean character length of the counted words of each row.

        Rows without any counted word get ``''``.
        """
        result = []
        for text in self._texts():
            words = self._counted_words(text)
            if words:
                result.append(sum(len(word) for word in words) / len(words))
            else:
                result.append('')
        return result

    def count_complex_words(self, words):
        """Return how many of ``words`` have more than two estimated syllables."""
        return sum(1 for word in words if syllables.estimate(word) > 2)

    def positive_score(self):
        """Return the number of positive-lexicon tokens in each row."""
        return self._count_lexicon_hits(self._positive_set)

    def negative_score(self):
        """Return the number of negative-lexicon tokens in each row."""
        return self._count_lexicon_hits(self._negative_set)

    def num_syllables(self):
        """Return the mean estimated syllables per counted word of each row.

        Syllables are summed over the same words that form the divisor, so the
        ratio is a true per-word average. Rows without counted words get 0.
        """
        result = []
        for text in self._texts():
            words = self._counted_words(text)
            if words:
                result.append(sum(syllables.estimate(word) for word in words) / len(words))
            else:
                result.append(0)
        return result

    def polarity_and_subjectivity(self):
        """Return ``(polarity, subjectivity)`` lists, one entry per row.

        ``polarity = (pos - neg) / (pos + neg + 1e-6)`` and
        ``subjectivity = (pos + neg) / (words + 1e-6)``; the small constant
        keeps empty rows from dividing by zero.
        """
        pos = self.positive_score()
        neg = self.negative_score()
        words = self.num_words()
        polarity = []
        subjectivity = []
        for positive, negative, word_count in zip(pos, neg, words):
            polarity.append((positive - negative) / (positive + negative + 0.000001))
            subjectivity.append((positive + negative) / (word_count + 0.000001))

        return polarity, subjectivity

In [None]:
# Build the analyser from the input workbook and the lexicons prepared above,
# then count the personal pronouns of every row.
analyser = AnalyseText(
    file_path='/content/updated_input.xlsx',
    stops=stops,
    punctuations=punctuations,
    positive_tokens=positive_tokens,
    negative_tokens=negative_tokens,
    pronouns=pronouns,
)
num_pronouns = analyser.num_pronouns()

In [None]:
# Compute every metric; each call returns a list with one entry per row of
# the analyser's table.

# Sentiment: lexicon hit counts and the scores derived from them.
pos_score = analyser.positive_score()
neg_score = analyser.negative_score()
polarity, subjectivity = analyser.polarity_and_subjectivity()
# Basic counts.
num_words = analyser.num_words()
num_sentences = analyser.num_sentences()
num_pronouns = analyser.num_pronouns()
# Readability: fog index, complex-word ratio and complex-word count.
fog, percent_complex, count_complex = analyser.analyse_readability()
# Per-word and per-sentence averages.
syllables_per_word = analyser.num_syllables()
avg_sent_len = analyser.avg_sent_len()
avg_word_length = analyser.avg_word_length()

In [None]:
# Output column name -> per-row metric values, listed in export order.
# 'AVG SENTENCE LENGTH' and 'AVG NUMBER OF WORDS PER SENTENCE' are both fed
# from the same list.
metric_columns = {
    'POSITIVE SCORE': pos_score,
    'NEGATIVE SCORE': neg_score,
    'POLARITY SCORE': polarity,
    'SUBJECTIVITY SCORE': subjectivity,
    'AVG SENTENCE LENGTH': avg_sent_len,
    'PERCENTAGE OF COMPLEX WORDS': percent_complex,
    'FOG INDEX': fog,
    'AVG NUMBER OF WORDS PER SENTENCE': avg_sent_len,
    'COMPLEX WORD COUNT': count_complex,
    'WORD COUNT': num_words,
    'SYLLABLE PER WORD': syllables_per_word,
    'PERSONAL PRONOUNS': num_pronouns,
    'AVERAGE WORD LENGTH': avg_word_length,
}
for column_name, column_values in metric_columns.items():
    df[column_name] = column_values

In [None]:
# Remove the raw document text from the export table. errors='ignore' keeps
# the cell re-runnable: once the column is gone a second run is a no-op
# instead of raising KeyError.
df = df.drop(columns='text', errors='ignore')

In [None]:
# Write the final table to 'output.xlsx' (relative to the working directory).
# NOTE(review): with pandas' defaults the DataFrame index is written as an
# extra leading column — confirm that is wanted in the deliverable.
df.to_excel('output.xlsx')