TEMP

In [36]:
import pandas as pd
import numpy as np
import json

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import sys
from pathlib import Path

# Set the project root directory manually
project_root = Path("C:/Users/User/Documents/Ali/Projects/public-twitterSentiment-pak")

# Construct the absolute path to the config directory
config_directory = project_root / "config"

sys.path.append(str(config_directory))

# Now you can import modules from the config package
import config_loader as cl

config = cl.load_config()

In [5]:
# Use the .glob() method to find all files with a .json extension
json_files = list(config["RAW_DATA_DIR"].glob("*.json"))

# # Now you can work with the list of JSON files
# for json_file in json_files:
#     # Load or process each JSON file

# tweets = pd.DataFrame()

# Read the contents of the JSON file as a string
json_string = json_files[0].read_text()

# Parse the JSON content into a Python object (dictionary or list)
json_data = json.loads(json_string)

tweets = pd.DataFrame.from_dict(json_data)

In [6]:
tweets = tweets[['rawContent']]

In [9]:
import re
import string
from unidecode import unidecode
import contractions
from textblob import TextBlob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [8]:
from sklearn.base import TransformerMixin, BaseEstimator

In [29]:
pos_tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }

def lemmatize_pos_tagged_text(text, lemmatizer, post_tag_dict):
  sentences = nltk.sent_tokenize(text)
  new_sentences = []

  for sentence in sentences:
    sentence = sentence.lower()
    new_sentence_words = []

    pos_tuples = nltk.pos_tag(nltk.word_tokenize(sentence)) 

    for word_idx, word in enumerate(nltk.word_tokenize(sentence)):
      nltk_word_pos = pos_tuples[word_idx][1]
      wordnet_word_pos = pos_tag_dict.get(
                          nltk_word_pos[0].upper(), None)
      if wordnet_word_pos is not None:
        new_word = lemmatizer.lemmatize(word, wordnet_word_pos)
      else:
        new_word = lemmatizer.lemmatize(word)

      new_sentence_words.append(new_word)

    new_sentence = " ".join(new_sentence_words)
    new_sentences.append(new_sentence)

  return " ".join(new_sentences)

In [25]:
def download(path, name):
    try:
        nltk.data.find(path)
    except LookupError:
        print(f'resource {path} not found. Downloading now...')
        nltk.download(name)

class TextProcessor:
    def __init__(self, X) -> None:
        self.X = X

        download('corpora/stopwords', 'stopwords')
        download('tokenizers/punkt', 'punkt')
        download('taggers/averaged_perceptron_tagger',
                                'averaged_perceptron_tagger')
        download('corpora/wordnet', 'wordnet')
        # download('corpora/omw-1.4', 'omw-1.4')

        self.sw_nltk = stopwords.words('english')
        self.sw_nltk.remove('not')
        self.lemmatizer = WordNetLemmatizer()

        self.pos_tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }


    def to_lower(self):
        self.X = self.X.apply(lambda x: x.lower())
        return self

    def expand(self):
        self.X = self.X.apply(lambda x: " ".join([contractions.fix(word) for word in x.split()]))
        return self

    def remove_num(self):
        self.X = self.X.apply(lambda x: re.sub(r'\d+', '', x))
        return self

    def remove_punc(self):
        self.X = self.X.apply(lambda x: re.sub('[{}]'.format(re.escape(string.punctuation)), '', x))
        return self
    
    def replace_diacritics(self):
        self.X = self.X.apply(lambda x: unidecode(str(x), errors="preserve"))
        return self
    
    def spellcheck(self):
        self.X = self.X.apply(lambda x: str(TextBlob(x).correct()))
        return self
    
    def remove_stopwords(self):
        self.X = self.X.apply(lambda x: " ".join([ word for word in x.split() if word not in self.sw_nltk]))
        return self
    
    def lemmatize(self):
        self.X = self.X.apply(lambda x: lemmatize_pos_tagged_text(x, self.lemmatizer, self.pos_tag_dict))
        return self
    
    def get_processed_text(self):
        return self.X


In [32]:
class SentimentTextProcessor(TransformerMixin, BaseEstimator):
    def __init__(self) -> None:
        pass

    def fit(self, X):
        return self
    
    def transform(self, X):
        txt_preproc = TextProcessor(X.copy())

        processed_text = txt_preproc.replace_diacritics()\
        .expand().remove_num().spellcheck().remove_punc()\
        .lemmatize().remove_stopwords().get_processed_text()

        return processed_text

class TopicModellingTextProcessor(TransformerMixin, BaseEstimator):
    def __init__(self) -> None:
        pass

    def fit(self, X):
        return self
    
    def transform(self, X):
        txt_preproc = TextProcessor(X.copy())

        processed_text = txt_preproc.replace_diacritics()\
        .expand().spellcheck().get_processed_text()

        return processed_text
    

In [13]:
# class to translate from urdu to english
from googletrans import Translator

class TextTranslator(TransformerMixin, BaseEstimator):
    def __init__(self, dest_lang, source_lang = None) -> None:
        self.translator = Translator()
        self.dest_lang = dest_lang
        self.source_lang = source_lang


    def fit(self, X):
        return self
    
    def transform(self, X):
        return self._batch_translate_texts(X)
    
    def _batch_translate_texts(self, texts):
        if self.source_lang:
            return self._translate_batch_with_source(texts)
        else:
            return self._translate_batch_without_source(texts)
    

    def _translate_batch_without_source(self, texts):
        non_english_texts = [text for text in texts if not self._is_english(text)]
        
        # Perform batch translation only for non-English texts
        if non_english_texts:
            translated_texts = self._translate_batch(non_english_texts)
            translated_idx = 0
            final_translated_texts = []
            
            for text in texts:
                if self._is_english(text):
                    final_translated_texts.append(text)
                else:
                    final_translated_texts.append(translated_texts[translated_idx])
                    translated_idx += 1
        else:
            final_translated_texts = texts
        
        return final_translated_texts
    
    def _translate_batch_with_source(self, texts):
        # to be implemented later
        pass
    

    def _is_english(self, text):
        return self.translator.detect(text).lang == 'en'
    
    def _batch_translate(self, texts):
        translations = self.translator.translate(texts, dest = self.dest_lang)
        
        translated_texts = []
        for translation in translations:
            translated_texts.append(translation.text) 

        return translated_texts

# translator = Translator()
# print(translator.translate('你好'))

In [26]:
type(tweets)

pandas.core.frame.DataFrame

In [27]:
type(tweets.loc[0])

pandas.core.series.Series

In [33]:
test_processor = SentimentTextProcessor()
processed_tweets = test_processor.fit_transform(tweets)


resource corpora/wordnet not found. Downloading now...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
print(processed_tweets)

rawContent    adeelasifpk shkhlyn dykhhwn news part oh hot f...
dtype: object
