TEMP

In [1]:
import pandas as pd
import numpy as np
import json

import sys
from pathlib import Path

# Project root is set manually; adjust this path when running on another machine.
project_root = Path("C:/Users/User/Documents/Ali/Projects/public-twitterSentiment-pak")

# Absolute path to the directory expected to contain the config_loader module
config_directory = project_root / "config"

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if str(config_directory) not in sys.path:
    sys.path.append(str(config_directory))

# config_loader is only importable once the config directory is on sys.path
import config_loader as cl

config = cl.load_config()

In [2]:
# Collect every .json file in the raw-data directory. The result is sorted so
# that the "first" file is the same on every run (glob order is not guaranteed).
json_files = sorted(config["RAW_DATA_DIR"].glob("*.json"))

# Fail with a descriptive message instead of a bare IndexError below.
if not json_files:
    raise FileNotFoundError(
        f"No .json files found in {config['RAW_DATA_DIR']}"
    )

# Only the first file is loaded for now.
# Decode explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
# Windows) can fail on, or garble, non-ASCII tweet text.
json_string = json_files[0].read_text(encoding="utf-8")

# Parse the JSON content into a Python object (dictionary or list)
json_data = json.loads(json_string)

tweets = pd.DataFrame.from_dict(json_data)

In [3]:
# Keep only the tweet text; selecting with a list keeps `tweets` a one-column DataFrame.
tweets = tweets.loc[:, ['rawContent']]

In [11]:
import re
import string
from unidecode import unidecode
import contractions
from textblob import TextBlob

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [14]:
from sklearn.base import TransformerMixin, BaseEstimator

In [None]:
# Maps the first letter of an NLTK POS tag to the matching WordNet POS constant.
pos_tag_dict = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}

def lemmatize_pos_tagged_text(text, lemmatizer, post_tag_dict):
    """Lemmatize ``text`` sentence by sentence using POS-aware lookups.

    The text is split into sentences; each sentence is lower-cased, tokenized
    and POS-tagged with NLTK, and every token is lemmatized with the WordNet
    POS that corresponds to the first letter of its tag. Tokens whose tag has
    no entry in the mapping are lemmatized with the lemmatizer's default POS.

    Args:
        text: Raw text to lemmatize.
        lemmatizer: Object exposing ``lemmatize(word)`` and
            ``lemmatize(word, pos)`` (e.g. ``WordNetLemmatizer``).
        post_tag_dict: Mapping from the first letter of an NLTK POS tag
            (e.g. "J", "N", "V", "R") to a WordNet POS constant. If ``None``,
            the module-level ``pos_tag_dict`` is used instead.

    Returns:
        The lemmatized tokens of all sentences joined by single spaces.
    """
    # Use the mapping supplied by the caller; fall back to the module-level
    # dict only when none is given.
    tag_map = pos_tag_dict if post_tag_dict is None else post_tag_dict

    lemmatized_sentences = []
    for sentence in nltk.sent_tokenize(text):
        # Tokenize once and reuse the tokens for both tagging and lemmatizing.
        tokens = nltk.word_tokenize(sentence.lower())

        lemmas = []
        for word, tag in nltk.pos_tag(tokens):
            # tag[:1] (rather than tag[0]) tolerates an empty tag string.
            wordnet_pos = tag_map.get(tag[:1].upper())
            if wordnet_pos is not None:
                lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
            else:
                lemmas.append(lemmatizer.lemmatize(word))

        lemmatized_sentences.append(" ".join(lemmas))

    return " ".join(lemmatized_sentences)

In [12]:
class TextProcessor:
    """Chainable text-cleaning steps over a collection of strings.

    ``X`` is expected to be a pandas Series of strings. Each step replaces
    ``self.X`` with a transformed Series and returns ``self`` so calls can be
    chained; ``get_processed_text`` returns the current result. Missing values
    (``None``/``NaN``) are passed through untouched by every step.
    """

    # Compiled once; same patterns as used for digit and punctuation removal.
    _DIGITS_RE = re.compile(r'\d+')
    _PUNCT_RE = re.compile('[{}]'.format(re.escape(string.punctuation)))

    def __init__(self, X) -> None:
        """Store the text to process and set up the NLTK resources.

        Args:
            X: The texts to clean (expected: pandas Series of ``str``).
        """
        self.X = X
        # English stop words without 'not', so negations survive stop-word removal.
        self.sw_nltk = [word for word in stopwords.words('english') if word != 'not']
        self.lemmatizer = WordNetLemmatizer()

        # First letter of an NLTK POS tag -> WordNet POS constant.
        self.pos_tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }

    def _map_text(self, func):
        """Apply ``func`` to every non-missing element of ``self.X``; return ``self``."""
        # na_action='ignore' propagates None/NaN instead of calling func on them.
        self.X = self.X.map(func, na_action='ignore')
        return self

    def to_lower(self):
        """Lower-case every text."""
        return self._map_text(lambda text: text.lower())

    def expand(self):
        """Expand contractions word by word using ``contractions.fix``."""
        return self._map_text(
            lambda text: " ".join([contractions.fix(word) for word in text.split()])
        )

    def remove_num(self):
        """Remove every run of digits."""
        return self._map_text(lambda text: self._DIGITS_RE.sub('', text))

    def remove_punc(self):
        """Remove every character listed in ``string.punctuation``."""
        return self._map_text(lambda text: self._PUNCT_RE.sub('', text))

    def remove_diacritics(self):
        """Transliterate with ``unidecode``, using ``errors="preserve"``."""
        return self._map_text(lambda text: unidecode(text, errors="preserve"))

    def spellcheck(self):
        """Replace each text with TextBlob's spelling correction."""
        return self._map_text(lambda text: str(TextBlob(text).correct()))

    def remove_stopwords(self):
        """Drop whitespace-separated words that appear in ``self.sw_nltk``."""
        # Built per call so later edits to self.sw_nltk are respected, while
        # each membership test is O(1) instead of a list scan.
        stop_words = set(self.sw_nltk)
        return self._map_text(
            lambda text: " ".join([word for word in text.split() if word not in stop_words])
        )

    def lemmatize(self):
        """Lemmatize each text with ``lemmatize_pos_tagged_text``."""
        return self._map_text(
            lambda text: lemmatize_pos_tagged_text(text, self.lemmatizer, self.pos_tag_dict)
        )

    def get_processed_text(self):
        """Return the texts in their current state of processing."""
        return self.X


In [None]:
# TODO: class to translate text from Urdu to English
