# DATA PROCESSING

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import spacy
import emoji
import os
import json
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns

## Load NLTK stopwords and lemmatizer

In [None]:
# Fetch the NLTK resources used by the tokenising, stop-word and lemmatising helpers.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for _resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_resource)

# Shared by lemmatize_text and remove_stopwords respectively.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

## Load spaCy model for lemmatization

In [None]:
# Load the spaCy model named "en_core_web_sm" into the module-level name `nlp`.
# NOTE(review): `nlp` is not referenced by the preprocessing functions visible in
# this file (lemmatization uses WordNetLemmatizer) — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

## Set the option to display full column width and all rows

In [None]:
# Passing None removes the truncation limit, so cell text and row listings are shown in full.
for _option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(_option, None)

# Load your ABSA dataset

In [None]:
# Read the restaurant dataset from a path relative to the working directory,
# then preview its first rows.
df = pd.read_csv("training_dataset/Restaurant_Dataset.csv")
df.head()

### Show dataset information

In [None]:
# Print the summary pandas produces for df (columns, non-null counts, dtypes).
df.info()

### Finding the number of missing values

In [None]:
# Count the null entries in each column and show the totals.
missing_values = df.isna().sum()
print(missing_values)

### Handling missing values of Opinion_Target Column

In [None]:
def missing_value_handler(df):
    """Return the opinion target for one row, falling back to its category.

    Written for ``DataFrame.apply(..., axis=1)``: despite its name, ``df`` is a
    single row (anything indexable by 'Opinion_Target' and 'Opinion_Category').

    If the target is missing according to ``pd.isna``, the part of
    'Opinion_Category' before the first '#' is returned in lower case;
    otherwise the existing target is returned unchanged.
    """
    target = df['Opinion_Target']
    if not pd.isna(target):
        return target
    entity = df['Opinion_Category'].split('#')[0]
    return entity.lower()

# Run the handler once per row (axis = 1) and overwrite Opinion_Target with the result.
df['Opinion_Target'] = df.apply(missing_value_handler, axis = 1)

### Checking the number of missing values again

In [None]:
# Recount the null entries per column after filling Opinion_Target.
missing_values = df.isna().sum()
print(missing_values)

### Data Cleaning

In [None]:
def clean_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    # Character class matching any single ASCII punctuation mark.
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    return re.sub(punctuation_class, "", text.lower())

### Tokenization

In [None]:
def tokenize_text(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    return word_tokenize(text)

### Rejoin the tokens to form the sentence again

In [None]:
def join_tokens(tokens):
    """Concatenate ``tokens`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

### Remove stopwords

In [None]:
def remove_stopwords(tokens):
    """Return, in order, the tokens that are not in the module-level ``stop_words`` set."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

### Lemmatization

In [None]:
def lemmatize_text(tokens):
    """Return a list with each token passed through the module-level ``lemmatizer``."""
    return list(map(lemmatizer.lemmatize, tokens))

### Remove words with a length less than or equal to 2

In [None]:
def short_remover(tokens):
    """Return the tokens longer than two characters, preserving their order."""
    return list(filter(lambda token: len(token) > 2, tokens))

## Translate text into English

In [None]:
def translate(text):
    """Translate ``text`` into English using ``GoogleTranslator``.

    The source language is left to the translator ('auto'); a new translator
    object is created on every call.
    """
    return GoogleTranslator(source='auto', target='en').translate(text)

# text1 = "बढ़िया खाना था"
# translate(text1)

### Converting emojis into words

In [None]:
# Emoji treated as positive sentiment. The line breaks and indentation inside
# the literal are layout only; lookups use the character sets built below.
# (A stray backslash that used to sit before one emoji was removed: it formed an
# invalid escape sequence and made "\" count as a positive emoji.)
positive_emojis = """😄😀😁😆😂🤣😊🙂😎😉😍🥰😘😋😛😝😜🤪🤩🥳🥰❤️👍👌🤟🍔🍕🍣🍰🍹🍷🍺
                    🍦🍯🥞🍟🍩🥼🎉🎊🥳🍚🍘🍥🥠🥮🍢🍡🍧🍨🍦🥧🧁🍰🎂🍮🍭🍬🍫🍿🍩🍪🌰🥜
                    🍯🍻🥂🍷🍾😀😃😄😆😅🥲😊😇🙂😀😃😄😁😅😆😂🙂😊🤒👏"""


# Emoji treated as negative sentiment.
# NOTE(review): 🤒 appears in both lists; the positive list is checked first,
# so it is currently converted as positive — confirm that is intended.
negative_emojis = """😔😞😢😭😤😠😡🤬😩😫🥺😖😣😠😤😷🤒🤕😐😶😒😏🙁🥶😨😱😰😳🥵😳😵
                     🤯🤐🤮🥴🤢👎😈👿💔😩😔😞😢😭"""


# Per-character lookup sets with the layout whitespace stripped out, so that
# membership is an exact character match rather than a substring search.
_POSITIVE_EMOJI_CHARS = frozenset("".join(positive_emojis.split()))
_NEGATIVE_EMOJI_CHARS = frozenset("".join(negative_emojis.split()))


# add space between word and emoji
def handle_emoji_helper(text):
    """Return ``text`` with a space on each side of every known emoji character.

    Padding both sides guarantees that a whitespace split yields each emoji as
    its own token, whether it follows a word, precedes one, or sits next to
    another emoji. The result is built in a single pass, so no insertion
    position can go stale as the string grows.
    """
    return "".join(
        f" {char} " if char in _POSITIVE_EMOJI_CHARS or char in _NEGATIVE_EMOJI_CHARS else char
        for char in text
    )


def handle_emoji(sentence):
    """Replace each known emoji in ``sentence`` with sentiment words.

    The sentence is split on whitespace after the emoji have been isolated;
    positive emoji become " good good" and negative emoji become "bad bad".
    All other tokens are kept, and the tokens are re-joined with single spaces.
    """
    converted_sentence = []
    for word in handle_emoji_helper(sentence).split():
        if word in _POSITIVE_EMOJI_CHARS:
            # Don't change: the word is doubled on purpose, because one-word
            # sentences are removed later in preprocessing. The leading space
            # is kept exactly as in the original replacement text.
            converted_sentence.append(" good good")
        elif word in _NEGATIVE_EMOJI_CHARS:
            converted_sentence.append("bad bad")
        else:
            converted_sentence.append(word)
    return " ".join(converted_sentence)

# # Example usage:
# input_sentence = "😀"
# input_sentence = "बढ़िया खाना था😀"
# converted = handle_emoji(input_sentence)
# print(converted)

## Apply data cleaning and preprocessing

In [None]:
# Normalise the raw text: lower-case and strip punctuation, then turn emoji into words.
# df['Text'] = df['Text'].apply(translate)  (optional translation step, left disabled)
df['Text'] = df['Text'].apply(clean_text).apply(handle_emoji)

# Build Token_Text from the normalised text: tokenise, drop stop words, lemmatise,
# drop tokens of two characters or fewer, and join the rest back into a string.
df['Token_Text'] = (
    df['Text']
    .apply(tokenize_text)
    .apply(remove_stopwords)
    .apply(lemmatize_text)
    .apply(short_remover)
    .apply(join_tokens)
)

### Rearrange the column order

In [None]:
# Column layout for the output: identifiers, raw and tokenised text, then the opinion labels.
desired_order = [
    'Review_ID',
    'Sentence_ID',
    'Text',
    'Token_Text',
    'Opinion_Target',
    'Opinion_Category',
    'Opinion_Polarity',
]

# Selecting with the list keeps exactly these columns, in this order.
df = df[desired_order]

In [None]:
# Preview the first rows after the column reorder.
df.head()

### Count the number of occurrences of each unique value (needs correction)

In [None]:
# Frequency of each distinct Opinion_Polarity value before the corrections.
df['Opinion_Polarity'].value_counts()

### Correction (selecting the rows whose Opinion_Polarity is not one of ['negative', 'positive', 'neutral'])

In [None]:
# Rows whose Opinion_Polarity is not one of the three valid labels.
# An explicit copy is taken because these rows are edited through .loc in the
# following cell; mutating a plain slice of df triggers SettingWithCopyWarning.
filtered_df = df[~df['Opinion_Polarity'].isin(['negative', 'positive', 'neutral'])].copy()
filtered_df

### Correcting Wrong Data in Dataset

In [None]:
# Move each label one column to the left. The order matters: the target must be
# filled from the category before the category is overwritten by the polarity.
filtered_df.loc[:, 'Opinion_Target'] = filtered_df['Opinion_Category']
filtered_df.loc[:, 'Opinion_Category'] = filtered_df['Opinion_Polarity']

# Hand-assigned polarity for each affected sentence, keyed by Sentence_ID.
_polarity_by_sentence = {
    "1300636:2": "positive",
    "1352948:1": "neutral",
    "1410878:0": "positive",
    "1615322:3": "neutral",
    "737999:2": "positive",
}
for _sentence_id, _polarity in _polarity_by_sentence.items():
    filtered_df.loc[filtered_df['Sentence_ID'] == _sentence_id, 'Opinion_Polarity'] = _polarity

In [None]:
# Display the corrected rows for a visual check.
filtered_df

### Updating the main dataset df with the corrected filtered_df rows

In [None]:
# Write the corrected rows back into df in place.
# NOTE(review): per the pandas docs, DataFrame.update aligns on index labels and
# skips NaN values in the other frame — confirm both are acceptable here.
df.update(filtered_df)

### Removing Rows where Opinion_Polarity is neutral and where Token_Text is empty

In [None]:
# Keep only rows that are not labelled neutral and still have text after tokenisation.
_is_not_neutral = df['Opinion_Polarity'] != 'neutral'
_has_tokens = df['Token_Text'] != ""
df = df[_is_not_neutral & _has_tokens]

### count the number of occurrences of each unique value

In [None]:
# Frequency of each remaining Opinion_Polarity value after the corrections and filtering.
df['Opinion_Polarity'].value_counts()

### Opinion_Polarity distribution in our dataset

In [None]:
# Bar chart of the number of rows per Opinion_Polarity value in df.
sns.countplot(x = "Opinion_Polarity", data = df)
plt.show()

### Duplicating the negative Opinion reviews in the dataset (oversampling)

In [None]:
# Append a second copy of every negative row to the full dataset and renumber the index.
df_negative = df.loc[df['Opinion_Polarity'] == 'negative']
temp_df = df.copy()
df = pd.concat([temp_df, df_negative], ignore_index=True)

### Opinion_Polarity distribution in our dataset

In [None]:
# Same count plot again, now including the duplicated negative rows.
sns.countplot(x = "Opinion_Polarity", data = df)
plt.show()

## Save the preprocessed data to a new CSV file

In [None]:
# Write the preprocessed frame to CSV without the index column.
# NOTE(review): the file name spells "Restuarant"; it is left unchanged because
# other code may read this exact path — confirm before renaming.
df.to_csv("training_dataset/Preprocessed_Restuarant_Dataset.csv", index=False)