# Comments Preprocessing

## Import

In [1]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
carrefour_comments = pd.read_excel('./drive/MyDrive/Colab Notebooks/ESG/Comments_Carrefour_Combined.xlsx')
cocacola_comments = pd.read_excel('./drive/MyDrive/Colab Notebooks/ESG/Comments_Cocacola_Combined.xlsx')

## Preprocessing

### Translation FR to EN - DeepL 사용함
* 이 과정에서 행의 수가 달라짐

In [3]:
carrefour_comments_en_1 = pd.read_excel('./drive/MyDrive/Colab Notebooks/ESG/Comments_Carrefour_EN_1.xlsx')
carrefour_comments_en_2 = pd.read_excel('./drive/MyDrive/Colab Notebooks/ESG/Comments_Carrefour_EN_2.xlsx')
carrefour_comments_en_3 = pd.read_excel('./drive/MyDrive/Colab Notebooks/ESG/Comments_Carrefour_EN_3.xlsx')
carrefour_comments_en_4 = pd.read_excel('./drive/MyDrive/Colab Notebooks/ESG/Comments_Carrefour_EN_4.xlsx')
carrefour_comments_en_5 = pd.read_excel('./drive/MyDrive/Colab Notebooks/ESG/Comments_Carrefour_EN_5.xlsx')

In [4]:
carrefour_comments = pd.concat([carrefour_comments_en_1,carrefour_comments_en_2,carrefour_comments_en_3,carrefour_comments_en_4,carrefour_comments_en_5], join='outer', axis=0, ignore_index=True)

In [5]:
carrefour_comments.dropna(axis = 0, how = 'any', ignore_index = True, inplace = True)

In [6]:
carrefour_comments

Unnamed: 0,Comments
0,carrefour groceries delivered : half of the st...
1,this company supports state. therefore it sup...
2,is your plan to continue being complicit with ...
3,viva palestina
4,your forgot an important detail but i'm glad t...
...,...
15302,happy new year
15303,happy new year and long live carrefour my favo...
15304,happy new year and happy new year!
15305,Happy New Year to the whole team. beautiful vi...


### Preprocessing

In [7]:
# 불필요 열 제거
cocacola_comments.drop(columns = ["Unnamed: 0"], inplace = True)
cocacola_comments.rename(columns = {"Cocacola": "Comments"}, inplace = True)

#### Real Preproocessing

In [10]:
import re
from nltk.corpus import stopwords

# nltk의 불용어 리스트를 다운로드
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# 불용어 리스트 (영어로 설정)
stop_words = set(stopwords.words('english'))

def clean_comment(comment):
    # 입력이 문자열이 아닌 경우 빈 문자열 반환
    if not isinstance(comment, str):
        return ""

    # 0) 소문자화
    comment = comment.lower()

    # 7) 접속사, 대명사, 조사, 비동사 등 불용어 삭제
    tokens = comment.split()
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # 1) .다음 문자가 나오는 단어 삭제
    comment = re.sub(r'\.\S+', '', comment)

    # 2) # 다음 문자가 나오는 단어 삭제
    comment = re.sub(r'#\S+', '', comment)

    # 3) @ 다음 문자가 나오는 단어 삭제
    comment = re.sub(r'@\S+', '', comment)

    # 4) 숫자 삭제
    comment = re.sub(r'\d+', '', comment)

    # 5) 동일한 문자가 세번 연속 등장 시 두 개만 남기기
    comment = re.sub(r'(.)\1\1+', r'\1\1', comment)

    # 6) 이모지 및 키보드에 없는 특수 문자 삭제
    # 유니코드 범위 내에서 이모지를 제거하는 정규 표현식
    comment = re.sub(r'[^\w\s,]', '', comment)  # 기존 코드

    # 이모지를 포함한 다양한 유니코드 문자를 제거
    comment = re.sub(r'[\U00010000-\U0010FFFF]', '', comment)

    # 리스트를 공백으로 연결하여 하나의 문자열로 반환
    return ' '.join(filtered_tokens)

In [14]:
# 데이터프레임에 적용
carrefour_comments['Preprocessed'] = carrefour_comments['Comments'].apply(clean_comment)
cocacola_comments['Preprocessed'] = cocacola_comments['Comments'].apply(clean_comment)

In [15]:
import pandas as pd
import re

# 이모티콘을 제거하는 함수
def remove_emojis(text):
    if isinstance(text, str):  # 문자열인지 확인
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # 이모티콘 범위
            u"\U0001F300-\U0001F5FF"  # 기호 & 상징
            u"\U0001F680-\U0001F6FF"  # 교통 & 기계
            u"\U0001F1E0-\U0001F1FF"  # 국기
            u"\U00002700-\U000027BF"  # 기타 기호
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE
        )
        return emoji_pattern.sub(r'', text)
    return text  # 문자열이 아닐 경우 원본 값 반환

In [16]:
# Comments 열에 있는 모든 데이터에서 이모티콘 제거
cocacola_comments['Preprocessed'] = cocacola_comments['Comments'].apply(remove_emojis)
carrefour_comments['Preprocessed'] = carrefour_comments['Comments'].apply(remove_emojis)

In [17]:
carrefour_comments

Unnamed: 0,Comments,Preprocessed
0,carrefour groceries delivered : half of the st...,carrefour groceries delivered : half of the st...
1,this company supports state. therefore it sup...,this company supports state. therefore it sup...
2,is your plan to continue being complicit with ...,is your plan to continue being complicit with ...
3,viva palestina,viva palestina
4,your forgot an important detail but i'm glad t...,your forgot an important detail but i'm glad t...
...,...,...
15302,happy new year,happy new year
15303,happy new year and long live carrefour my favo...,happy new year and long live carrefour my favo...
15304,happy new year and happy new year!,happy new year and happy new year!
15305,Happy New Year to the whole team. beautiful vi...,Happy New Year to the whole team. beautiful vi...


#### Stemming

In [None]:
import pandas as pd
from nltk.stem import SnowballStemmer
import nltk

# nltk 다운로드 (필요시)
nltk.download('punkt')

# SnowballStemmer 초기화 (영어)
stemmer = SnowballStemmer('english')

def stem_comment(comment):
    # 입력이 문자열이 아닌 경우 빈 문자열 반환
    if not isinstance(comment, str):
        return ""

    # 문장을 단어로 분리
    words = nltk.word_tokenize(comment)

    # 각 단어를 스테밍
    stemmed_words = [stemmer.stem(word) for word in words]

    # 스테밍된 단어들을 공백으로 연결하여 다시 문장으로 반환
    return ' '.join(stemmed_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# 데이터프레임에 적용
carrefour_comments['Stemmed'] = carrefour_comments['Preprocessed'].apply(stem_comment)
cocacola_comments['Stemmed'] = cocacola_comments['Preprocessed'].apply(stem_comment)

In [None]:
carrefour_comments

Unnamed: 0,Comments,Preprocessed,Stemmed
0,carrefour groceries delivered : half of the st...,carrefour groceries delivered : half of the st...,carrefour groceri deliv : half of the stuff i ...
1,this company supports state. therefore it sup...,this company supports state. therefore it sup...,this compani support state . therefor it suppo...
2,is your plan to continue being complicit with ...,is your plan to continue being complicit with ...,is your plan to continu be complicit with huma...
3,viva palestina,viva palestina,viva palestina
4,your forgot an important detail but i'm glad t...,your forgot an important detail but i'm glad t...,your forgot an import detail but i 'm glad to ...
...,...,...,...
15302,happy new year,happy new year,happi new year
15303,happy new year and long live carrefour my favo...,happy new year and long live carrefour my favo...,happi new year and long live carrefour my favo...
15304,happy new year and happy new year!,happy new year and happy new year!,happi new year and happi new year !
15305,Happy New Year to the whole team. beautiful vi...,Happy New Year to the whole team. beautiful vi...,happi new year to the whole team . beauti vide...


In [None]:
cocacola_comments

Unnamed: 0,Comments,Preprocessed,Stemmed
0,You have reached us at The Coca-Cola Company w...,You have reached us at The Coca-Cola Company w...,you have reach us at the coca-cola compani wor...
1,"Hi, I recently bought one of your coke product...","Hi, I recently bought one of your coke product...","hi , i recent bought one of your coke product ..."
2,Hi. Thanks for reaching out. We'd like to lo...,Hi. Thanks for reaching out. We'd like to lo...,hi . thank for reach out . we 'd like to look ...
3,Can't wait to try it.,Can't wait to try it.,ca n't wait to tri it .
4,The flavor of the future,The flavor of the future,the flavor of the futur
...,...,...,...
13985,😍🔥🔥,,
13986,🔥❤️,,
13987,😍🔥🔥,,
13988,😍🔥🔥,,


## Export

In [18]:
carrefour_comments['Preprocessed'].to_excel('./drive/MyDrive/Colab Notebooks/ESG/carrefour_comments_final_test.xlsx')
cocacola_comments['Preprocessed'].to_excel('./drive/MyDrive/Colab Notebooks/ESG/cocacola_comments_final_test.xlsx')