## 1. Import Libraries

In [100]:
import pandas as pd
import string
import re
from pyvi import ViTokenizer

## 2. Exploratory Data Analysis

In [101]:
# Load the raw train and test splits; every later cell works on these two frames.
df_train = pd.read_csv('../data/data_origin/Trainfull.csv')
df_test = pd.read_csv('../data/data_origin/Test.csv')

In [102]:
df_train.columns

Index(['index', 'comment', 'n_star', 'date_time', 'label'], dtype='object')

In [103]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
0,0,M·ªõi mua m√°y n√†y T·∫°i thegioididong th·ªët n·ªët c·∫£m...,5,2 tu·∫ßn tr∆∞·ªõc,{CAMERA#Positive};{FEATURES#Positive};{BATTERY...
1,1,Pin k√©m c√≤n l·∫°i mi·ªÖn ch√™ mua 8/3/2019 t√¨nh tr·∫°...,5,14/09/2019,{BATTERY#Negative};{GENERAL#Positive};{OTHERS};
2,2,Sao l√∫c g·ªçi ƒëi·ªán tho·∫°i m√†n h√¨nh b·ªã ch·∫•m nh·ªè nh...,3,17/08/2020,{FEATURES#Negative};
3,3,"M·ªçi ng∆∞·ªùi c·∫≠p nh·∫≠t ph·∫ßn m·ªÅm l·∫°i , n√≥ s·∫Ω b·ªõt t·ªë...",3,29/02/2020,{FEATURES#Negative};{BATTERY#Neutral};{GENERAL...
4,4,"M·ªõi mua S√†i ƒë∆∞·ª£c 1 th√°ng th·∫•y pin r·∫•t tr√¢u, S√†...",5,4/6/2020,{BATTERY#Positive};{PERFORMANCE#Positive};{SER...


In [104]:
df_train.shape

(8898, 5)

### 2.1.Overview of phone dataset

> The dataset consists of 11,122 comments, each described by four features:

>comment: The text of the comment.

>n_star: The star rating the user gave the smartphone.

>date_time: The date and time the comment was posted.

>label: Label of comment.

>All samples are in text format. No tokenization has been applied. Users of this dataset are free to use whatever sentence representation they choose.

### 2.2 Structure of dataset

In [105]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8898 entries, 0 to 8897
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      8898 non-null   int64 
 1   comment    8898 non-null   object
 2   n_star     8898 non-null   int64 
 3   date_time  8898 non-null   object
 4   label      8898 non-null   object
dtypes: int64(2), object(3)
memory usage: 347.7+ KB


### 2.3 Summary of statistic

In [106]:
df_train.describe()

Unnamed: 0,index,n_star
count,8898.0,8898.0
mean,4448.5,3.7069
std,2568.775681,1.505585
min,0.0,1.0
25%,2224.25,3.0
50%,4448.5,4.0
75%,6672.75,5.0
max,8897.0,5.0


### 2.4. Preprocessing

In [107]:
print(df_train['comment'].values[95:98])

['ƒêi·ªán tho·∫°i kh√° t·ªët pin tr√¢u kh√° m∆∞·ª£t b·∫Øt wifi c·ª±c t·ªët ch·ªâ l√† th·ªânh tho·∫£ng m√°y b·ªã ƒë∆° ph·∫£i tho√°t ra v√†o l·∫°i v√† m√°y ko c·∫≠p nh·∫≠t l√™n miul 12'
 'L√∫c tr∆∞·ªõc nghe b·∫£o ƒëi·ªán tho·∫°i ch∆°i game n√≥ng, nh∆∞ng mua v·ªÅ chi·∫øn li√™n qu√¢n 3 4h li√™n t·ª•c th√¨ ch·ªâ th·∫•y ·∫•m kh√¥ng n√≥ng nh∆∞ l·ªùi ƒë·ªìn, nh·ªØng th·ª© kh√°c th√¨ v≈©ng ch·∫≥ng c√≥ g√¨ ƒë·ªÉ n√≥i v√¨ qu√° ngon r·ªìi'
 '1. M√°y n·∫øu kh√¥ng ch∆°i game, l∆∞·ªõt web..., √≠t s·ª≠ d·ª•ng th√¨ gi·ªØ Pin cao nh·∫•t ƒë∆∞·ª£c kho·∫£ng 1 ng√†y 16 ti·∫øng ( T√≠nh lu√¥n c·∫£ l√∫c ng·ªß nh√© v√¨ khi ng·ªß th√¨ ch·∫≥ng ƒë·ªông g√¨ t·ªõi ) . Kh√¥ng ƒë∆∞·ª£c 2 ng√†y nh∆∞ mn n√≥i ƒë√¢u \n2. Nh·∫°c nghe c·ª© c√† gi·∫≠t c√† gi·∫≠t r·∫•t kh√≥ ch·ªãu lu√¥n\n3. Quay video b·ªã m·ªù ch√°n \nC√≤n l·∫°i okie üôÜ']


> Handle punctuation, handle whitespace, handle icons in strings

In [108]:
# Both helpers are built once here instead of on every call: the function is
# applied to each row of the train and test frames.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# NOTE(review): several ranges are much wider than "emoji". In particular
# U+24C2-U+1F251 also covers CJK and most other code points above U+24C2, and
# U+10000-U+10FFFF covers every supplementary-plane character. Everything in
# the class is >= U+200D, so Vietnamese letters and combining marks are kept.
_EMOJI_PATTERN = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
                           "]+", flags=re.UNICODE)


def remove_punctuation(comment):
  """Strip ASCII punctuation and emoji from ``comment`` and normalise whitespace.

  Args:
    comment: The raw comment text (``str``).

  Returns:
    The cleaned text: every character in ``string.punctuation`` deleted (not
    replaced by a space, so ``8/3/2019`` becomes ``832019``), every character
    matched by ``_EMOJI_PATTERN`` deleted, each run of whitespace collapsed to
    a single space, and leading/trailing whitespace removed.
  """
  # Delete punctuation characters outright.
  without_punctuation = comment.translate(_PUNCTUATION_TABLE)
  # Remove emoji BEFORE collapsing whitespace; otherwise "ok <emoji> nhe"
  # would be left with two consecutive spaces where the emoji used to be.
  without_emoji = _EMOJI_PATTERN.sub('', without_punctuation)
  # Collapse newlines, tabs and repeated spaces, then trim both ends.
  return re.sub(r'\s+', ' ', without_emoji).strip()

>Read  stop words file 

In [109]:
def read_filestopwords(path='../data/data_stopword/vietnamese-stopwords.txt'):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        path: Location of the stop-word file. Defaults to the project's
            Vietnamese stop-word list, so existing zero-argument calls still work.

    Returns:
        A list of stop words in file order. Surrounding whitespace is stripped
        from each line and blank lines are skipped, since an empty or padded
        entry could never equal a token produced by ``str.split()``.
    """
    with open(path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [word for word in stripped_lines if word]

> Remove stop words

In [110]:
# Stop-word set, loaded on first use and shared by later calls so the file is
# not re-read for every row. Re-running this cell resets it.
_STOP_WORDS_CACHE = None


def remove_stopword(comment, stop_words=None):
  """Remove stop words from a whitespace-tokenised comment.

  Args:
    comment: Text whose tokens are separated by whitespace.
    stop_words: Optional iterable of words to drop. When omitted, the list
      returned by ``read_filestopwords()`` is loaded once and cached.

  Returns:
    The remaining tokens joined by single spaces.

  NOTE(review): tokens come from ``comment.split()``, so a stop-word entry
  that contains a space can never match. Confirm the stop-word file uses the
  same word-joining convention as the tokenizer output.
  """
  global _STOP_WORDS_CACHE
  if stop_words is None:
    if _STOP_WORDS_CACHE is None:
      # A frozenset gives O(1) membership tests instead of scanning a list per token.
      _STOP_WORDS_CACHE = frozenset(read_filestopwords())
    stop_words = _STOP_WORDS_CACHE
  elif not isinstance(stop_words, (set, frozenset)):
    stop_words = frozenset(stop_words)
  return ' '.join(word for word in comment.split() if word not in stop_words)

>Remove repeated words 

In [111]:
def remove_repeated_words(text):
    """Collapse each run of identical consecutive tokens into a single token.

    Tokens are obtained with ``text.split()``; only immediately adjacent
    duplicates are removed, so ``"a b a"`` is returned unchanged.
    """
    kept = []
    for token in text.split():
        # The last kept token always equals the previous input token, so this
        # is the same "differs from its predecessor" test.
        if not kept or kept[-1] != token:
            kept.append(token)
    return ' '.join(kept)

> Execute function

In [112]:
# Lower-case each training comment, then strip punctuation and emoji, in one pass.
df_train['comment'] = df_train['comment'].apply(
    lambda text: remove_punctuation(text.lower())
)

In [113]:
# Same cleaning as for the training split: lower-case, then strip punctuation and emoji.
df_test['comment'] = df_test['comment'].apply(
    lambda text: remove_punctuation(text.lower())
)

> Tokenizer 

In [114]:
# Word-segment every comment with pyvi; in the output shown further down,
# multi-syllable words come back joined with '_'.
df_train['comment'] = df_train['comment'].apply(ViTokenizer.tokenize)
df_test['comment'] = df_test['comment'].apply(ViTokenizer.tokenize)

In [115]:
# Drop stop words from the tokenised comments of both splits.
for frame in (df_train, df_test):
    frame['comment'] = frame['comment'].apply(remove_stopword)

In [116]:
df_train['comment'].values[95:98]

array(['ƒëi·ªán_tho·∫°i kh√° t·ªët pin tr√¢u kh√° m∆∞·ª£t b·∫Øt wifi c·ª±c t·ªët th·ªânh_tho·∫£ng m√°y ƒë∆° tho√°t m√°y ko c·∫≠p_nh·∫≠t miul 12',
       'nghe b·∫£o ƒëi·ªán_tho·∫°i ch∆°i game n√≥ng mua v·ªÅ chi·∫øn li√™n_qu√¢n 3 4h li√™n_t·ª•c th·∫•y ·∫•m n√≥ng l·ªùi ƒë·ªìn th·ª© kh√°c v≈©ng ch·∫≥ng n√≥i qu√° ngon',
       '1 m√°y ch∆°i game l∆∞·ªõt web √≠t s·ª≠_d·ª•ng gi·ªØ pin cao nh·∫•t kho·∫£ng 1 ng√†y 16 ti·∫øng t√≠nh lu√¥n c·∫£ ng·ªß nh√© ng·ªß ch·∫≥ng ƒë·ªông t·ªõi 2 ng√†y mn n√≥i ƒë√¢u 2 nh·∫°c nghe c√†_gi·∫≠t c√†_gi·∫≠t kh√≥_ch·ªãu lu√¥n 3 quay video m·ªù ch√°n c√≤n okie'],
      dtype=object)

In [117]:
# Collapse immediately repeated tokens in both splits.
for frame in (df_train, df_test):
    frame['comment'] = frame['comment'].apply(remove_repeated_words)

In [118]:
df_train['comment'].values[95:98]

array(['ƒëi·ªán_tho·∫°i kh√° t·ªët pin tr√¢u kh√° m∆∞·ª£t b·∫Øt wifi c·ª±c t·ªët th·ªânh_tho·∫£ng m√°y ƒë∆° tho√°t m√°y ko c·∫≠p_nh·∫≠t miul 12',
       'nghe b·∫£o ƒëi·ªán_tho·∫°i ch∆°i game n√≥ng mua v·ªÅ chi·∫øn li√™n_qu√¢n 3 4h li√™n_t·ª•c th·∫•y ·∫•m n√≥ng l·ªùi ƒë·ªìn th·ª© kh√°c v≈©ng ch·∫≥ng n√≥i qu√° ngon',
       '1 m√°y ch∆°i game l∆∞·ªõt web √≠t s·ª≠_d·ª•ng gi·ªØ pin cao nh·∫•t kho·∫£ng 1 ng√†y 16 ti·∫øng t√≠nh lu√¥n c·∫£ ng·ªß nh√© ng·ªß ch·∫≥ng ƒë·ªông t·ªõi 2 ng√†y mn n√≥i ƒë√¢u 2 nh·∫°c nghe c√†_gi·∫≠t kh√≥_ch·ªãu lu√¥n 3 quay video m·ªù ch√°n c√≤n okie'],
      dtype=object)

> Observing, we can see that there are 3 types of labels: neutral, positive and negative

In [119]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,{CAMERA#Positive};{FEATURES#Positive};{BATTERY...
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,{BATTERY#Negative};{GENERAL#Positive};{OTHERS};
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,{FEATURES#Negative};
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,{FEATURES#Negative};{BATTERY#Neutral};{GENERAL...
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,{BATTERY#Positive};{PERFORMANCE#Positive};{SER...


> Count Features

In [120]:
def count_features(label):
    """Count the sentiment-bearing aspect tags in a raw label string.

    A tag is counted for each non-overlapping match of ``{`` followed by the
    shortest possible text up to a ``#``. Tags without ``#`` are therefore
    not counted on their own.
    """
    return sum(1 for _ in re.finditer(r'\{.*?#', label))

> Execute function

In [121]:
count_features = df_train['label'].apply(count_features)

> Find max features

In [122]:
index_max_features=count_features.idxmax()
number_of_features=df_train.loc[index_max_features,'label']
print(f"Number of features included: {number_of_features}")
print(f"Max features:  {count_features.max()}")

Number of features included: {SCREEN#Positive};{CAMERA#Positive};{FEATURES#Positive};{BATTERY#Positive};{PERFORMANCE#Positive};{DESIGN#Positive};{PRICE#Neutral};{GENERAL#Positive};{SER&ACC#Positive};
Max features:  9


In [123]:
def check_value_features(data, features=None):
    """Return True when ``data`` mentions at least one known feature name.

    Args:
        data: A raw label string such as ``"{BATTERY#Negative};{OTHERS};"``.
        features: Optional iterable of feature names to look for. When omitted,
            the names are parsed from the notebook-level label string
            ``number_of_features`` (the text after each ``{`` up to ``#`` or ``}``).

    Returns:
        ``True`` if any feature name occurs in ``data``, otherwise ``False``.

    The previous version looped over ``number_of_features`` directly. That is a
    string, so it compared single characters and was true for virtually any
    label. Rows whose tags are not in the reference list are now reported as
    not containing a feature.
    """
    if features is None:
        features = re.findall(r'\{([^#{}]+)', number_of_features)
    return any(feature in data for feature in features)

>Execute Function

In [124]:
# Flag each row by whether its label passes check_value_features.
df_train['contains_features'] = df_train['label'].apply(check_value_features)
print(df_train['contains_features'].count())
# Show the rows that do not contain at least one of the listed features.
rows_without_features = df_train.loc[~df_train['contains_features']]
print(rows_without_features)

8898
Empty DataFrame
Columns: [index, comment, n_star, date_time, label, contains_features]
Index: []


> Functions that count how many aspects in a label are tagged Positive, Neutral, or Negative

In [125]:
def _count_sentiment(label, sentiment):
    """Number of non-overlapping occurrences of ``sentiment`` in ``label``."""
    return label.count(sentiment)


def count_positive_labels(label):
    """Count how many times "Positive" appears in the label string."""
    return _count_sentiment(label, "Positive")


def count_neural_labels(label):
    """Count how many times "Neutral" appears in the label string."""
    return _count_sentiment(label, "Neutral")


def count_negative_labels(label):
    """Count how many times "Negative" appears in the label string."""
    return _count_sentiment(label, "Negative")

> Execute function

In [126]:
# Add one count column per sentiment to both splits, derived from the raw label string.
for frame in (df_train, df_test):
    raw_labels = frame['label']
    frame['positive_count'] = raw_labels.apply(count_positive_labels)
    frame['neutral_count'] = raw_labels.apply(count_neural_labels)
    frame['negative_count'] = raw_labels.apply(count_negative_labels)

> Assign label

In [127]:
def assign_label(row):
    """Collapse per-sentiment counts into a single overall label.

    ``row`` must support ``row['positive_count']``, ``row['neutral_count']``
    and ``row['negative_count']``. Rules, checked in order:

    1. positive strictly greater than both others -> 'Positive'
    2. negative strictly greater than both others -> 'Negative'
    3. negative equals neutral                    -> 'Negative'
    4. neutral equals positive                    -> 'Positive'
    5. anything else (neutral strictly largest, or positive == negative
       with both above neutral)                   -> 'Neutral'

    NOTE(review): rule 3 also fires when all three counts are equal, including
    0/0/0, so a row with no sentiment tags at all is labelled 'Negative'.
    Confirm this is intended.
    """
    positive = row['positive_count']
    neutral = row['neutral_count']
    negative = row['negative_count']

    if neutral < positive > negative:
        return 'Positive'
    if neutral < negative > positive:
        return 'Negative'
    if negative == neutral:
        return 'Negative'
    if neutral == positive:
        return "Positive"
    return "Neutral"

>Execute function

In [128]:
# Replace the raw aspect string in 'label' with one overall sentiment per row.
# NOTE: the *_count columns are computed from the raw string, so re-running
# the counting cell after this one would count from the collapsed label instead.
for frame in (df_train, df_test):
    frame['label'] = frame.apply(assign_label, axis=1)

In [129]:
df_train.head(8)

Unnamed: 0,index,comment,n_star,date_time,label,contains_features,positive_count,neutral_count,negative_count
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,Positive,True,6,0,0
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral,True,1,0,1
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative,True,0,0,1
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral,True,0,2,1
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive,True,2,0,1
5,5,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive,True,3,2,0
6,6,m√¨nh m·ªõi x√†i 7 th√°ng xu·ªëng 7 pin ch·∫£ hi·ªÉu m√°y ...,1,1 tu·∫ßn tr∆∞·ªõc,Negative,True,0,0,1
7,7,h√¥m_qua_ng√†y 2362020 e th·∫ø_gi·ªõi di_ƒë·ªông mua dt...,2,23/06/2020,Negative,True,0,0,1


> 

In [130]:
df_train.columns

Index(['index', 'comment', 'n_star', 'date_time', 'label', 'contains_features',
       'positive_count', 'neutral_count', 'negative_count'],
      dtype='object')

In [131]:
df_train.shape

(8898, 9)

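> Observation: the date_time column is inconsistently formatted — most values are absolute dates (e.g. 14/09/2019), but some are relative expressions (e.g. "2 weeks ago").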

In [132]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label,contains_features,positive_count,neutral_count,negative_count
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,Positive,True,6,0,0
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral,True,1,0,1
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative,True,0,0,1
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral,True,0,2,1
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive,True,2,0,1


> Function Check datetime

In [133]:
def is_valid_date(date_str):
    """Tell whether ``date_str`` starts with a ``d/m/yyyy`` style date.

    Args:
        date_str: The value to check. Non-string values (for example a missing
            value read as ``NaN``) are reported as invalid instead of raising
            ``TypeError`` inside ``re.match``.

    Returns:
        ``True`` when the string begins with 1-2 digits, ``/``, 1-2 digits,
        ``/`` and 4 digits; ``False`` otherwise. Only the format is checked,
        not whether the day/month numbers form a real calendar date, and
        ``re.match`` anchors at the start only, so trailing text is accepted.
    """
    if not isinstance(date_str, str):
        return False
    # Regular expression that checks the date format.
    date_pattern = r'\d{1,2}/\d{1,2}/\d{4}'
    return bool(re.match(date_pattern, date_str))

> Execute function

In [134]:
# Rows whose date_time passes is_valid_date, for each split.
train_date_ok = df_train['date_time'].apply(is_valid_date)
test_date_ok = df_test['date_time'].apply(is_valid_date)
valid_dates = df_train.loc[train_date_ok]
valid_dates_test = df_test.loc[test_date_ok]

> We can see that there are 7927 valid values

In [135]:
valid_dates.shape

(7927, 9)

In [136]:
valid_dates.head(5)

Unnamed: 0,index,comment,n_star,date_time,label,contains_features,positive_count,neutral_count,negative_count
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral,True,1,0,1
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative,True,0,0,1
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral,True,0,2,1
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive,True,2,0,1
5,5,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive,True,3,2,0


In [137]:
# Complement of the valid-date selection: rows whose date_time fails is_valid_date.
train_date_bad = ~df_train['date_time'].apply(is_valid_date)
test_date_bad = ~df_test['date_time'].apply(is_valid_date)
invalid_dates = df_train.loc[train_date_bad]
invalid_dates_test = df_test.loc[test_date_bad]

> There are 971 values in the date_time column, out of a total of 8898, that have inconsistent formatting

In [138]:
invalid_dates.shape

(971, 9)

In [139]:
invalid_dates.head(5)

Unnamed: 0,index,comment,n_star,date_time,label,contains_features,positive_count,neutral_count,negative_count
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,Positive,True,6,0,0
6,6,m√¨nh m·ªõi x√†i 7 th√°ng xu·ªëng 7 pin ch·∫£ hi·ªÉu m√°y ...,1,1 tu·∫ßn tr∆∞·ªõc,Negative,True,0,0,1
11,11,kh√° ·ªïn trong t·∫ßm gi√° cam ƒë·∫πp s·∫°c nhanh m√†n_h√¨n...,5,3 tu·∫ßn tr∆∞·ªõc,Positive,True,4,2,0
14,14,m√¨nh mua dc 1 tu·∫ßn m√°y ph√°t tr·ª±c_ti·∫øp t·ª±_nhi√™n...,1,6 ng√†y tr∆∞·ªõc,Negative,True,0,0,3
18,18,d√πng ƒë∆° m√°y b·∫•m m√£i m·ªõi kh·ªüi_ƒë·ªông song n√≥ng ra...,1,5 ng√†y tr∆∞·ªõc,Negative,True,0,0,1


> Replace the inconsistently formatted dates with the most common (mode) valid date

In [140]:
# Most frequent well-formed date in each split.
common_value = valid_dates['date_time'].mode()[0]
common_value_test = valid_dates_test['date_time'].mode()[0]
# assign() returns new frames, so nothing is written into frames that were
# produced by boolean filtering (which triggers SettingWithCopyWarning).
invalid_dates = invalid_dates.assign(date_time=common_value)
# NOTE(review): the test split is filled with the TRAIN mode, as before, and
# `common_value_test` is never used. Confirm which value is intended.
invalid_dates_test = invalid_dates_test.assign(date_time=common_value)

In [141]:
invalid_dates.head(5)

Unnamed: 0,index,comment,n_star,date_time,label,contains_features,positive_count,neutral_count,negative_count
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,21/04/2020,Positive,True,6,0,0
6,6,m√¨nh m·ªõi x√†i 7 th√°ng xu·ªëng 7 pin ch·∫£ hi·ªÉu m√°y ...,1,21/04/2020,Negative,True,0,0,1
11,11,kh√° ·ªïn trong t·∫ßm gi√° cam ƒë·∫πp s·∫°c nhanh m√†n_h√¨n...,5,21/04/2020,Positive,True,4,2,0
14,14,m√¨nh mua dc 1 tu·∫ßn m√°y ph√°t tr·ª±c_ti·∫øp t·ª±_nhi√™n...,1,21/04/2020,Negative,True,0,0,3
18,18,d√πng ƒë∆° m√°y b·∫•m m√£i m·ªõi kh·ªüi_ƒë·ªông song n√≥ng ra...,1,21/04/2020,Negative,True,0,0,1


> combine invalid_dates and valid_dates

In [142]:
# Stack the untouched rows and the back-filled rows into one frame per split.
# Valid rows come first, so the original row order is not preserved; the
# index is renumbered from 0.
df_train = pd.concat([valid_dates, invalid_dates], ignore_index=True)
df_test = pd.concat([valid_dates_test, invalid_dates_test], ignore_index=True)

In [143]:
df_train.shape

(8898, 9)

In [144]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label,contains_features,positive_count,neutral_count,negative_count
0,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral,True,1,0,1
1,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative,True,0,0,1
2,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral,True,0,2,1
3,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive,True,2,0,1
4,5,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive,True,3,2,0


>Remove index and contains_features columns 

In [145]:
# Drop the columns that are no longer needed.
df_train = df_train.drop(columns=["index", 'contains_features'])
df_test = df_test.drop(columns=['index'])

In [146]:
df_train.shape

(8898, 7)

In [147]:
df_train.head(5)

Unnamed: 0,comment,n_star,date_time,label,positive_count,neutral_count,negative_count
0,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral,1,0,1
1,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative,0,0,1
2,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral,0,2,1
3,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive,2,0,1
4,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive,3,2,0


In [148]:
df_train['comment'][0]

'pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin c√≤n 88 ai gi·ªëng t√¥i'

> Dump file CSV after data processing

In [149]:
# Write both processed splits to CSV without the index column.
# NOTE(review): the test filename is spelled "testprocesssed" (three s's). It is
# left as is because other code may read this exact path; confirm before renaming.
df_train.to_csv("../data/data_processed/trainprocessed.csv", index=False)
df_test.to_csv("../data/data_processed/testprocesssed.csv", index=False)