In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# RAW
# Load the raw dataset. The file is read without a header row, so the column
# names are assigned explicitly afterwards (the assignment also fails loudly if
# the file does not have exactly six columns).
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; backslash separators only work on Windows.
dataFrame_raw = pd.read_csv("../../data/raw/dataset.csv", encoding="ISO-8859-1", header=None)
dataFrame_raw.columns = ["label", "time", "date", "query", "username", "text"]

# Only the label and the text columns are used by the following cells.
dataFrame = dataFrame_raw[["label", "text"]]

In [8]:
# Reduce the size of the data: keep only the first 1/DOWNSAMPLE_FACTOR of the
# rows of each class (label 4 = positive, label 0 = negative).
# The subsets are taken from dataFrame_raw rather than from dataFrame, because
# this cell overwrites dataFrame; reading from the raw frame keeps the cell
# idempotent, so re-running it does not shrink the data a second time.
DOWNSAMPLE_FACTOR = 40

labelled_text = dataFrame_raw[["label", "text"]]
dataFrame_positive = labelled_text[labelled_text["label"] == 4]
dataFrame_negative = labelled_text[labelled_text["label"] == 0]

dataFrame_positive = dataFrame_positive.iloc[:len(dataFrame_positive) // DOWNSAMPLE_FACTOR]
dataFrame_negative = dataFrame_negative.iloc[:len(dataFrame_negative) // DOWNSAMPLE_FACTOR]

# Positive rows first, then negative rows; the original row index is preserved.
dataFrame = pd.concat([dataFrame_positive, dataFrame_negative])

In [9]:
# Tokenize
from nltk.tokenize import TweetTokenizer

tk = TweetTokenizer(reduce_len=True)

# Build a list of (tokens, binary_label) pairs: label 4 becomes 1, every other
# label becomes 0.
# The two columns are iterated directly instead of using DataFrame.iterrows(),
# which constructs a Series object for every single row.
data = [
    (tk.tokenize(text), 1 if label == 4 else 0)
    for label, text in zip(dataFrame["label"], dataFrame["text"])
]

In [25]:
# Lemmatize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_token(token):
    """Return the WordNet lemma of a single token.

    The token is POS-tagged on its own (without sentence context) and the
    prefix of the resulting tag selects the part of speech handed to the
    lemmatizer: ``NN*`` -> noun, ``VB*`` -> verb, anything else -> adjective.

    Args:
        token: The word to lemmatize (a string).

    Returns:
        The value returned by ``WordNetLemmatizer.lemmatize`` for the token.
        An empty token is returned unchanged without invoking the tagger.
    """
    # Nothing to tag or lemmatize; returning early keeps empty strings away
    # from the tagger.
    if not token:
        return token

    # pos_tag works on a list of tokens and yields (word, tag) pairs, so the
    # single token is wrapped in a list and the only result is unpacked.
    word, tag = pos_tag([token])[0]

    if tag.startswith("NN"):
        pos = 'n'
    elif tag.startswith("VB"):
        pos = 'v'
    else:
        # Every tag that is neither a noun nor a verb is treated as an adjective.
        pos = 'a'

    return WordNetLemmatizer().lemmatize(word, pos)

In [11]:
# Clean
# Shorthand and noise tokens mapped to the text that replaces them.
# An empty replacement marks a token that should be discarded by the caller.
_TOKEN_EXPANSIONS = {
    'u': 'you',
    'r': 'are',
    'some1': 'someone',
    'yrs': 'years',
    'hrs': 'hours',
    'mins': 'minutes',
    'secs': 'seconds',
    'pls': 'please',
    'plz': 'please',
    '2morow': 'tomorrow',
    '2day': 'today',
    '4got': 'forget',
    '4gotten': 'forget',
    'amp': '',
    'quot': '',
    'lt': '',
    'gt': '',
    '½25': '',
}


def expand_token(token):
    """Expand a known abbreviation, or blank out a known noise token.

    Args:
        token: A single token (string). Matching is exact and case-sensitive.

    Returns:
        The replacement text when the token is a known shorthand, the empty
        string when it is a known noise token, otherwise the token unchanged.
    """
    return _TOKEN_EXPANSIONS.get(token, token)

In [40]:
#Remove noise
import re, string
from nltk.corpus import stopwords
# English stop words from NLTK's stopwords corpus; clean_tokens drops every
# token that is found in this collection.
STOP_WORDS = stopwords.words('english')

# Both patterns are raw strings so that the regex escapes (\( \) \.) are not
# interpreted as (invalid) Python string escapes, and they are compiled once
# instead of being passed to re.sub for every token.

# Matches an http:// or https:// URL.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Matches an @mention at the start of the token or after a character that is
# not a letter, digit, '-', '_' or '.'.
_MENTION_PATTERN = re.compile(
    r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)"
)


def clean_tokens(tweet_tokens):
    """Normalize the tokens of one tweet and drop the uninformative ones.

    For every token, in order: URLs and @mentions are stripped, the remainder
    is lower-cased, known shorthand is expanded via ``expand_token`` and the
    result is lemmatized via ``lemmatize_token``. Tokens that end up empty, are
    found in ``string.punctuation`` or are listed in ``STOP_WORDS`` are
    discarded.

    Args:
        tweet_tokens: Iterable of token strings.

    Returns:
        A new list with the cleaned tokens, in their original order.
    """
    cleaned_tokens = []

    for token in tweet_tokens:
        token = _URL_PATTERN.sub('', token)
        token = _MENTION_PATTERN.sub('', token)

        # The token consisted only of a URL or a mention.
        if not token:
            continue

        token = expand_token(token.lower())

        # expand_token maps known noise tokens to ''; skip them explicitly
        # rather than sending an empty string through the lemmatizer.
        if not token:
            continue

        token = lemmatize_token(token)

        # NOTE: string.punctuation is a str, so this is a substring test: it
        # removes single punctuation marks (and the empty string), while a
        # multi-character token such as '...' is kept.
        if token not in string.punctuation and token not in STOP_WORDS:
            cleaned_tokens.append(token)

    return cleaned_tokens

# Sanity check on a hand-written example; the recorded output of this cell is
# ['love', 'three', 'year'].
clean_tokens(["he", "wouldn't", "love", "u", "in", "three", "yrs"])

['love', 'three', 'year']

In [13]:
def list_to_dict(cleaned_tokens):
    """Turn a sequence of tokens into a ``{token: True}`` feature dictionary.

    Args:
        cleaned_tokens: Iterable of hashable tokens.

    Returns:
        A dict with one key per distinct token, in order of first occurrence,
        each mapped to ``True``.
    """
    return dict.fromkeys(cleaned_tokens, True)

# Convert every (tokens, label) pair into a (feature dict, label) pair: the
# tokens are cleaned first and then turned into a {token: True} dictionary.
final_data = [
    (list_to_dict(clean_tokens(tweet_tokens)), sentiment)
    for tweet_tokens, sentiment in data
]

# Show the first three entries as a quick check.
print(final_data[:3])

[({'love': True, 'guy': True, 'best': True}, 1), ({'im': True, 'meeting': True, 'one': True, 'besties': True, 'tonight': True, 'cant': True, 'wait': True, 'girl': True, 'talk': True}, 1), ({'thanks': True, 'twitter': True, 'add': True, 'sunisa': True, 'get': True, 'meet': True, 'hin': True, 'show': True, 'dc': True, 'area': True, 'sweetheart': True}, 1)]
