<a href="https://colab.research.google.com/github/NavePnow/Auto-Hashtag-of-Social-Posts/blob/master/Generate_Formal_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import re
import sys
import pandas as pd
import numpy as np
"""Reference
https://www.webopedia.com/quick_ref/Twitter_Dictionary_Guide.asp
https://www.socialmediatoday.com/content/top-twitter-abbreviations-you-need-know
https://digiphile.info/2009/06/11/top-50-twitter-acronyms-abbreviations-and-initialisms/
https://bitrebels.com/social/twitter-dictionary-35-twitter-abbreviations/
"""
class TextProcessor:
    """ class TextProcessor is a class dealing with multiple methods for processing data
    Args:
        in_dir: working directory
        dictionary_file: dictionary file includes special Twitter terms
        hashtag_file: hashtag file includes all hashtags we need to classify
    """

    def __init__(self, in_dir, dictionary_file, hashtag_file):
        self.in_dir = in_dir
        self.hashtag = set()
        self.dictionary = {}
        self.dictionary_file = dictionary_file
        self.hashtag_file = hashtag_file

    """ load local dictionary and build index
    """

    def load_dictioanry(self):
        print('loading dictionary...')
        if (not os.path.exists(self.in_dir)):
            print("wrong file path!")
            sys.exit(2)
        f = open(self.in_dir+"/"+self.dictionary_file)

        # load dictionary and build index
        for line in iter(f):
            line = line.split(' ', 1)
            if line[0].lower() not in self.dictionary:
                self.dictionary[line[0].lower()] = line[1].replace(
                    '\n', '').lower()
        print('load dictionary successfully')

    """ load local hashtag and build set
    """

    def load_hashtag(self):
        print('loading hashtag...')
        if (not os.path.exists(self.in_dir)):
            print("wrong file path!")
            sys.exit(2)
        f = open(self.in_dir+"/"+self.hashtag_file)

        # load dictionary and build index
        for line in iter(f):
            if line.lower() not in self.hashtag:
                self.hashtag.add(line.lower().replace(
                    '\n', '').replace('#', ''))
        print('load hashtag successfully')

    """ Irrelevant hashtag filtering
    Args:
        text: text to be filtered
    
    Returns:
        ' '.join(rst): filtered hashtag
    """

    def del_hashtag(self, text):
        tmp_list = str(text).split(',')
        rst = []
        for i in range(len(tmp_list)):
            if (tmp_list[i].lower().replace(' ', '') in self.hashtag):
                rst.append(tmp_list[i].lower().replace(' ', ''))
        return ', '.join(rst)

    """ Informal language normalization
    Args:
        text: text to be normailzed
    
    Returns:
        ' '.join(tmp_list): normalized text
    """

    def informal_norm(self, text):
        tmp_list = text.split()
        for i in range(len(tmp_list)):
            if (tmp_list[i].lower() in self.dictionary):
                tmp_list[i] = self.dictionary[tmp_list[i].lower()]
        return ' '.join(tmp_list)

    """ Irrelevant text tokens filtering
    Args:
        text: text to be filtered
    
    Returns:
        text: filtered text
    """

    def cleanup(self, text):
        # drop http[s]://*
        text = re.sub(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', str(text))

        # drop something liek @EP_President
        text = re.sub(u"\@.*?\s", '', str(text))

        # drop # of hashtag within sentence
        text = text.replace('#', ' ')

        #  remove emojis
        text = text.encode('ascii', 'ignore').decode('ascii')
        
        return text

    """ drop tweets whose length <= 3
    Args:
        text: text to be filtered
    
    Returns:
        text: filtered text
    """

    def drop_tweet(self, text):
        if (len(str(text).split()) <= 3):
            return ''
        else:
            return text

In [0]:
class TextClassifier:
    """ class TextClassifier is a class dealing with text classification
    Args:
        in_dir: working directory
        dictionary_file: dictionary file includes special Twitter terms
        hashtag_file: hashtag file includes all hashtags we need to classify
        input_file: input file is csv format containing crawl data
        model_dir: directory of model output
    """

    def __init__(self, in_dir, dictionary_file, hashtag_file, input_file, model_dir="model_out"):
        self.in_dir = in_dir
        self.hashtag = set()
        self.dictionary = {}
        self.dictionary_file = dictionary_file
        self.hashtag_file = hashtag_file
        self.input_file = input_file
        self.train_file = None
        self.valid_file = None
        self.model_dir = model_dir
    
    """ load and process the raw crawl data
    Returns:
        dat.drop(columns=['hashtag']): formal format of data
    """

    def load_raw_data(self):
        textprocessor = TextProcessor(
            self.in_dir, self.dictionary_file, self.hashtag_file)
        textprocessor.load_dictioanry()
        textprocessor.load_hashtag()
        dat = pd.read_csv(
            self.in_dir + '/' + self.input_file, header=None)

        dat.columns = ['tweet', 'hashtag']
        # n = len(dat)
        # nlist = range(0,n)
        dat['id'] = None
        dat = dat[['id', 'tweet', 'hashtag']]

        total = ['id', 'tweet', 'hashtag']
        total = total + list(textprocessor.hashtag)
        dat = dat.reindex(columns=list(total), fill_value=0)
        dat['tweet'] = dat['tweet'].apply(textprocessor.cleanup)
        dat['tweet'] = dat['tweet'].apply(textprocessor.informal_norm)

        dat['hashtag'] = dat['hashtag'].apply(textprocessor.del_hashtag)
        dat = dat.drop(dat[dat['hashtag'].map(len) <
                           1].index).reset_index(drop=True)

        dat['tweet'] = dat['tweet'].apply(textprocessor.drop_tweet)
        dat = dat.drop(dat[dat['tweet'].map(len) <
                           1].index).reset_index(drop=True)
        n = len(dat)
        nlist = range(0, n)
        dat['id'] = nlist

        # assign label
        for i in range(len(dat['hashtag'])):
            tmp_list = dat['hashtag'][i].split(",")
            for j in range(len(tmp_list)):
                tmp_list[j] = tmp_list[j].replace(' ', '')
                dat[tmp_list[j]][i] = 1
        return dat.drop(columns=['hashtag'])

    """ split the data into train and valid data
    Args:
        data: formal processed crawl data
    """

    def split_data(self,data):
        self.train_file, self.valid_file = np.split(
            data.sample(frac=1), [int(.7*len(data))])
    
    """ save the train and valid data into file
    """
        
    def save_to_file(self):
        self.train_file.to_csv(self.in_dir + '/' +
                               "train.csv", index=False, header=True)
        self.valid_file.to_csv(self.in_dir + '/' +
                               "valid.csv", index=False, header=True)

    """ make prediction given text
    Args:
        text: free text to be classified
    
    Returns:
        rst_list: a list contains top 7 hashtags accroding to the classification
    """
    def predict(self, text):
        predictor = BertClassificationPredictor(
            model_path=self.in_dir + '/' +self.model_dir,
            label_path=self.in_dir + '/labels',  # location for labels.csv file
            multi_label=True,
            # model_type='xlnet',
            do_lower_case=True)
        prediction = predictor.predict(str(text))[:7]
        rst_list = []
        for i in range(len(prediction)):
            rst_list.append(prediction[i][0])
        return rst_list

In [3]:
!git clone https://github.com/NavePnow/multi-label-classification.git

fatal: destination path 'multi-label-classification' already exists and is not an empty directory.


In [4]:
textclassifier = TextClassifier(
    '/content/multi-label-classification', 'dictionary.txt', 'hashtag.txt', 'input.train.text.csv')
data = textclassifier.load_raw_data()
data
textclassifier.split_data(data)
textclassifier.save_to_file()

loading dictionary...
load dictionary successfully
loading hashtag...
load hashtag successfully


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
