In [27]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
import os
import codecs
import glob

I have labeled Arabic sentiment-analysis documents from both UCI and a Kaggle repository. This first section joins the UCI tweets to the Kaggle training-set data.

In [89]:
# Root folder of the UCI corpus. Set the UCI_DATA_DIR environment variable to
# run the notebook on another machine; when it is unset the original local
# path is used, so existing behavior is unchanged.
uci_dir = os.environ.get(
    "UCI_DATA_DIR", "/Users/evanwilliams/Projects/Arabic_Twitter_Sentiment/UCI"
)
# One sub-folder per label.
pos_path = os.path.join(uci_dir, "Positive")
neg_path = os.path.join(uci_dir, "Negative")

In [91]:
def get_text(path):
    """
    Read one UCI document and return its full content as a single string.

    Bytes that are not valid UTF-8 are dropped instead of raising a
    UnicodeDecodeError. Whitespace and line endings are returned exactly as
    stored in the file.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    str
        The decoded file content (empty string for an empty file).
    """
    # codecs.open with errors="ignore" skips undecodable bytes; a plain
    # open(path, 'r') would raise on them.
    with codecs.open(path, "r", encoding="utf-8", errors="ignore") as myfile:
        # Splitting on single spaces and joining with single spaces gives back
        # the very same string, so the content is returned directly.
        return myfile.read()

In [92]:
def get_data(path):
    """
    Read every ``*.txt`` file located directly inside ``path`` with ``get_text``.

    Parameters
    ----------
    path : str
        Directory to search. Sub-directories are not searched.

    Returns
    -------
    list of str
        One string per matching file, ordered by sorted file path. The list
        is empty when no file matches.
    """
    # glob returns matches in arbitrary, OS-dependent order; sorting makes the
    # row order of the resulting data reproducible between runs and machines.
    txt_files = sorted(glob.glob(os.path.join(path, '*.txt')))
    # A separate loop name keeps the ``path`` argument from being overwritten.
    return [get_text(txt_file) for txt_file in txt_files]

In [95]:
# Load both label folders with the same reader: positive first, then negative.
positive_text, negative_text = map(get_data, (pos_path, neg_path))

In [101]:
def append_UCI(positive_text, negative_text):
    """
    Build one labeled DataFrame from the positive and negative documents.

    Parameters
    ----------
    positive_text : list of str
        Documents to label "positive".
    negative_text : list of str
        Documents to label "negative".

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``sentiment``; the positive rows come first,
        followed by the negative rows, under a fresh 0..n-1 index.
    """
    pos = pd.DataFrame(positive_text, columns=["text"]).assign(sentiment="positive")
    neg = pd.DataFrame(negative_text, columns=["text"]).assign(sentiment="negative")
    # ignore_index=True (as tsv_merge does) avoids repeated row labels: without
    # it both halves keep their own 0-based index and label lookups become
    # ambiguous.
    return pd.concat([pos, neg], ignore_index=True)

In [102]:
# Single labeled UCI frame: positive documents first, then negative ones.
uci_df = append_UCI(positive_text, negative_text)

In [130]:
def tsv_merge(path):
    """
    Load every ``*.tsv`` file directly inside ``path`` and stack them into one frame.

    Each file is read with ``pd.read_table`` and its two columns are renamed
    to ``sentiment`` and ``text`` (in that order) before concatenation.

    Parameters
    ----------
    path : str
        Directory containing the ``.tsv`` files.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, in sorted file order, with columns
        ``sentiment`` and ``text`` and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no ``.tsv`` file is found in ``path``, or if a file does not have
        exactly two columns.
    """
    # Sorted so the row order does not depend on the arbitrary order in which
    # glob lists files; os.path.join matches how get_data builds its pattern.
    tsv_files = sorted(glob.glob(os.path.join(path, "*.tsv")))
    if not tsv_files:
        # pd.concat on an empty list also raises ValueError, but with a message
        # that does not say which folder was empty.
        raise ValueError("No .tsv files found in {!r}".format(path))

    frames = []
    for filename in tsv_files:
        # NOTE(review): read_table's default treats the first line of each file
        # as a header row, which is then replaced below - confirm the files
        # really start with a header, otherwise one data row per file is lost.
        frame = pd.read_table(filename, index_col=None)
        frame.columns = ['sentiment', 'text']
        frames.append(frame)
    return pd.concat(frames, axis=0, ignore_index=True)

Time to bring in the Kaggle data and merge the two. The column names and sentiment labels are changed to be consistent with the UCI data.

In [137]:
# Load every *.tsv file found in ../Data (resolved against the current working
# directory) into one frame with columns 'sentiment' and 'text'.
kaggle = tsv_merge("../Data")

In [138]:
# Map the short labels "neg"/"pos" onto the full words used in the UCI frame.
# The nested dict restricts the replacement to the 'sentiment' column.
cleanup_sentiment = {"sentiment": {"neg": "negative", "pos": "positive"}}
# Rebind the name instead of using inplace=True, so the frame returned by
# tsv_merge is never mutated and the cell reads as a plain transformation.
kaggle = kaggle.replace(cleanup_sentiment)

In [139]:
# Stack the Kaggle and UCI rows; columns are aligned by name. ignore_index=True
# gives the combined frame a fresh 0..n-1 index instead of repeating the row
# labels each input frame already carries, which would make label-based
# lookups return several rows.
df = pd.concat([kaggle, uci_df], ignore_index=True)