In [7]:
import pandas as pd
import numpy as np
import csv
import re

# read schema 
schema_rawfile = pd.read_csv("twitter-swisscom/schema_home.txt", header=None, sep='\s+')
# extract schema columns
data_columns = schema_rawfile[1].values

from nltk import word_tokenize
from string import punctuation

In [3]:
col_list = [0, 3, 10, 11]

## Emoji

**Handcrafted two sets of Emojis Happy vs Sad**

Compile the unicode of those Emojis in regex.

In [4]:
emoji_happy = re.compile(
    u"(\U0001f60a)|"  # 😊
    u"(\u2764\ufe0ff)|"  # ❤️
    u"(\u2764)|"  # ❤
    u"(\U0001f601)|"  # 😊
    u"(\U0001f600)|"  # 😀
    u"(\U0001f602)|"  # 😁
    u"(\U0001f609)|"  # 😉
    u"(\U0001f60b)|"  # 😋
    u"(\U0001f60e)|"  # 😎
    u"(\U0001f618)|"  # 😘
    u"(\U0001f60d)|"  # 😍
    u"(\U0001f617)|"  # 😗
    u"(\u263a\ufe0f)"  # ☺️
    "+", flags=re.UNICODE)

emoji_sad = re.compile(
    u"(\U0001f610)|"  # 
    u"(\U0001f61e)|"  # 
    u"(\U0001f61f)|"  # 
    u"(\U0001f622)|"  # 
    u"(\U0001f624)|"  # 
    u"(\U0001f62d)|"  # 
    u"(\U0001f629)|"  # 
    u"(\U0001f628)|"  # 
    u"(\U0001f620)|"  # 
    u"(\U0001f621)|"  # 
    u"(\U0001f635)|"  # 
    u"(\U0001f631)|"  # 
    u"(\U0001f611)|"  # 
    u"(\U0001f60f)|"  #
    u"(\U0001f623)|"  # 
    u"(\U0001f625)|"  # 
    u"(\U0001f910)|"  # 
    u"(\U0001f62b)|"  # 
    u"(\U0001f612)|"  # 
    u"(\U0001f613)|"  # 
    u"(\U0001f614)|"  # 
    u"(\u2639\ufe0f)|" # 
    u"(\U0001f641)|"  # 
    u"(\U0001f616)"  # 
    "+", flags=re.UNICODE)

## Emoticons

regex to match and extract happy and sad emoticons from text.

**Happy: 
 :p 
 :3 
 :* 
 :)) 
 (: 
 :-) 
 :-P 
 :-p 
 :-3 
 :-* 
 :^) 
 :^P 
 :^p 
 :^3 
 :^* 
 (: 
 (-: 
 (^: < 3**
 
 **Sad:
 :O
 :|
 :/
 :\**
 :$
 :((
 :-(
 :-O
 :-|
 :-/
 :-\
 :-$
 :^(
 :^O
 :^|
 :^/
 :^\
 :^$
 ):
 )-:
 )^:

In [5]:
emoticon_happy = re.compile("(\:\w+\:|\^\^|\<[\/\\]?3|[\(|][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DPp\*\)]+)(?=\s|[\!\.\?]|$)")
emoticon_sad = re.compile("(\:\w+\:|[\)][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[O\$\\\(\/\|]+)(?=\s|[\!\.\?]|$)")

## Extract Happy and Sad Tweets

The extract_tweets function first strip and sanitize the tweets in dataframe, then extract and save the tweets to text files based on emoji/emoticon regex matching.

In [6]:
def extract_tweets(df):
    NaN_text = df.text.isnull()
    df = df[~ NaN_text]

    happy_tweets = ""
    sad_tweets = ""

    for index, row in df.iterrows():
        # match url in string and replace with "" empty string
        t = re.sub(r"http\S+", " ", row['text'])
        # remove @username
        t = re.sub('@[^\s]+', " ", t)
        # find all hashtags
        tag_list = re.findall(r'#(\w+)', t)
        # remove hashtags
        t = re.sub(r'#(\w+)', " ",  t)

        if (emoji_happy.search(t) or emoticon_happy.search(t)):
            t = emoticon_happy.sub(' ', t)
            t = emoji_happy.sub(' ', t)
            t = emoticon_sad.sub(' ', t)
            t = emoji_sad.sub(' ', t)
            happy_tweets += (t + '\n')
            
        if (emoji_sad.search(t) or emoticon_sad.search(t)):
            t = emoticon_sad.sub(' ', t)
            t = emoji_sad.sub(' ', t)
            t = emoticon_happy.sub(' ', t)
            t = emoji_happy.sub(' ', t)
            sad_tweets += (t + '\n')
    # print(happy_tweets)
    # print("========================================================================")
    # print(sad_tweets)
    
    text_file = open("happy_tweets_line.txt", "a")
    text_file.write(happy_tweets)
    text_file.close()
    
    text_file2 = open("sad_tweets_line.txt", "a")
    text_file2.write(sad_tweets)
    text_file2.close()
    return 

In [7]:
# Apply the extract_tweets function on the 26 million tweet data, we decide to chunk the data in 2000 pieces.
for data in pd.read_table(open("data_clean/twex.tsv", 'rU'),sep='\t',encoding='utf-8',escapechar="\\",na_values='N', index_col=0,quoting=csv.QUOTE_NONE, header=None, names=data_columns, chunksize=10000, engine='c', usecols=col_list):
    extract_tweets(data)

  if __name__ == '__main__':


In [3]:
# Check if the tweets have been extracted correctly.
sad_line = pd.read_csv("sad_tweets_line.txt", header=None, sep='\\n', names=["text"])

  from ipykernel import kernelapp as app


In [4]:
sad_line

Unnamed: 0,text
0,leider werden die rumfahrenden nicht verschrot...
1,bye bye 4-hour Workweek back to 9-5 tomorrow...
2,"Cashier in the Apple Store: ""This (adaptor) is..."
3,Sometimes I don't quite live up to my gadgetee...
4,There are strange days. This was one of them.
5,sorry - heute ganzer Tag Workshop im Software ...
6,"Hm, die grosse Einkaufstour bleibt fällt wohl ..."
7,Danke für alle die mich unterstützt haben! Es ...
8,Obligatoire pour son Iphone (juste après talki...
9,"Argh, das ist ja ärgerlich Wie siehts beim M..."


## Tweets extraction Conclusion

**happy tweets extracted**

76M

→ wc -l happy_tweets_line.txt
  1767108 happy_tweets_line.txt

**sad tweets extracted**

12M

→ wc -l sad_tweets_line.txt
  256019 sad_tweets_line.txt


In [8]:
# Accuracy testing

counter = 0

for index, row in sad_line.iterrows():
    score = 0
        
        # match url in string and replace with "" empty string
    t = re.sub(r"http\S+", " ", row['text'])
        # remove @username
    t = re.sub('@[^\s]+', " ", t)
        # remove hashtags
    t = re.sub(r'#(\w+)', " ",  t)

    tbl = str.maketrans({ord(ch):" " for ch in punctuation})
        
    tokens = t.translate(tbl).split()
            
    for it in tokens:
        if (it in happy_lexicon):
            score += 1
        if (it in sad_lexicon):
            score -= 1
    
    if(score > 0):
        counter = (counter + 1)

print(counter)

NameError: name 'happy_lexicon' is not defined