# Exploritory data analasys

In [1]:
import numpy as np
import pandas as pd

from constants import *

In [2]:
def remove_tags(df):
  print('Removing tags...')
  df['text'] = df['text'].str.replace('<[\w]*>', '')

In [3]:
def read_data(list_, submission=False):
  if not submission:
    if len(list_) == 2:
      for i, file_name in enumerate(list_):
        with open(file_name) as f:
          content = f.read().splitlines()
        df = pd.DataFrame(columns=['text', 'label'],
                          data={'text': content,
                                'label': np.ones(len(content)) * i})
  else:
    if len(list_) == 1:
      with open(list_[0]) as f:
        content = f.read().splitlines()
        ids = [line.split(',')[0] for line in content]
        texts = [','.join(line.split(',')[1:]) for line in content]
        df = pd.DataFrame(columns=['ids', 'text'],
                          data={'ids': ids, 'text': texts})
  return df


train = read_data([TRAIN_DATA_NEGATIVE_FULL, TRAIN_DATA_POSITIVE_FULL], submission=False)
test = read_data([TEST_DATA], submission=True)

In [4]:
train.head(5)

Unnamed: 0,text,label
0,<user> i dunno justin read my mention or not ....,1.0
1,"because your logic is so dumb , i won't even c...",1.0
2,""" <user> just put casper in a box ! "" looved t...",1.0
3,<user> <user> thanks sir > > don't trip lil ma...,1.0
4,visiting my brother tmr is the bestest birthda...,1.0


In [5]:
test.head(5)

Unnamed: 0,ids,text
0,1,sea doo pro sea scooter ( sports with the port...
1,2,<user> shucks well i work all week so now i ca...
2,3,i cant stay away from bug thats my baby
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...
4,5,"whenever i fall asleep watching the tv , i alw..."


In [6]:
remove_tags(train)
remove_tags(test)

Removing tags...
Removing tags...


In [7]:
train.head(5)

Unnamed: 0,text,label
0,i dunno justin read my mention or not . only ...,1.0
1,"because your logic is so dumb , i won't even c...",1.0
2,""" just put casper in a box ! "" looved the bat...",1.0
3,thanks sir > > don't trip lil mama ... just ...,1.0
4,visiting my brother tmr is the bestest birthda...,1.0


## 1. Find all emoticons used

In [8]:
with open('utility/emoticons.txt') as f:
  content = f.read().splitlines()
  # Ignore first lines of comments and the blank line
  content = content[6:]
  EMOTICONS = content
  

def search_emoticon(text, emoticons=EMOTICONS):
  e_list = []
  for e in emoticons:
    if e in text:
      e_list.append(e)
  return e_list

  
def find_emoticons(df, emoticons=EMOTICONS):
  df['emoticons'] = df['text_wo_spaces'].apply(
      lambda text: search_emoticon(text))

In [9]:
train['text_wo_spaces'] = train['text'].str.replace(' ', '')
test['text_wo_spaces'] = test['text'].str.replace(' ', '')
find_emoticons(train)
find_emoticons(test)

In [10]:
all_train_emoticons = [x for y in train['emoticons'].tolist() for x in y]
all_test_emoticons = [x for y in test['emoticons'].tolist() for x in y]
all_emoticons = list(set(all_train_emoticons + all_test_emoticons))

In [11]:
len(all_emoticons)

122

In [12]:
all_emoticons

['(__)',
 ':#',
 ';-)',
 '>:)',
 ';;',
 ';)',
 ':c',
 ':-c',
 ':-*',
 '^_^',
 'v.v',
 '=\\',
 ':^)',
 '=]',
 "('_')",
 ':o)',
 '(._.)',
 ':>',
 ':[',
 ':-/',
 '>:/',
 'm(__)m',
 'uwu',
 '%)',
 '(^^)v',
 '<3',
 '=/',
 ':]',
 '(:',
 '=(',
 ":'(",
 ':{',
 ':\\',
 ':-&',
 ':-o',
 '\\o/',
 ';3',
 ':-3',
 'o-o',
 ':c)',
 ':-)',
 '(^o^)',
 '(~_~)',
 '(^^)/',
 ';-;',
 ':*',
 '0:3',
 '=3',
 '<\\3',
 ':-.',
 ':-}',
 'xp',
 ':-0',
 ":'-)",
 ';^)',
 '>^_^<',
 '^5',
 '(o.o)',
 ';n;',
 '(+_+)',
 ':-]',
 ":')",
 '8-)',
 '</3',
 '(-.-)',
 ':))',
 '*-)',
 '(^^)',
 ':b',
 '(^_^)/',
 ';(',
 '>:[',
 '(*_*)',
 '<+',
 '0:)',
 '8-0',
 'o_o',
 '>:\\',
 "(';')",
 ':}',
 '(^.^)',
 ':&',
 '>.<',
 ':o',
 ':<',
 ';]',
 '(^_^)',
 '(#^.^#)',
 ':(',
 '=)',
 '}:)',
 ':-|',
 '^^;',
 '><>',
 ';_;',
 ':@',
 ':-p',
 ':-<',
 '(-_-)',
 ':-b',
 'o/\\o',
 '(^_-)',
 '^_^;',
 ':)',
 '))',
 ':|',
 '=p',
 ':/',
 '3:)',
 'd:',
 '(..)',
 '>:3',
 ':$',
 ':3',
 '*)',
 ':-[',
 '8)',
 'x-p',
 ":'-(",
 ':p',
 ':->',
 'o_0']