In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize 
from collections import Counter

## Data Load

In [2]:
df = pd.read_csv("train.csv", delimiter=",")

In [3]:
df. head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
df["sentiment"].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

## Data Cleaning

In [5]:
df = df.astype(str).apply(lambda x: x.str.lower())
df["selected_text"] = df["selected_text"].apply(lambda x: re.sub(r'http\S+',' ',x))
df["selected_text"] = df["selected_text"].apply(lambda x: re.sub(r'[^a-z]',' ',x))
df["selected_text"] = df["selected_text"].apply(lambda x: re.sub(r'[^\w\s]',' ',x))
df["selected_text"] = df["selected_text"].apply(lambda x: x.replace('\n',' '))
df["selected_text"] = df["selected_text"].apply(lambda x: x.replace('.',' '))
df["selected_text"] = df["selected_text"].apply(lambda x: x.replace('\\',' '))
df["selected_text"] = df["selected_text"].apply(lambda x: re.sub(r'test(.*)',' ',x))

In [6]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [7]:
df['selected_text'] = df['selected_text'].apply(lambda x: ' '.join([item for item in x.split() if item not in stopwords]))

In [8]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"i`d have responded, if i were going",responded going,neutral
1,549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative
2,088c60f138,my boss is bullying me...,bullying,negative
3,9642c003ef,what interview! leave me alone,leave alone,negative
4,358bd9e861,"sons of ****, why couldn`t they put them on t...",sons,negative


## df per sentiment

In [9]:
df_positive = df[df["sentiment"]=="positive"]
df_positive.head()

Unnamed: 0,textID,text,selected_text,sentiment
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
9,fc2cbefa9d,journey!? wow... u just became cooler. hehe....,wow u became cooler,positive
11,16fab9f95b,i really really like the song love story by ta...,like,positive
21,e48b0b8a23,playing ghost online is really interesting. th...,interesting,positive
25,e00c6ef376,"the free fillin` app on my ipod is fun, im add...",free fillin app ipod fun im addicted,positive


In [10]:
len(df_positive)

8582

In [11]:
df_negative = df[df["sentiment"]=="negative"]
df_negative.head()

Unnamed: 0,textID,text,selected_text,sentiment
1,549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative
2,088c60f138,my boss is bullying me...,bullying,negative
3,9642c003ef,what interview! leave me alone,leave alone,negative
4,358bd9e861,"sons of ****, why couldn`t they put them on t...",sons,negative
12,74a76f6e0a,my sharpie is running dangerously low on ink,dangerously,negative


In [12]:
len(df_negative)

7781

In [13]:
df_neutral = df[df["sentiment"]=="neutral"]
df_neutral.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"i`d have responded, if i were going",responded going,neutral
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,shameless plugging best rangers forum earth,neutral
7,50e14c0bb8,soooo high,soooo high,neutral
8,e050245fbd,both of you,,neutral
10,2339a9b08b,"as much as i love to be hopeful, i reckon the...",much love hopeful reckon chances minimal p nev...,neutral


In [14]:
len(df_neutral)

11118

## Common words

In [15]:
positive = df_positive['selected_text'].tolist()

In [16]:
negative = df_negative['selected_text'].tolist()

In [17]:
neutral = df_negative['selected_text'].tolist()

## Tokenize

In [18]:
positive_words = [word_tokenize(i) for i in positive]

In [19]:
negative_words = [word_tokenize(i) for i in negative]

In [20]:
neutral_words = [word_tokenize(i) for i in neutral]

In [21]:
#positive_neutral = list(positive_words and neutral_words)

## Test

In [22]:
def evaluar_tweet(palabras_filtradas, palabras_positivas, palabras_negativas):
    
    hits = []
    
    for palabra in palabras_filtradas:
        if palabra in palabras_positivas:
            hits.append(1)
            #print("+" + palabra)
        elif palabra in palabras_negativas:
            hits.append(-1)
            #print("-" + palabra)
    
    media_hits = 0.0
    if len(hits) > 0:
        media_hits = sum(hits) / float(len(hits))
    return media_hits

In [23]:
df_test = pd.read_csv("test.csv", delimiter=",")

In [24]:
df_test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [25]:
df_test['selected_text'] = df_test['text'].apply(lambda x: x.lower())

In [26]:
df_test["selected_text"] = df_test["selected_text"].apply(lambda x: re.sub(r'http\S+',' ',x))
df_test["selected_text"] = df_test["selected_text"].apply(lambda x: re.sub(r'[^a-z]',' ',x))
df_test["selected_text"] = df_test["selected_text"].apply(lambda x: re.sub(r'[^\w\s]',' ',x))
df_test["selected_text"] = df_test["selected_text"].apply(lambda x: x.replace('\n',' '))
df_test["selected_text"] = df_test["selected_text"].apply(lambda x: x.replace('.',' '))
df_test["selected_text"] = df_test["selected_text"].apply(lambda x: x.replace('\\',' '))
df_test["selected_text"] = df_test["selected_text"].apply(lambda x: re.sub(r'test(.*)',' ',x))

In [27]:
df_test['selected_text'] = df_test['selected_text'].apply(lambda x: ' '.join([item for item in x.split() if item not in stopwords]))

In [28]:
test = df_test['selected_text'].tolist()

In [29]:
test_words = [word_tokenize(i) for i in test]

In [30]:
df_test["suggested_sentiment"] = df_test["selected_text"].apply(lambda x: evaluar_tweet(test_words, positive_words, negative_words))

In [32]:
df_test.head()

Unnamed: 0,textID,text,sentiment,selected_text,suggested_sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,last session day,0.323308
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,shanghai also really exciting precisely skyscr...,0.323308
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,recession hit veronique branquinho quit compan...,0.323308
3,01082688c6,happy bday!,positive,happy bday,0.323308
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,like,0.323308
