## Libraries

In [194]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression

## Data Gathering

In [246]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [247]:
train = df.copy()
train = train.set_index('textID')
train['Selected Text'] = 0
train.head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0
549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0
088c60f138,my boss is bullying me...,bullying me,negative,0
9642c003ef,what interview! leave me alone,leave me alone,negative,0
358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0


In [248]:
train['sentiment'].value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [249]:
train[train['sentiment']=='neutral']

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0
28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,0
50e14c0bb8,Soooo high,Soooo high,neutral,0
e050245fbd,Both of you,Both of you,neutral,0
2339a9b08b,"as much as i love to be hopeful, i reckon the...","as much as i love to be hopeful, i reckon the ...",neutral,0
...,...,...,...,...
a753a93e45,"few grilled mushrooms and olives, feta cheese ...","few grilled mushrooms and olives, feta cheese ...",neutral,0
ac92790d8b,94 more days till BH comes back to LA,94 more days till BH comes back to LA,neutral,0
15bb120f57,"i`m defying gravity. and nobody in alll of oz,...","i`m defying gravity. and nobody in alll of oz,...",neutral,0
a208770a32,in spoke to you yesterday and u didnt respond...,in spoke to you yesterday and u didnt respond ...,neutral,0


In [250]:
for idx in train[train['sentiment']=='neutral'].index:
    train.loc[idx,'Selected Text'] = train.loc[idx,'text'] 

In [251]:
train[train['sentiment']=='neutral'].head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going"
28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral,http://www.dothebouncy.com/smf - some shameles...
50e14c0bb8,Soooo high,Soooo high,neutral,Soooo high
e050245fbd,Both of you,Both of you,neutral,Both of you
2339a9b08b,"as much as i love to be hopeful, i reckon the...","as much as i love to be hopeful, i reckon the ...",neutral,"as much as i love to be hopeful, i reckon the..."


In [252]:
train[train['sentiment']=='positive'].head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,0
fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,0
16fab9f95b,I really really like the song Love Story by Ta...,like,positive,0
e48b0b8a23,Playing Ghost Online is really interesting. Th...,interesting.,positive,0
e00c6ef376,"the free fillin` app on my ipod is fun, im add...","the free fillin` app on my ipod is fun, im add...",positive,0


In [253]:
train[train['sentiment']=='negative'].head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0
088c60f138,my boss is bullying me...,bullying me,negative,0
9642c003ef,what interview! leave me alone,leave me alone,negative,0
358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,0
74a76f6e0a,My Sharpie is running DANGERously low on ink,DANGERously,negative,0


In [254]:
def text_token(text):
    wordnet = WordNetLemmatizer()
    STOPWORDS = set(stopwords.words("english"))
    token = nltk.word_tokenize(text)
    token = [wordnet.lemmatize(i) for i in token]
    token = [i for i in token if i.isalpha() and len(i)>2 and (i not in STOPWORDS)]
    return token

In [255]:
postive_data = train[train['sentiment']=='positive'].copy()
negative_data = train[train['sentiment']=='negative'].copy()
postive_data['Selected Text'] = postive_data['text'].apply(lambda x: text_token(x))
negative_data['Selected Text'] = negative_data['text'].apply(lambda x: text_token(x))
negative_data.head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[Sooo, SAD, miss, San, Diego]"
088c60f138,my boss is bullying me...,bullying me,negative,"[bos, bullying]"
9642c003ef,what interview! leave me alone,leave me alone,negative,"[interview, leave, alone]"
358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[Sons, put, release, already, bought]"
74a76f6e0a,My Sharpie is running DANGERously low on ink,DANGERously,negative,"[Sharpie, running, DANGERously, low, ink]"


In [256]:
postive_data.head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,"[feeding, baby, fun, smile, coo]"
fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,"[Journey, Wow, became, cooler, hehe, possible]"
16fab9f95b,I really really like the song Love Story by Ta...,like,positive,"[really, really, like, song, Love, Story, Tayl..."
e48b0b8a23,Playing Ghost Online is really interesting. Th...,interesting.,positive,"[Playing, Ghost, Online, really, interesting, ..."
e00c6ef376,"the free fillin` app on my ipod is fun, im add...","the free fillin` app on my ipod is fun, im add...",positive,"[free, fillin, app, ipod, fun, addicted]"


## Frequency Count and Bag of Words

In [257]:
freq_count = {}
for tokens in postive_data['Selected Text']:
    for word in tokens:
        if word not in freq_count:
            freq_count[word] = 1
        else:
            freq_count[word] += 1

In [258]:
for tokens in negative_data['Selected Text']:
    for word in tokens:
        if word not in freq_count:
            freq_count[word] = 1
        else:
            freq_count[word] += 1

In [259]:
for text in (postive_data['selected_text']):
    tokens = text_token(text)
    for word in tokens:
        if word not in freq_count:
            freq_count[word] = 1
        else:
            freq_count[word] += 1

In [260]:
for text in (negative_data['selected_text']):
    tokens = text_token(text)
    for word in tokens:
        if word not in freq_count:
            freq_count[word] = 1
        else:
            freq_count[word] += 1

In [261]:
freq_count = {k: v for k, v in sorted(freq_count.items(), key=lambda item: item[1], reverse=True)}
word_index_map = {v:k for k,v in enumerate(list(freq_count.keys())[:6000])}

In [262]:
def token_vector(token, label):
    fg = False
    x = np.zeros(len(word_index_map)+1)
    for word in token:
        if word in word_index_map:
            index = word_index_map[word]
            x[index] += 1
            fg = True
    if fg:
        x = x/x.sum()
    x[-1] = label
    return x

In [263]:
postive_data.head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,"[feeding, baby, fun, smile, coo]"
fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,"[Journey, Wow, became, cooler, hehe, possible]"
16fab9f95b,I really really like the song Love Story by Ta...,like,positive,"[really, really, like, song, Love, Story, Tayl..."
e48b0b8a23,Playing Ghost Online is really interesting. Th...,interesting.,positive,"[Playing, Ghost, Online, really, interesting, ..."
e00c6ef376,"the free fillin` app on my ipod is fun, im add...","the free fillin` app on my ipod is fun, im add...",positive,"[free, fillin, app, ipod, fun, addicted]"


In [264]:
data = np.zeros((len(postive_data)+len(negative_data), len(word_index_map)+1))
index = 0
for idx in zip(postive_data['Selected Text'],postive_data['sentiment']):
    text = idx[0]
    label = idx[1]
    if label == 'positive':
        label = 1
    else:
        label = 0
    data[index,:] = token_vector(text, label)
    index += 1

In [265]:
for idx in zip(negative_data['Selected Text'],negative_data['sentiment']):
    text = idx[0]
    label = idx[1]
    if label == 'positive':
        label = 1
    else:
        label = 0
    data[index,:] = token_vector(text, label)
    index += 1

## Modelling

In [266]:
X = data[:,:-1]
y = data[:,-1]

In [267]:
model = LogisticRegression()
model.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [217]:
for word,index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight>0.5:
        print(word,":",weight)

good : 5.037425021938829
day : 1.6516762432186793
love : 8.891173493891047
like : 0.6305964474960788
Happy : 6.190974238576343
great : 6.493091651463694
fun : 4.664066193320727
happy : 4.450631651542656
http : 1.3138115434767044
night : 2.509571749320958
nice : 5.033862465636485
thanks : 6.4314150851895855
see : 1.163377069895879
wish : 2.0357429701063903
hope : 5.593484890179396
Day : 3.776058322234853
Thanks : 6.284437985943987
new : 1.9920091231725585
mother : 3.027506540220484
awesome : 6.044885812241773
lol : 2.0454958010483835
better : 4.753599486697956
Good : 5.607684934520692
The : 0.9576133622353714
morning : 2.828459665772743
You : 2.542029691936805
best : 4.256859261816706
would : 1.4100202579070493
look : 0.5496614212557139
haha : 2.943964815440851
mom : 2.197891144699309
tomorrow : 0.6869332670929971
weekend : 0.8851386696835621
Mother : 2.5512505728889803
cool : 3.9720954365221006
though : 1.119352391877292
could : 0.6970393364099227
amazing : 4.673594995276395
guy : 0.82

In [268]:
def token_text(tokens, label):
    threshold = 0.5
    text = []
    for word in tokens:
        if word in word_index_map:
            index = word_index_map[word]
            weight = model.coef_[0][index]
            if label == 1:
                if weight>threshold:
                    text.append(word)
            if label == 0:
                if weight<-threshold:
                    text.append(word)
    if len(text) == 0:
        return " ".join(tokens)
    return " ".join(text)

In [269]:
token_text(['day','with','you','bad','boy'],0)

'bad'

In [270]:
postive_data['Selected Text'] = postive_data['Selected Text'].apply(lambda x: token_text(x,1))
negative_data['Selected Text'] = negative_data['Selected Text'].apply(lambda x: token_text(x,0))

In [271]:
postive_data

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive,fun smile
fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive,Wow hehe
16fab9f95b,I really really like the song Love Story by Ta...,like,positive,like song Love
e48b0b8a23,Playing Ghost Online is really interesting. Th...,interesting.,positive,interesting The new wait
e00c6ef376,"the free fillin` app on my ipod is fun, im add...","the free fillin` app on my ipod is fun, im add...",positive,free app fun
...,...,...,...,...
432e6de6c9,morning twit-friends! welcome to my new followers,welcome,positive,morning welcome new
8f14bb2715,So I get up early and I feel good about the da...,I feel good ab,positive,good day alright
b78ec00df5,enjoy ur night,enjoy,positive,enjoy night
f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,Yay good Enjoy weekend hun


In [272]:
postive_data[postive_data['selected_text']==postive_data['Selected Text']]

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6ce4a4954b,juss came backk from Berkeleyy ; omg its madd ...,fun,positive,fun
7d8c4c11e4,i hope unni will make the audition . fighting ...,hope,positive,hope
fa2654e730,Chilliin,Chilliin,positive,Chilliin
92e1b6846e,We saw that in none 3D - the baddie`s the best,best,positive,best
2207d982bc,4am. And Im on the beach. Pretty,Pretty,positive,Pretty
...,...,...,...,...
45154b041d,ok bye alex have fun today,fun,positive,fun
bda2ce839a,_ it`s on again right now!! aah I love demi l...,love,positive,love
25ae8c8eff,That`s more than ok. Personally I`m very good...,good,positive,good
47c474aaf1,Good choice,Good,positive,Good


In [273]:
negative_data[negative_data['selected_text']==negative_data['Selected Text']]

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
447dc22c81,hemp cloth is marvelous but unfortunately no,unfortunately,negative,unfortunately
02c004bc2d,I am tres depressed,depressed,negative,depressed
05a0e60f99,I`m sorry to hear that.,sorry,negative,sorry
1f581d48bc,Sorry RB is on PS3 for me,Sorry,negative,Sorry
cf888f0df3,I`m taking a twitter break. Cell is dying,dying,negative,dying
...,...,...,...,...
0828ad29fe,_123 i cant sleep,cant sleep,negative,cant sleep
83d3323ada,Screen On The Green started yesterday!!!ahhh! ...,missed,negative,missed
836b055959,Twitter`s being lame and won`t post my twitpic...,lame,negative,lame
0161b55ae1,just threw up,threw,negative,threw


## Test Data

In [285]:
test = pd.read_csv('test.csv', index_col='textID')
test['selected_text'] = test['text']
test.head()

Unnamed: 0_level_0,text,sentiment,selected_text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,Last session of the day http://twitpic.com/67ezh
96d74cb729,Shanghai is also really exciting (precisely -...,positive,Shanghai is also really exciting (precisely -...
eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"Recession hit Veronique Branquinho, she has to..."
01082688c6,happy bday!,positive,happy bday!
33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,http://twitpic.com/4w75p - I like it!!


In [286]:
postive_test = test[test['sentiment']=='positive']
negative_test = test[test['sentiment']=='negative']
postive_test['selected_text'] = postive_test['text'].apply(lambda x: text_token(x))
negative_test['selected_text'] = negative_test['text'].apply(lambda x: text_token(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [287]:
postive_test.head()

Unnamed: 0_level_0,text,sentiment,selected_text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96d74cb729,Shanghai is also really exciting (precisely -...,positive,"[Shanghai, also, really, exciting, precisely, ..."
01082688c6,happy bday!,positive,"[happy, bday]"
33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,"[http, like]"
726e501993,that`s great!! weee!! visitors!,positive,"[great, weee, visitor]"
33f19050cf,you guys didn`t say hi or answer my questions...,positive,"[guy, say, answer, question, yesterday, nice, ..."


In [288]:
negative_test.head()

Unnamed: 0_level_0,text,sentiment,selected_text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"[Recession, hit, Veronique, Branquinho, quit, ..."
261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative,"[THINK, EVERYONE, HATES, HERE, lol]"
afa11da83f,"soooooo wish i could, but im in school and my...",negative,"[soooooo, wish, could, school, myspace, comple..."
24c92644a4,My bike was put on hold...should have known th...,negative,"[bike, put, hold, known, argh, total, bummer]"
5c1e0b61a1,"I`m in VA for the weekend, my youngest son tur...",negative,"[weekend, youngest, son, turn, tomorrow, make,..."


In [290]:
for tokens in postive_test['selected_text']:
    for word in tokens:
        if word not in freq_count:
            freq_count[word] = 1
        else:
            freq_count[word] += 1
for tokens in negative_test['selected_text']:
    for word in tokens:
        if word not in freq_count:
            freq_count[word] = 1
        else:
            freq_count[word] += 1

In [291]:
freq_count = {k: v for k, v in sorted(freq_count.items(), key=lambda item: item[1], reverse=True)}
word_index_map = {v:k for k,v in enumerate(list(freq_count.keys())[:6000])}

In [293]:
postive_test['selected_text'] = postive_test['selected_text'].apply(lambda x: token_text(x,1))
negative_test['selected_text'] = negative_test['selected_text'].apply(lambda x: token_text(x,0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [294]:
negative_test.head()

Unnamed: 0_level_0,text,sentiment,selected_text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,hit shame
261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative,THINK EVERYONE HATES HERE lol
afa11da83f,"soooooo wish i could, but im in school and my...",negative,could myspace blocked
24c92644a4,My bike was put on hold...should have known th...,negative,put argh
5c1e0b61a1,"I`m in VA for the weekend, my youngest son tur...",negative,turn tomorrow sad big


In [295]:
test[test['sentiment']=='positive'] = postive_test
test[test['sentiment']=='negative'] = negative_test

In [296]:
test.head()

Unnamed: 0_level_0,text,sentiment,selected_text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,Last session of the day http://twitpic.com/67ezh
96d74cb729,Shanghai is also really exciting (precisely -...,positive,also Good
eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,hit shame
01082688c6,happy bday!,positive,happy
33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,http like


## Saving the csv file

In [186]:
test.loc[:,['selected_text']].to_csv('submission.csv')

## Rough Space

In [280]:
train[train['sentiment']=='positive'] = postive_data
train[train['sentiment']=='negative'] = negative_data

In [281]:
train[train['selected_text']==train['Selected Text']].groupby(by='sentiment').count()

Unnamed: 0_level_0,text,selected_text,Selected Text
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,425,425,425
neutral,5930,5930,5930
positive,627,627,627


In [282]:
train.head()

Unnamed: 0_level_0,text,selected_text,sentiment,Selected Text
textID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"I`d have responded, if I were going"
549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,SAD miss
088c60f138,my boss is bullying me...,bullying me,negative,bos
9642c003ef,what interview! leave me alone,leave me alone,negative,leave alone
358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,already
