In [228]:
# Raw input splits; both are read with pd.read_pickle further down.
TRAIN_PATH = "../data/raw/train.pkl"
TEST_PATH = "../data/raw/test.pkl"

# Destinations for the engineered feature tables written in the export section.
FINAL_TRAIN = "../data/processed/train_final.pkl"
FINAL_TEST = "../data/processed/test_final.pkl"

In [260]:
# Load packages
import numpy as np
import pandas as pd

import pickle

import sys
# Put ../scripts on the import path so the Utils helpers below can be imported.
sys.path.append("../scripts")
from Utils import build_freqs, process_tweet

# None removes the column-width limit, so long tweet texts are displayed untruncated.
pd.set_option("display.max_colwidth", None)

___
## Functions:

In [256]:
def extract_features(tweet, freqs=None):
    """
    Transform a tweet into its numerical representation based on the
    stored word frequencies.

    INPUT:
        tweet: the tweet in a string format
        freqs: a dictionary of (word, label) -> frequency,
               e.g. {("great", 1.0): 35}. When omitted (None), the
               notebook-level `features` dictionary is looked up at call time.
    OUTPUT:
        a numpy array of shape (1, 3) holding one score per feature,
        i.e. [[bias, positive_score, negative_score]]
    """
    if freqs is None:
        # Resolve lazily. A `freqs=features` default would be evaluated when this
        # `def` runs, i.e. before `features` exists in a top-to-bottom execution,
        # and would keep pointing at a stale object if `features` is rebuilt.
        freqs = features

    words = process_tweet(tweet)

    # Column 0 is the constant bias term; columns 1 and 2 accumulate the counts
    # of the tweet's words under label 1.0 and label 0.0 respectively.
    tweet_array = np.array([[1.0, 0.0, 0.0]])

    for word in words:
        # Words never seen with a given label contribute 0.
        tweet_array[0, 1] += freqs.get((word, 1.0), 0)
        tweet_array[0, 2] += freqs.get((word, 0.0), 0)

    assert tweet_array.shape == (1, 3)
    return tweet_array

___
## Read data

In [99]:
# Load both raw splits with the same reader, in (train, test) order.
train_df, test_df = (pd.read_pickle(raw_path) for raw_path in (TRAIN_PATH, TEST_PATH))

In [261]:
# Show five randomly drawn rows of the training split.
train_df.sample(n=5)

Unnamed: 0,tweets,sentiment
7452,I want takoyaki :(( My #TeenChoice for #ChoiceInternationalArtist is #SuperJunior!,0.0
7220,@RedLipsteeq funny thing is someone said I was telling his life story :(,0.0
4828,vidcon :(((,0.0
7977,no date yet :( https://t.co/Tu9R2CfSyx,0.0
2362,Took leave to rehearse for @MOEsg's #YouthCelebrate at @SGSportsHub. Keep my bro @GeraldKoh987 company on @987fm! :) http://t.co/Ajk6gD0Hdn,1.0


In [265]:
# Show five randomly drawn rows of the test split.
test_df.sample(n=5)

Unnamed: 0,tweets,sentiment
504,My sisters are the best!!!! Best pre bday celebration :)))),1.0
1476,@JayMcGuiness i want to meet you :(,0.0
253,Love it! :) 13 photos that explain why Ukraine is not Russia http://t.co/KfblbBBYkQ http://t.co/2EYOfYUWwy,1.0
1491,Feel so sick :(,0.0
649,@OMGitsBern_92 All sorted in the early hours. Thanks again for your help &amp; raising the initial issue. makes the community what it is :),1.0


## Now building features:
**From the build_freqs function we will have a dictionary of {(word, label): freq} as our features**

In [106]:
# Build the (word, label) -> count table from the raw training split.
# The train_df columns are used directly rather than X_train / y_train: those
# names do not exist yet at this point in a top-to-bottom run, and X_train is
# later bound to the numeric feature table instead of the tweet texts.
# NOTE(review): assumes build_freqs takes a sequence of tweet strings and a
# parallel sequence of labels — confirm against ../scripts/Utils.py.
features = build_freqs(train_df["tweets"].tolist(), train_df["sentiment"].to_numpy())
print(type(features))
print("length of features: ", len(features))

<class 'collections.Counter'>
length of features:  11338


In [230]:
# Preview the first five ((word, label), count) entries.
feature_items = list(features.items())
feature_items[:5]

[(('followfriday', 1.0), 23),
 (('top', 1.0), 30),
 (('engag', 1.0), 7),
 (('member', 1.0), 14),
 (('commun', 1.0), 27)]

**Now construct your features using extract_features function**
The output for each tweet is an array of shape (1, 3), i.e. `[[bias, pos, neg]]`; it will be split and used to construct a dataframe of three features: [bias, pos, neg]

In [213]:
# Turn every training tweet into its (1, 3) feature array. The frequency table
# is passed explicitly so the result depends on the current `features` object
# rather than on whatever extract_features uses as its default.
train_features = train_df["tweets"].apply(extract_features, freqs=features)
train_features.head()

0     [[1.0, 3020.0, 61.0]]
1    [[1.0, 3573.0, 444.0]]
2    [[1.0, 3005.0, 115.0]]
3      [[1.0, 2862.0, 4.0]]
4    [[1.0, 3119.0, 225.0]]
Name: tweets, dtype: object

As we can see, the features are formatted as NumPy arrays, so we will extract their components to build our features as follows:
* bias: the bias term in the equation
* pos: the positive score of tweet
* neg: the negative score of the tweet

In [214]:
# Each element is a (1, 3) array; pull column 0, 1 and 2 out of every element
# to get one flat list per feature. The Series is converted to numpy only once.
train_arrays = train_features.to_numpy()
bias, pos, neg = (
    [tweet_vec[0, col] for tweet_vec in train_arrays] for col in range(3)
)

In [215]:
# Assemble the three per-tweet feature lists into one table, one column each.
X_train = pd.DataFrame(dict(bias=bias, pos=pos, neg=neg))

In [216]:
# Show five randomly drawn rows of the training feature table.
X_train.sample(n=5)

Unnamed: 0,bias,pos,neg
6790,1.0,173.0,3824.0
2787,1.0,3165.0,324.0
453,1.0,3333.0,429.0
4540,1.0,279.0,702.0
1532,1.0,3422.0,106.0


In [226]:
# Adding the target to final train features.
# Assign by position: X_train was built from plain lists and so has a fresh
# 0..n-1 index, while a Series assignment aligns on index labels and would
# silently insert NaN wherever train_df's index differs from that range.
X_train['sentiment'] = train_df['sentiment'].to_numpy()
X_train.sample(5)

Unnamed: 0,bias,pos,neg,sentiment
2990,1.0,3089.0,219.0,1.0
2004,1.0,572.0,4.0,1.0
4710,1.0,431.0,4208.0,0.0
2456,1.0,105.0,0.0,1.0
6096,1.0,337.0,770.0,0.0


**Do the same thing with test data**

In [219]:
# Turn every test tweet into its (1, 3) feature array. The frequency table
# is passed explicitly so the result depends on the current `features` object
# rather than on whatever extract_features uses as its default.
test_features = test_df["tweets"].apply(extract_features, freqs=features)
test_features.head()

0    [[1.0, 3214.0, 414.0]]
1    [[1.0, 3236.0, 397.0]]
2    [[1.0, 1363.0, 328.0]]
3    [[1.0, 3164.0, 400.0]]
4      [[1.0, 795.0, 27.0]]
Name: tweets, dtype: object

In [220]:
# Each element is a (1, 3) array; pull column 0, 1 and 2 out of every element
# to get one flat list per feature. The Series is converted to numpy only once.
test_arrays = test_features.to_numpy()
bias, pos, neg = (
    [tweet_vec[0, col] for tweet_vec in test_arrays] for col in range(3)
)

In [221]:
# Assemble the three per-tweet feature lists into one table, one column each.
X_test = pd.DataFrame(dict(bias=bias, pos=pos, neg=neg))

In [222]:
# Show five randomly drawn rows of the test feature table.
X_test.sample(n=5)

Unnamed: 0,bias,pos,neg
141,1.0,3680.0,312.0
1463,1.0,221.0,3976.0
1265,1.0,323.0,3883.0
1583,1.0,71.0,3855.0
479,1.0,3058.0,363.0


In [227]:
# Adding the target to final test features.
# Assign by position: X_test was built from plain lists and so has a fresh
# 0..n-1 index, while a Series assignment aligns on index labels and would
# silently insert NaN wherever test_df's index differs from that range.
X_test['sentiment'] = test_df['sentiment'].to_numpy()
X_test.sample(5)

Unnamed: 0,bias,pos,neg,sentiment
1127,1.0,15.0,3689.0,0.0
1122,1.0,154.0,3889.0,0.0
962,1.0,3486.0,703.0,1.0
133,1.0,3055.0,248.0,1.0
1293,1.0,1.0,3663.0,0.0


___
## Export features

In [229]:
# Write the train and test feature tables to their configured destinations.
for feature_table, destination in ((X_train, FINAL_TRAIN), (X_test, FINAL_TEST)):
    feature_table.to_pickle(destination)

In [232]:
# Serialize the (word, label) frequency table with pickle.
with open("../data/objects/features.pkl", mode="wb") as features_file:
    pickle.dump(features, features_file)