# Post Preprocess

## 0. Import

In [1]:
import numpy as np
import pandas as pd
import csv

## 1. Preprocess tweet polarity (0 - 4)

In [2]:
FILE_DATA = "../dataset/tweets/train_0_to_4.csv"
FILE_DATA_OUTPUT = "../dataset/tweets/train_0_1.csv"

In [3]:
tweet_df = pd.read_csv(FILE_DATA, sep=',', usecols=['sentiment', 'tweet'], dtype={'tweet':str})

In [4]:
tweet_df['sentiment'] = tweet_df['sentiment'].apply(lambda x: 1 if (x == 4) else 0)

In [5]:
tweet_df.head()
tweet_df.tail()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Unnamed: 0,sentiment,tweet
1599995,1,Just woke up. Having no school is the best fee...
1599996,1,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...
1599999,1,happy #charitytuesday @theNSPCC @SparksCharity...


In [6]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
tweet        1600000 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [7]:
tweet_df.dtypes

sentiment     int64
tweet        object
dtype: object

In [10]:
tweet_df.to_csv(FILE_DATA_OUTPUT, index_label="tweet_id", header=None, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

### Shuffle, Shorten, Split

In [2]:
PATH_FILE_CORPUS = "../dataset/tweets/tweets_pos_neg.csv"
PATH_FILE_TRAIN = "../dataset/tweets/tweets_pos_neg_train.csv"
PATH_FILE_TEST = "../dataset/tweets/tweets_pos_neg_test.csv"

In [3]:
tweet_df = pd.read_csv(PATH_FILE_CORPUS, header=None, usecols=[1, 2])

In [4]:
tweet_df.head()
tweet_df.info()

Unnamed: 0,1,2
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
1    1600000 non-null int64
2    1600000 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


Shuffle

In [4]:
tweet_df = tweet_df.sample(frac=1).reset_index(drop=True)

In [5]:
tweet_df.head(10)
tweet_df.tail(10)

Unnamed: 0,1,2
0,0,Gah! Not going to tweet about this. It's frea...
1,1,Is walkin up to Club Sevilla downtown SD right...
2,0,im tryin to vote for tom but its no lettin me ...
3,0,plllleeeasseee vote for me for the mtv movie a...
4,0,Actually I'd like to relive NIN/JA tour all ov...
5,1,"@Masterface AHAHAHAHAHAHAHA. God, that still m..."
6,0,No time to get to the gym today
7,1,You Fool
8,1,@blankskater1 a I forgot to tell you that I lo...
9,0,@cheynesaw we are going on monday and thursday...


Unnamed: 0,1,2
1599990,1,"Agh nice to just sit outside, drinking milk an..."
1599991,1,"@LetiPoynter @Mcflyismydrug_x dougie, good cho..."
1599992,1,just got up.. eating bagel bites and coke
1599993,0,sun went away ugh i hate Nebraska! i want it ...
1599994,0,ugh long day well its that day were nothing se...
1599995,0,Now as of June 24th the government will only g...
1599996,0,is cleaning my car for someone to come look at...
1599997,1,@nerdpunkcub that is very possible. i think i...
1599998,1,"just went out to pick up bagels, cream cheese,..."
1599999,0,Whitening my teeth!! this is going to suck!!


Shorten

In [6]:
tweet_df, _ = np.split(tweet_df, [int(0.5 * len(tweet_df))])

In [7]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 2 columns):
1    800000 non-null int64
2    800000 non-null object
dtypes: int64(1), object(1)
memory usage: 12.2+ MB


Split

In [8]:
tweet_train_df, tweet_test_df = np.split(tweet_df, [int(0.80 * len(tweet_df))])

In [9]:
tweet_train_df.to_csv(PATH_FILE_TRAIN, index_label="tweet_id", header=None, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
tweet_test_df.to_csv(PATH_FILE_TEST, index_label="tweet_id", header=None, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

## 2. Preprocess tweet polarity (pos - neg - neut)

In [3]:
!ls ../dataset/tweets/tweets_transfer/

tweets_pos_neg_neu_test.csv  tweets_pos_neg_neu_train.csv


In [27]:
PATH_FILE_TRAIN = '../dataset/tweets/tweets_transfer/tweets_pos_neg_neu_train.csv'
PATH_FILE_TRAIN_OUTPUT = '../dataset/tweets/tweets_transfer/tweets_pos_neg_neu_train_bis.csv'
PATH_FILE_TEST = '../dataset/tweets/tweets_transfer/tweets_pos_neg_neu_test.csv'

In [35]:
tweet_train_df = pd.read_csv(PATH_FILE_TEST, header=None, usecols=[1,2], sep='\t')

In [None]:
#tweet_train_df = tweet_train_df.sample(frac=1).reset_index(drop=True)

In [38]:
tweet_train_df

Unnamed: 0,1,2
0,0,So @Ryanair site crashes everytime I try to bo...
1,0,Theme of week: Ask the Lord for strength &amp;...
2,0,"@F1 Why announcing so late, it will be hard to..."
3,2,The greatest happiness is seeing someone you l...
4,2,omg so grateful to have an education but ive b...
5,2,"Because of your @smile, you @make the #life mo..."
6,2,@mashable For some reason this has filled me w...
7,2,@LoveMyFFAJacket FaceTime - we can still annoy...
8,0,and i shouldve cut them off the moment i start...
9,2,@VescioDiana You forgot #laughter as well ❤️❤️❤️


In [37]:
tweet_train_df[1] = tweet_train_df[1].apply(lambda x: 0 if (x == "negative") else (1 if (x == "neutral") else 2))

In [39]:
tweet_train_df.to_csv(PATH_FILE_TRAIN_OUTPUT, index_label="tweet_id", header=None, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

## 3. Preprocess tweet emotion (angry - sad - happy - others) 

In [1]:
!ls ../dataset/tweets/tweets_emotion/tweets_emotions_train.txt

../dataset/tweets/tweets_emotion/tweets_emotions_train.txt


In [2]:
PATH_FILE_TRAIN = "../dataset/tweets/tweets_emotion/tweets_emotions_train.txt"
PATH_FILE_TRAIN_OUTPUT = "../dataset/tweets/tweets_emotion/tweets_emotions_train.csv"
PATH_FILE_TEST_OUTPUT = "../dataset/tweets/tweets_emotion/tweets_emotions_test.csv"

In [6]:
tweet_train_df = pd.read_csv(PATH_FILE_TRAIN, sep='\t')

In [7]:
tweet_train_df.head()

Unnamed: 0,id,turn1,turn2,turn3,label
0,0,Don't worry I'm girl,hmm how do I know if you are,What's ur name?,others
1,1,When did I?,saw many times i think -_-,No. I never saw you,angry
2,2,By,by Google Chrome,Where you live,others
3,3,U r ridiculous,I might be ridiculous but I am telling the truth.,U little disgusting whore,angry
4,4,Just for time pass,wt do u do 4 a living then,Maybe,others


In [8]:
tweet_train_df['label'] = tweet_train_df['label'].apply(lambda x: 0 if (x == "angry") 
                                                        else (1 if (x == "sad")
                                                        else (2 if (x == "others")
                                                        else 3))) # 3 -> happy

In [9]:
tweet_train_df['tweet'] = tweet_train_df["turn1"] + " " + tweet_train_df["turn2"] + " " + tweet_train_df["turn3"]

In [10]:
tweet_train_df = tweet_train_df[['label', 'tweet']]

In [11]:
tweet_train_df.head()

Unnamed: 0,label,tweet
0,2,Don't worry I'm girl hmm how do I know if you...
1,0,When did I? saw many times i think -_- No. I n...
2,2,By by Google Chrome Where you live
3,0,U r ridiculous I might be ridiculous but I am ...
4,2,Just for time pass wt do u do 4 a living then ...


### Split

In [12]:
tweet_train_df, tweet_test_df = np.split(tweet_train_df, [int(0.80 * len(tweet_train_df))])

In [14]:
tweet_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24128 entries, 0 to 24127
Data columns (total 2 columns):
label    24128 non-null int64
tweet    24128 non-null object
dtypes: int64(1), object(1)
memory usage: 377.1+ KB


In [15]:
tweet_train_df.to_csv(PATH_FILE_TRAIN_OUTPUT, index_label="tweet_id", header=None, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
tweet_test_df.to_csv(PATH_FILE_TEST_OUTPUT, index_label="tweet_id", header=None, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

## 4. Preprocess tweet emotion (anger - fear - sadness - joy - love - others) 

In [15]:
PATH_FILE_TRAIN = "../dataset/tweets/tweets_emotion_6/emotion.data"
PATH_FILE_TRAIN_OUTPUT = "../dataset/tweets/tweets_emotion_6/emotion_6.csv"

In [3]:
tweet_train_df = pd.read_csv(PATH_FILE_TRAIN, sep=',', usecols=[1,2])

In [4]:
tweet_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
text        416809 non-null object
emotions    416809 non-null object
dtypes: object(2)
memory usage: 6.4+ MB


In [14]:
tweet_train_df.head(10)

Unnamed: 0,emotions,text
0,1,i feel awful about it too because it s my job ...
1,1,im alone i feel awful
2,4,ive probably mentioned this before but i reall...
3,1,i was feeling a little low few days back
4,5,i beleive that i am much more sensitive to oth...
5,5,i find myself frustrated with christians becau...
6,4,i am one of those people who feels like going ...
7,4,i feel especially pleased about this as this h...
8,4,i was struggling with these awful feelings and...
9,0,i feel so enraged but helpless at the same time


In [9]:
tweet_train_df = tweet_train_df[['emotions', 'text']]

In [13]:
tweet_train_df['emotions'] = tweet_train_df['emotions'].apply(lambda x: 0 if (x == "anger") 
                                                              else (1 if (x == "sadness")
                                                              else (2 if (x == "fear")
                                                              else (3 if (x == "surprise")
                                                              else (4 if (x == "joy")
                                                              else 5))))) # 5 -> love

In [11]:
tweet_train_df.emotions.unique()

array(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'],
      dtype=object)

In [16]:
tweet_train_df.to_csv(PATH_FILE_TRAIN_OUTPUT, index_label="tweet_id",
                      header=None, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)