In [1]:
#Feature Engineering
#Importing dataset + date shenanigans

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("./data/tweets.csv", encoding='latin')
df['Date'] = pd.to_datetime(df['Date'], format='%a %b %d %H:%M:%S PDT %Y')
try:
    df.drop(['ID', 'flag'], axis=1, inplace=True)
except KeyError:
    pass
df['Weekday'] = df['Date'].dt.weekday
df['Time'] = df['Date'].dt.time
df['Full_date'] = df['Date'].dt.date

df['Target'] = df['Target'].map({0: 0, 4: 1})
df.head()


Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06


In [4]:
#In EDA we have observed that the fraction of tweets with negative sentiment seems to reach maximum around 3. p.m. and changes 2 times 
#rapidly to evening than to morning. Let us then create a quantity describing this 'skewed' standardised distance from 15:00.

In [5]:
df['Hour'] = df['Date'].dt.hour
df['skewed_hour_dist']=df['Hour'].apply(lambda x: (16-x)/16 if x<16 else (x-16)/8)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Hour,skewed_hour_dist
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,22,0.75
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,22,0.75
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,22,0.75
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,22,0.75
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,22,0.75


In [6]:
#It is kind of forced but we can try doing similar thing with Weekday
df['skewed_week_dist']=df['Weekday'].apply(lambda x: (2-x)/2 if x<2 else (x-2)/4)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Hour,skewed_hour_dist,skewed_week_dist
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,22,0.75,1.0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,22,0.75,1.0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,22,0.75,1.0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,22,0.75,1.0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,22,0.75,1.0


In [7]:
#We have seen that after certain date (2009-05-29 07:33:45) sentiment of all tweets is negative. Of course it is a feature
#strictly specific to our data and according to common sense shouldn't be used in the model if we would like to predict the sentiment
#of any text. However if testing is going to be done on our set alone it is a meaningful piece of information.
y=df[df['Target']==1]['Full_date'].max()
df['is_after_certain_day']=df['Full_date'].apply(lambda x: 0 if x<=y else 1)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,22,0.75,1.0,0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,22,0.75,1.0,0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,22,0.75,1.0,0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,22,0.75,1.0,0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,22,0.75,1.0,0


In [8]:
#Thaks to one of correlation matrices from EDA (or to having at least 1 brain cell) we know that the number of hashtags, mentions or
#exclamation marks is highly correlated with its binary counterpart quantifier. From the same analysis we see that their correlation with 
# target is marginal however mentions and exclamation marks seem to have more to say than hashtags, that is why we 
#will leave out hashtags but not the other two.
def has_men(text):
    for c in text:
        if c=='@':
            return 1
    return 0
def has_exc(text):
    for c in text:
        if c=='!':
            return 1
        return 0
df['has_mentions']=df['Text'].apply(has_men)
df['has_exclamation_marks']=df['Text'].apply(has_exc)
df.head()
        


Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day,has_mentions,has_exclamation_marks
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,22,0.75,1.0,0,0,0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,22,0.75,1.0,0,1,0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,22,0.75,1.0,0,0,0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,22,0.75,1.0,0,1,0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,22,0.75,1.0,0,1,0


In [9]:
#And now for the main course tf-idf, we will set a limit o a 100 words so as not to everything too high in computation cost.
from sklearn.feature_extraction.text import TfidfVectorizer
text_data = df['Text'].tolist()


In [10]:
#We shall also remove certain english stop words because they are very frequent but do not carry much sentiment.
vectoriser=TfidfVectorizer(max_features=100, lowercase=True, stop_words='english')
tfidf_matrix = vectoriser.fit_transform(text_data)

In [11]:
#back to df
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectoriser.get_feature_names_out())
df = pd.concat([df, tfidf_df], axis=1)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Hour,skewed_hour_dist,skewed_week_dist,...,way,week,weekend,went,wish,won,work,working,yeah,yes
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,22,0.75,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,22,0.75,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,22,0.75,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,22,0.75,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,22,0.75,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#fast check that it is not all 0
y=df['weekend']
sum(y)

7989.118709569409