## 1. Call function _transform(tweets__raw)_ -> _tweets__raw_ is dataframe with tweets' data from file 'tweets.csv'.
## 2. Function _transform(tweets__raw)_ returns dataframe, which contains rows corresponding consecutive tweets and colums with extracted features.
## 3. Features list:
* 'author',
* 'encoded_tweet_long' -> all words in tweet represented as numbers (list of integers),
* 'encoded_tweet_short' -> words in tweet without stopwords (taken from nltk package) represented as numbers (list of integers),
* 'letters_nr',
* 'urls_nr',
* 'hashtag_nr',
* 'mentioned_nr' -> e.g. @SelenaGomez,
* 'exclamations_nr',
* 'emojis_nr',
* 'perc_of_upper' -> percentage of upper case letters,
* 'words_nr' -> number of all words in tweet,
* 'average_word_len',
* 'std_dev_word_len' -> standard deviation of word's length,
* 'min_word_len',
* 'max_word_len',
* 'time' -> time of tweet posting represented as number of minutes elapsed from midnight (integer),
* 'weekday' -> weekday represented as numeric value e.g. Monday = 1 (inetger).

In [48]:
#Imports and installations
!pip install emoji
!pip install keras
!pip install regex
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [49]:
#Libraries
import pandas as pd
import os
import re
import string
import nltk
import datetime
import statistics as stat
from keras.preprocessing.text import Tokenizer
import emoji
import regex
import pickle

In [50]:
#Additional downloads
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/Kasia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Kasia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
#Download dataset (use in Colab)
#%%capture
#if not os.path.isfile('tweets.csv'):
    #!wget 'https://drive.google.com/uc?export=download&id=17F1luxwaaE4vrhlFsHbOFjSoYhsThuAJ' -O tweets.csv

In [52]:
#Create data frame
tweets_raw = pd.read_csv('../data/tweets.csv')
print("Number of tweets and their features: ", tweets_raw.shape)
tweets_raw.head()

Number of tweets and their features:  (52542, 10)


Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,,12/01/2017 19:52,8.19633e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible gra...,,11/01/2017 08:38,8.19101e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.19014e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.19012e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,,10/01/2017 05:22,8.18689e+17,en,,,17620,4655


In [53]:
#Show general info
tweets_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52542 entries, 0 to 52541
Data columns (total 10 columns):
author              52542 non-null object
content             52542 non-null object
country             36 non-null object
date_time           52542 non-null object
id                  52542 non-null float64
language            52542 non-null object
latitude            1 non-null float64
longitude           1 non-null float64
number_of_likes     52542 non-null int64
number_of_shares    52542 non-null int64
dtypes: float64(3), int64(2), object(5)
memory usage: 4.0+ MB


In [54]:
#Tweets per person 
tweets_raw['author'].value_counts()

TheEllenShow     3147
jimmyfallon      3123
ArianaGrande     3104
YouTube          3077
KimKardashian    2939
katyperry        2924
selenagomez      2913
rihanna          2877
BarackObama      2863
britneyspears    2776
instagram        2577
shakira          2530
Cristiano        2507
jtimberlake      2478
ladygaga         2329
Twitter          2290
ddlovato         2217
taylorswift13    2029
justinbieber     2000
cnnbrk           1842
Name: author, dtype: int64

In [55]:
# 149 contains: #, @ i https
# 198 contains two #
# 114 contains two https

tweet_nr = 149
tweet = tweets_raw['content'][tweet_nr]
print(tweet)

YOU GOT THIS @HillaryClinton #DEBATES https://t.co/IZgi0yL9T2


In [56]:
def find_urls(text):
    "finds all URLs in the given text and returns the list of them"
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    return urls

In [57]:
def nr_of_urls(text):
    return len(find_urls(text))

In [58]:
def remove_urls(text):
    return re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

In [59]:
def find_mentioned(text):
    "finds all mentions in the given text and returns the list of them. Ommits emails."
    # this line removes email adresses
    text = re.sub("[\w]+@[\w]+\.[c][o][m]", "", text)
    mentions = re.findall('@([a-zA-Z0-9]{1,15})', text)
    return mentions

In [60]:
def count_mentioned(text):
    return len(find_mentioned(text))

In [61]:
def remove_mentions_and_emails(text):
    text = re.sub("[\w]+@[\w]+\.[c][o][m]", "", text)
    text = re.sub("@([a-zA-Z0-9]{1,15})", "", text)
    return text

In [62]:
def find_hashtags(text):
    "finds all hashtags in the given text and returns the list of them. Will catch other #."
    #return list(part[1:] for part in text.split() if part.startswith('#')) # this version won't catch hashtags with no blank spaces before them
    return re.findall(r"#(\w+)", text)

In [63]:
def count_hashtags(text):
    return len(find_hashtags(text))

In [64]:
def remove_hashtags(text):
    return re.sub(r"#(\w+)", '', text)

In [65]:
def count_letters(text):
    return len(text)

In [66]:
def count_exclamation(text):
    return text.count('!')

In [67]:
def percent_of_upper(text):
    upper = len(re.findall(r'[A-Z]', text))
    return upper / len(text)

In [68]:
def extract_emojis(text):
    "finds all emoji in the given text and returns the list of them"
    clean_text = regex.findall(r'\X', text)
    return [word for word in clean_text if any(char in emoji.UNICODE_EMOJI for char in word)]

In [69]:
def remove_emojis(text):
    return text.encode('ascii', 'ignore').decode('ascii')

In [70]:
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

def remove_punct(text):
    text  = "".join([char for char in text if char not in punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

In [71]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

In [72]:
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

In [73]:
ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

In [74]:
wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text

In [75]:
def leave_words(text):
    "takes text as list of words. returns list deleting strange things ;)"
    return [word for word in text if re.search('[a-zA-Z]', word) is not None]

In [76]:
def calculate_word_length_list(words_list):
    return list(map(len, words_list))

In [77]:
def calculate_average_word_length(words_list):
    words_len_list = calculate_word_length_list(words_list)
    return stat.mean(words_len_list)

In [78]:
def calculate_std_deviation_word_length(words_list):
    words_len_list = calculate_word_length_list(words_list)
    if len(words_len_list) == 1:
        return 0
    else:
        return stat.stdev(words_len_list)

In [79]:
def calculate_max_word_length(words_list):
    words_len_list = calculate_word_length_list(words_list)
    return max(words_len_list)

In [80]:
def calculate_min_word_length(words_list):
    words_len_list = calculate_word_length_list(words_list)
    return min(words_len_list)

In [81]:
def split_date_time(date_time):
    "takes date as string. return list in form: [day, month, year, hour, minute], every element is converted to int"
    date_time_split = re.sub("[^\w]", " ",  date_time).split()
    return list(map(int, date_time_split))

In [82]:
def calculate_time(date_time):
    "takes date as string. returns time in minutes elapsed from midnight"
    date_time_split = split_date_time(date_time)
    return date_time_split[3]*60 + date_time_split[4]

In [83]:
def calculate_weekday(date_time):
    "takes date as string. returns weekday"
    date_time_split = split_date_time(date_time)
    return datetime.date(date_time_split[2],date_time_split[1],date_time_split[0]).weekday()

In [84]:
def transform_for_words_long_coding(text):
    t = remove_emojis(text)
    t = remove_urls(t)
    t = remove_hashtags(t)
    t = remove_mentions_and_emails(t)
    t = remove_punct(t)
    t = tokenization(t.lower())
    t = leave_words(t)
    return t

In [85]:
def transform_for_words_short_coding(text):
    t = transform_for_words_long_coding(text)
    t = remove_stopwords(t)
    t = stemming(t)
    t = lemmatizer(t)
    return t

In [86]:
def create_tokenizer(texts, type):
    transformed_texts = []
    for t in texts:
        if type == "long":
            transformed_texts.append(transform_for_words_long_coding(t))
        elif type == "short":
            transformed_texts.append(transform_for_words_short_coding(t))
        else:
            print("Incorrect tokenizer type. Must be 'long' or 'short'.")
    # create the tokenizer
    tok = Tokenizer()
    # fit the tokenizer on words
    tok.fit_on_texts(transformed_texts)
    return tok

In [87]:
def create_char_codes(texts, type):
    transformed_texts = []
    for t in texts:
        if type == "long":
            transformed_texts.append(transform_for_words_long_coding(t))
        elif type == "short":
            transformed_texts.append(transform_for_words_short_coding(t))
        else:
            print("Incorrect tokenizer type. Must be 'long' or 'short'.")
    char_list = [c for t in transformed_texts for s in t for c in s]
    unique_chars = sorted(set(char_list))
    return {u:i for i, u in enumerate(unique_chars)}

In [88]:
def get_max_words_nr(texts, type):
    if type == "long":
        return max([len( transform_for_words_long_coding(t) ) for t in texts])
    elif type == "short":
        return max([len( transform_for_words_short_coding(t) ) for t in texts])
    else:
        return None

In [89]:
def add_padding(list_to_extend, basic_len, extended_len):
    list_to_extend.extend([0]*(extended_len - basic_len))
    return list_to_extend

In [90]:
def transform_row(text, author, nr_of_shares, nr_of_likes, date_time, tokenizer_long, tokenizer_short, char_codes, max_words_nr_long, max_words_nr_short, padding):
    "takes tweet text, nr of shares and nr of likes and returns extracted features"
    nr_of_letters = count_letters(text)
    urls_list = find_urls(text)
    urls_nr = len(urls_list)
    hashtag_list = find_hashtags(text)
    hashtag_nr = len(hashtag_list)
    mentioned_list = find_mentioned(text)
    mentioned_nr = len(mentioned_list)
    exclamations_nr = count_exclamation(text)
    emojis_list = extract_emojis(text)
    emojis_nr = len(emojis_list)
    t = remove_emojis(text)
    t = remove_urls(t)
    t = remove_hashtags(t)
    t = remove_mentions_and_emails(t)
    t = remove_punct(t)
    perc_of_upper = percent_of_upper(t)
    t = tokenization(t.lower())
    t = leave_words(t)
    nr_of_words = len(t)
    average_word_len = calculate_average_word_length(t)
    std_dev_word_len = calculate_std_deviation_word_length(t)
    min_word_len = calculate_min_word_length(t)
    max_word_len = calculate_max_word_length(t)
    # first character and words coding
    encoded_tweet_long = tokenizer_long.texts_to_sequences([t])[0]
    if padding and len(encoded_tweet_long) < max_words_nr_long:
        encoded_tweet_long = add_padding(encoded_tweet_long, len(encoded_tweet_long), max_words_nr_long)
    encoded_tweet_chars = [char_codes[c] for s in t for c in s]
    t = remove_stopwords(t)
    t = stemming(t)
    t = lemmatizer(t)
    # second character and words coding
    encoded_tweet_short = tokenizer_short.texts_to_sequences([t])[0]
    if padding and len(encoded_tweet_short) < max_words_nr_short:
        encoded_tweet_short = add_padding(encoded_tweet_short, len(encoded_tweet_short), max_words_nr_short)
    time = calculate_time(date_time)
    weekday = calculate_weekday(date_time)
    return [author, encoded_tweet_long, encoded_tweet_short, encoded_tweet_chars, nr_of_letters, urls_nr, hashtag_nr, mentioned_nr, \
            exclamations_nr, emojis_nr, perc_of_upper, nr_of_words, average_word_len, \
            std_dev_word_len, min_word_len, max_word_len, time, weekday]


In [91]:
def transform(tweets_raw):
    "napisać funkcję, która najpierw tworzy ten słownik (word_index), potem korzystać z niego dla kolejnych tweetów"
    tokenizer_long = create_tokenizer(tweets_raw['content'][:20], "long")
    tokenizer_short = create_tokenizer(tweets_raw['content'][:20], "short")
    char_codes = create_char_codes(tweets_raw['content'][:20], "long")
    max_words_nr_long = get_max_words_nr(tweets_raw['content'][:20], "long")
    max_words_nr_short = get_max_words_nr(tweets_raw['content'][:20], "short")
    #print(tokenizer_short.word_index)
    features_list = [transform_row(row['content'], row['author'], row['number_of_shares'], row['number_of_likes'], row['date_time'], tokenizer_long, tokenizer_short, char_codes, max_words_nr_long, max_words_nr_short, True) \
        for index, row in tweets_raw[:20].iterrows()]
    return pd.DataFrame(features_list, columns = ['author', 'encoded_tweet_long', 'encoded_tweet_short', 'encoded_tweet_chars', 'nr_of_letters', 'urls_nr', \
                                                  'hashtag_nr', 'mentioned_nr', 'exclamations_nr', 'emojis_nr', 'perc_of_upper', \
                                                  'nr_of_words', 'average_word_len', 'std_dev_word_len', 'min_word_len', 'max_word_len', \
                                                  'time', 'weekday']) 

In [92]:
########################################
### CREATING DATAFRAME WITH FEATURES ###
########################################

feature_df = transform(tweets_raw)
print(feature_df)

       author                                 encoded_tweet_long  \
0   katyperry  [9, 20, 21, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1   katyperry  [23, 10, 2, 11, 24, 25, 12, 26, 3, 2, 27, 28, ...   
2   katyperry  [30, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...   
3   katyperry  [32, 33, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4   katyperry  [13, 35, 36, 37, 2, 38, 0, 0, 0, 0, 0, 0, 0, 0...   
5   katyperry  [14, 39, 40, 41, 42, 43, 44, 45, 15, 46, 0, 0,...   
6   katyperry  [47, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...   
7   katyperry  [49, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...   
8   katyperry  [51, 1, 52, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
9   katyperry  [13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
10  katyperry  [14, 54, 55, 16, 3, 56, 1, 57, 58, 15, 4, 59, ...   
11  katyperry  [60, 61, 62, 1, 63, 64, 5, 65, 66, 67, 0, 0, 0...   
12  katyperry  [5, 68, 69, 70, 71, 6, 7, 72, 17, 73, 0, 0, 0,...   
13  katyperry  [5, 11, 17, 74, 9, 75, 76, 77, 0,

In [93]:
# open a file, where you ant to store the data
file = open('../data/tweets_features', 'wb')
# dump information to that file
pickle.dump(feature_df, file)
# close the file
file.close()

testy transform_row function

In [94]:
which_tweet = 4
tweet = tweets_raw['content'][which_tweet]
author = tweets_raw['author'][which_tweet]
nr_of_shares = tweets_raw['number_of_shares'][which_tweet]
nr_of_likes = tweets_raw['number_of_likes'][which_tweet]
date_time = tweets_raw['date_time'][which_tweet]
print(tweet)
tokenizer_long = create_tokenizer(tweets_raw['content'][:20], "long")
tokenizer_short = create_tokenizer(tweets_raw['content'][:20], "short")
char_codes = create_char_codes(tweets_raw['content'][:20], "long")
max_words_nr_long = get_max_words_nr(tweets_raw['content'][:20], "long")
max_words_nr_short = get_max_words_nr(tweets_raw['content'][:20], "short")
#print(tokenizer_long.word_index)
#print(tokenizer_short.word_index)
transform_row(tweet, author, nr_of_shares, nr_of_likes, date_time, tokenizer_long, tokenizer_short, char_codes, max_words_nr_long, max_words_nr_long, True)

SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ https://t.co/0shuUYUBEv


['katyperry',
 [13, 35, 36, 37, 2, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [2, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [17,
  8,
  17,
  18,
  4,
  16,
  17,
  0,
  16,
  4,
  3,
  14,
  8,
  13,
  8,
  18,
  5,
  14,
  16,
  18,
  7,
  4,
  12,
  17,
  4,
  11,
  20,
  4,
  17],
 67,
 1,
 0,
 0,
 1,
 3,
 0.8055555555555556,
 6,
 4.833333333333333,
 3.0605010483034745,
 2,
 10,
 322,
 1]