# Extract 4 features from dataset
1. Visibility
2. favorites_count / statues_count
3. followers_count / friends_count
4. Frequency of tweets published since the creation of the account

In [1]:
import numpy as np
import pandas as pd
from pymongo import MongoClient
np.set_printoptions(suppress=True)
#consider inf and -inf to be “NA” in computations
pd.options.mode.use_inf_as_na = True

client = MongoClient('localhost', 27017)
db = client['if29']
collection = db['Tweet Worldcup 200']

In [2]:
cursor = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            "friends_count" : {"$last" : "$user.friends_count"},
            "listed_count" : {"$last" : "$user.listed_count"},
            "favourites_count" : {"$last" : "$user.favourites_count"},
            "statuses_count" : {"$last" : "$user.statuses_count"},
            "followers_count" : {"$last" : "$user.followers_count"},
            "created_time" : {"$last" : "$user.created_at"}
        }
        }
    ]
)
df = pd.json_normalize(cursor)

In [3]:
df

Unnamed: 0,_id,friends_count,listed_count,favourites_count,statuses_count,followers_count,created_time
0,2907244978,1703,287,44979,52950,1283,Sat Dec 06 03:08:09 +0000 2014
1,232747052,187,3,3,3190,266,Sat Jan 01 05:01:13 +0000 2011
2,903630554582708231,81,0,453,1621,40,Fri Sep 01 14:48:07 +0000 2017
3,945643630428958720,102,0,7184,4437,144,Tue Dec 26 13:13:05 +0000 2017
4,899432288634535936,280,0,185,26,25,Mon Aug 21 00:45:43 +0000 2017
...,...,...,...,...,...,...,...
267462,803586894156152832,94,1,261,2708,245,Tue Nov 29 13:10:20 +0000 2016
267463,159834730,258,0,267,1718,43,Sat Jun 26 13:09:10 +0000 2010
267464,972057848338370560,51,0,396,96,11,Fri Mar 09 10:33:45 +0000 2018
267465,361212291,615,204,1165,3997,871,Wed Aug 24 12:43:25 +0000 2011


### Export to .csv file

In [4]:
df.iloc[:,range(0,6)].to_csv("./data/feature_raw.csv")

## Feature constuct (vis, r_fri_follow, avg_fav, frequency)

### Visibility
#### Group tweet texts by each user

In [5]:
cursor1 = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            
            "tweets" : {"$push": "$text"}
        }
        }
    ]
)
df1 = pd.json_normalize(cursor1)

In [6]:
df1

Unnamed: 0,_id,tweets
0,531585010,[#WorldCup starts today! Woo! A bit bummed tha...
1,1006397655549915136,[India is going to be 'Begaani shaadi mai abdu...
2,559816619,[RT @PurelyFootball: We are giving away a pers...
3,133115653,[RT @SaudiNT: 📹 | وصول صُقورنا الخُضر لملعب ال...
4,286859311,[#WorldCup Day !!! Not Long to Go Now 😅]
...,...,...
267462,233602734,[RT @ani_nomso: How do you hate from outside t...
267463,2800742344,[Biggest anime betrayal if this happens https:...
267464,2396603382,[RT @rotabet: Dünya Kupası'nın açılış maçında ...
267465,81326054,[Any sun dream team leagues out there I can jo...


### calculate visibility

In [7]:
def calcul_visibility(tweets):
    s = 0
    for tweet in tweets:
        s += tweet.count("@")*11.4 
        s += tweet.count("#")*11.6
    return s/(140*len(tweets))

In [8]:
visibilities = []
for texts in df1.iloc[:,1]:
    visibilities.append(calcul_visibility(texts))

### favorites_count / statues_count

In [9]:
avg_fav = df.favourites_count/df.statuses_count

### followers_count / friends_count

In [10]:
r_fri_follow = df.followers_count/df.friends_count

### Frequency of tweets published since the creation of the account with t0=01/01/2019
$ I_1 = \frac{N_t}{t_0-t} $

In [11]:
import time
def ratio(row):
    time_array = time.strptime(row["created_time"], "%a %b %d %H:%M:%S +0000 %Y")
    time_stamp = time.mktime(time_array)
    time_stamp_0 = time.mktime(time.strptime("Mon Jan 1 00:00:00 2019", '%a %b %d %H:%M:%S %Y'))
    return row["statuses_count"]*100/(time_stamp_0 - time_stamp)

In [12]:
frequency = df.apply(ratio, axis=1)

### Generate a dataframe with 4 features

In [13]:
features = pd.DataFrame()
features["vis"] = visibilities
features["r_fri_follow"] = r_fri_follow
features["avg_fav"] = avg_fav
features["frequency"] = frequency

In [14]:
features.isnull().sum()

vis                0
r_fri_follow    1264
avg_fav            0
frequency          0
dtype: int64

In [15]:
features.dropna(inplace=True)

In [16]:
features

Unnamed: 0,vis,r_fri_follow,avg_fav,frequency
0,0.082857,0.753376,0.849462,0.041217
1,0.165714,1.422460,0.000940,0.001264
2,0.164286,0.493827,0.279457,0.003857
3,0.412857,1.411765,1.619112,0.013863
4,0.082857,0.089286,7.115385,0.000060
...,...,...,...,...
267462,0.081429,2.606383,0.096381,0.004111
267463,0.040714,0.166667,0.155413,0.000639
267464,0.164286,0.215686,4.125000,0.000373
267465,0.248571,1.416260,0.291469,0.001722


## Export features to .csv file

In [17]:
features.to_csv("./data/features.csv")