# Extract 4 features from dataset
1. Visibility
2. favorites_count / statues_count
3. followers_count / friends_count
4. Frequency of tweets published since the creation of the account

In [1]:
import numpy as np
import pandas as pd
from pymongo import MongoClient
np.set_printoptions(suppress=True)
#consider inf and -inf to be “NA” in computations
pd.options.mode.use_inf_as_na = True

client = MongoClient('localhost', 27017)
db = client['if29']
collection = db['Tweet Worldcup 200']

In [2]:
cursor = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            "friends_count" : {"$last" : "$user.friends_count"},
            "listed_count" : {"$last" : "$user.listed_count"},
            "favourites_count" : {"$last" : "$user.favourites_count"},
            "statuses_count" : {"$last" : "$user.statuses_count"},
            "followers_count" : {"$last" : "$user.followers_count"},
            "created_time" : {"$last" : "$user.created_at"}
        }
        }
    ]
)
df = pd.json_normalize(cursor)

In [3]:
df

Unnamed: 0,_id,friends_count,listed_count,favourites_count,statuses_count,followers_count,created_time
0,3951606867,559,0,39,668,39,Wed Oct 14 01:52:09 +0000 2015
1,758966916593496064,49,0,75,446,28,Fri Jul 29 10:06:28 +0000 2016
2,2494785421,448,10,1585,14300,263,Wed May 14 19:53:07 +0000 2014
3,1140595902,2566,0,7625,5672,2838,Fri Feb 01 20:03:57 +0000 2013
4,899575572732551168,533,0,15472,8576,176,Mon Aug 21 10:15:04 +0000 2017
...,...,...,...,...,...,...,...
267462,264143196,369,1,3378,22232,204,Fri Mar 11 12:05:28 +0000 2011
267463,356977475,4010,3,68281,36246,578,Wed Aug 17 17:13:55 +0000 2011
267464,235208639,1758,18,2205,4170,688,Fri Jan 07 16:38:01 +0000 2011
267465,2406883352,1070,14,266,27934,2304,Sun Mar 23 12:20:44 +0000 2014


## Feature constuct (vis, r_fri_follow, avg_fav, frequency)

### Visibility
#### Group tweet texts by each user

In [4]:
cursor1 = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            
            "tweets" : {"$push": "$text"}
        }
        }
    ]
)
df1 = pd.json_normalize(cursor1)

In [5]:
df1

Unnamed: 0,_id,tweets
0,279680649,[RT @Okwonga: A #WorldCup thread for you 👇🏿 ht...
1,253284367,[¡Inició la fiebre! ⚽\nHoy inicia el mundial d...
2,1000127090,[RT @SpencerOwen: Today is the day! The 2018 #...
3,827542176934543360,[RT @jbguegan: Faites vous plaisir ! Avec @QMi...
4,718314336486354944,[RT @worIdcupfan: The #WorldCup⁠⁠ is finally h...
...,...,...
267462,91510594,"[HellO #WorldCup, hello defending champions! L..."
267463,4045338442,[The #WorldCup gets underway today. \n\nWhich...
267464,342564165,[RT @ASRomaEN: #ASRoma fans voted these two Ni...
267465,2730613563,[RT @PatriaNostraa: Jour J ! 🔥😍 #CoupeduMonde2...


### Calculate visibility

In [6]:
def calcul_visibility(tweets):
    s = 0
    for tweet in tweets:
        s += tweet.count("@")*11.4 
        s += tweet.count("#")*11.6
    return s/(140*len(tweets))

In [7]:
visibilities = []
for texts in df1.iloc[:,1]:
    visibilities.append(calcul_visibility(texts))

### favorites_count / statues_count

In [8]:
avg_fav = df.favourites_count/df.statuses_count

### followers_count / friends_count

In [9]:
r_fri_follow = df.followers_count/df.friends_count

In [10]:
r_fri_follow.to_csv("./r_fri_follow.csv")

### Frequency of tweets published since the creation of the account with t0=01/01/2019
$ I_1 = \frac{N_t}{t_0-t} $

In [21]:
import time
def ratio1(n_tweets, date):
    time_array = time.strptime(date, "%a %b %d %H:%M:%S +0000 %Y")
    time_stamp = time.mktime(time_array)
    time_stamp_0 = time.mktime(time.strptime("Mon Jan 1 00:00:00 2019", '%a %b %d %H:%M:%S %Y'))
    return n_tweets/(time_stamp_0 - time_stamp)

In [17]:
import time
def ratio(row):
    time_array = time.strptime(row["created_time"], "%a %b %d %H:%M:%S +0000 %Y")
    time_stamp = time.mktime(time_array)
    time_stamp_0 = time.mktime(time.strptime("Mon Jan 1 00:00:00 2019", '%a %b %d %H:%M:%S %Y'))
    return row["statuses_count"]*100/(time_stamp_0 - time_stamp)

In [18]:
frequency = df.apply(ratio, axis=1)

In [19]:
frequency

0         0.000658
1         0.000583
2         0.009781
3         0.003040
4         0.019947
            ...   
267462    0.009021
267463    0.015576
267464    0.001656
267465    0.018533
267466    0.015114
Length: 267467, dtype: float64

In [22]:
frequency1 = []
for index, row in df.iterrows():
    n_tweet = row["statuses_count"]
    created_date = row["created_time"]
    frequency1.append(ratio1(n_tweet, created_date)*100)

In [23]:
frequency1

[0.0006580187050879454,
 0.0005828723457809949,
 0.009780639208272168,
 0.0030404429895417285,
 0.01994701759524189,
 0.01575330137068419,
 0.0061281336576781455,
 0.0003062211502899867,
 0.15355430372763174,
 0.001308703747469901,
 0.003705240932924808,
 0.0005335279812447483,
 0.00042154445994412515,
 0.00012208154389985,
 0.00029304557840287396,
 0.0038488800658143985,
 0.0019154195152015282,
 0.0005749876504864803,
 0.0016243219936675905,
 0.00237376971439803,
 0.0005912229036879656,
 0.0010932925779490576,
 0.0015351652465448672,
 0.00011785848714201276,
 0.011614667227697016,
 0.001777810941234671,
 0.0011097706221171754,
 0.001721418145640948,
 0.023363515507219042,
 0.0476294149375952,
 0.035155031457981746,
 0.007177298381942499,
 0.0008212082897987898,
 0.007243686778191687,
 0.0015248097544175582,
 0.0015889678180403223,
 8.263065509605394e-05,
 0.020840421386358894,
 0.011180306426088638,
 0.011182846271908055,
 0.002131688116261692,
 0.006528470162695921,
 0.00080319103754

### Generate a dataframe with 4 features

In [38]:
features = pd.DataFrame()
features["vis"] = visibilities
features["r_fri_follow"] = r_fri_follow
features["avg_fav"] = avg_fav
features["frequency"] = frequency

In [41]:
features.isnull().sum()

vis             0
r_fri_follow    0
avg_fav         0
frequency       0
dtype: int64

In [40]:
features.dropna(inplace=True)

In [42]:
features.to_csv("./features.csv")

## Export features to .csv file

In [98]:
features.to_csv("./features.csv")