# Extract 4 features from dataset
1. Visibility
2. favorites_count / statues_count
3. followers_count / friends_count
4. Frequency of tweets published since the creation of the account

In [8]:
import numpy as np
import pandas as pd
from pymongo import MongoClient
np.set_printoptions(suppress=True)
#consider inf and -inf to be “NA” in computations
pd.options.mode.use_inf_as_na = True

client = MongoClient('localhost', 27017)
db = client['if29']
collection = db['Tweet Worldcup 200']

In [9]:
cursor = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            "friends_count" : {"$last" : "$user.friends_count"},
            "listed_count" : {"$last" : "$user.listed_count"},
            "favourites_count" : {"$last" : "$user.favourites_count"},
            "statuses_count" : {"$last" : "$user.statuses_count"},
            "followers_count" : {"$last" : "$user.followers_count"},
            "created_time" : {"$last" : "$user.created_at"}
        }
        }
    ]
)
df = pd.json_normalize(cursor)

In [10]:
df

Unnamed: 0,_id,friends_count,listed_count,favourites_count,statuses_count,followers_count,created_time
0,353576074,1086,0,661,714,289,Fri Aug 12 08:59:38 +0000 2011
1,837488007124963328,266,0,32005,8684,179,Fri Mar 03 02:21:15 +0000 2017
2,423108474,345,1,1330,35264,358,Mon Nov 28 03:01:49 +0000 2011
3,2598356918,4996,1,25639,8706,1071,Tue Jul 01 18:11:48 +0000 2014
4,2494785421,448,10,1585,14300,263,Wed May 14 19:53:07 +0000 2014
...,...,...,...,...,...,...,...
267462,144278441,451,8,1449,5431,112,Sat May 15 20:46:52 +0000 2010
267463,788044945949069313,379,0,3205,552,170,Mon Oct 17 15:52:11 +0000 2016
267464,264143196,369,1,3378,22232,204,Fri Mar 11 12:05:28 +0000 2011
267465,356977475,4010,3,68281,36246,578,Wed Aug 17 17:13:55 +0000 2011


### Export to .csv file

In [11]:
df.iloc[:,range(0,6)].to_csv("./feature_raw.csv")

## Feature constuct (vis, r_fri_follow, avg_fav, frequency)

### Visibility
#### Group tweet texts by each user

In [12]:
cursor1 = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            
            "tweets" : {"$push": "$text"}
        }
        }
    ]
)
df1 = pd.json_normalize(cursor1)

In [13]:
df1

Unnamed: 0,_id,tweets
0,788044945949069313,[RT @talkingbawscom: The 2018 #WorldCup kicks ...
1,356977475,[RT @FIFAWorldCup: The #WorldCup starts today ...
2,423108474,[RT @PurelyFootball: The #WorldCup is finally ...
3,353576074,[RT @jimrainford11: The 2018 World Cup kicks o...
4,2494785421,[RT @Reuters: Female Saudi flag bearers to mis...
...,...,...
267462,3334130291,[#CokeScoreChallenge\nRussia 2 - 1 Saudi Arabi...
267463,833738070,"[RT @VarskySports: ES HOY! Durante un mes, el ..."
267464,837488007124963328,[RT @ChinaDailyUSA: A cool fan of Messi! Serbi...
267465,264143196,[RT @ChampionsLeague: Ronaldinho 🇧🇷\n\n✅ 2006 ...


### calculate visibility

In [14]:
def calcul_visibility(tweets):
    s = 0
    for tweet in tweets:
        s += tweet.count("@")*11.4 
        s += tweet.count("#")*11.6
    return s/(140*len(tweets))

In [15]:
visibilities = []
for texts in df1.iloc[:,1]:
    visibilities.append(calcul_visibility(texts))

### favorites_count / statues_count

In [16]:
avg_fav = df.favourites_count/df.statuses_count

### followers_count / friends_count

In [17]:
r_fri_follow = df.followers_count/df.friends_count

### Frequency of tweets published since the creation of the account with t0=01/01/2019
$ I_1 = \frac{N_t}{t_0-t} $

In [19]:
import time
def ratio(row):
    time_array = time.strptime(row["created_time"], "%a %b %d %H:%M:%S +0000 %Y")
    time_stamp = time.mktime(time_array)
    time_stamp_0 = time.mktime(time.strptime("Mon Jan 1 00:00:00 2019", '%a %b %d %H:%M:%S %Y'))
    return row["statuses_count"]*100/(time_stamp_0 - time_stamp)

In [20]:
frequency = df.apply(ratio, axis=1)

### Generate a dataframe with 4 features

In [22]:
features = pd.DataFrame()
features["vis"] = visibilities
features["r_fri_follow"] = r_fri_follow
features["avg_fav"] = avg_fav
features["frequency"] = frequency

In [23]:
features.isnull().sum()

vis                0
r_fri_follow    1264
avg_fav            0
frequency          0
dtype: int64

In [24]:
features.dropna(inplace=True)

In [25]:
features

Unnamed: 0,vis,r_fri_follow,avg_fav,frequency
0,0.164286,0.266114,0.925770,0.000306
1,0.578571,0.672932,3.685514,0.015026
2,0.164286,1.037681,0.037716,0.015753
3,0.245714,0.214371,2.944980,0.006128
4,0.164286,0.587054,0.110839,0.009781
...,...,...,...,...
267462,0.264286,0.248337,0.266802,0.001994
267463,0.081429,0.448549,5.806159,0.000793
267464,0.164286,0.552846,0.151943,0.009021
267465,0.330000,0.144140,1.883822,0.015576


## Export features to .csv file

In [26]:
features.to_csv("./features.csv")