In [1]:
import numpy as np
import pandas as pd
from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client['if29']
collection = db['Tweet Worldcup 200']

In [54]:
cursor = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            "friends_count" : {"$last" : "$user.friends_count"},
            "listed_count" : {"$last" : "$user.listed_count"},
            "favourites_count" : {"$last" : "$user.favourites_count"},
            "statuses_count" : {"$last" : "$user.statuses_count"},
            "followers_count" : {"$last" : "$user.followers_count"},
            "created_time" : {"$last" : "$user.created_at"}
        }
        }
    ]
)
df = pd.json_normalize(cursor)

In [55]:
df

Unnamed: 0,_id,friends_count,listed_count,favourites_count,statuses_count,followers_count,created_time
0,2494785421,448,10,1585,14300,263,Wed May 14 19:53:07 +0000 2014
1,353576074,1086,0,661,714,289,Fri Aug 12 08:59:38 +0000 2011
2,1140595902,2566,0,7625,5672,2838,Fri Feb 01 20:03:57 +0000 2013
3,899575572732551168,533,0,15472,8576,176,Mon Aug 21 10:15:04 +0000 2017
4,423108474,345,1,1330,35264,358,Mon Nov 28 03:01:49 +0000 2011
...,...,...,...,...,...,...,...
267462,2406883352,1070,14,266,27934,2304,Sun Mar 23 12:20:44 +0000 2014
267463,2376538666,190,0,128,23052,535,Sun Mar 02 17:21:26 +0000 2014
267464,14877969,297,4,1,6017,161,Fri May 23 05:26:32 +0000 2008
267465,3951606867,559,0,39,668,39,Wed Oct 14 01:52:09 +0000 2015


In [3]:
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [8]:
np.set_printoptions(suppress=True)

In [4]:
features = df.iloc[:,[1,2,3,4,5]]

features_scaled = preprocessing.normalize(features)

## Features constuct (vis, r_fri_follow, avg_fav, frequency)

### Visibility
#### Group tweet texts by each user

In [4]:
cursor1 = collection.aggregate(
    [
        {"$group" : {
            "_id" : "$user.id", 
            
            "tweets" : {"$push": "$text"}
        }
        }
    ]
)
df1 = pd.json_normalize(cursor1)

In [5]:
df1

Unnamed: 0,_id,tweets
0,32218443,[Primera dia en Copa Mundial! gooooooooooollll...
1,757470042,"[The 2018 World Cup begins today, and there's ..."
2,466348695,[Uruguay \nVecino\nUruguay https://t.co/pQZwgJ...
3,814458502781501440,"[RT @iambolar: I expect Senegal, Nigeria and E..."
4,865578724112158720,[RT @BleacherReport: One more sleep til #World...
...,...,...
267462,722519839382859776,[Its a World Cup Time!!!! 🙌🏻🍺🍺🙌🏻 ⚽️⚽️ Lets go ...
267463,3221677187,[#worldcup The world cup opens today and we a...
267464,956847210007732224,[QB will cheer for all the teams and players w...
267465,923923725241679872,[RT @MesutOzil1088: Back with the team 😃🇩🇪 Jus...


In [3]:
def calcul_visibility(tweets):
    s = 0
    for tweet in tweets:
        s += tweet.count("@")*11.4 
        s += tweet.count("#")*11.6
    return s/(140*len(tweets))

In [6]:
visibilities = []
for texts in df1.iloc[:,1]:
    visibilities.append(calcul_visibility(texts))

### favorites_count / statues_count

In [18]:
avg_fav = df.favourites_count/df.statuses_count

### followers_count / friends_count

In [19]:
r_fri_follow = df.followers_count/df.friends_count

### Frequency of tweets published since the creation of the account with t0=01/01/2019
$ I_1 = \frac{N_t}{t_0-t} $

In [28]:
projection = { 
    'user.statuses_count': 1,
    'user.created_at': 1,
    '_id': 0}
cursor3 = collection.find({},projection)

In [15]:
import time
def ratio(n_tweets, date):
    time_array = time.strptime(date, "%a %b %d %H:%M:%S +0000 %Y")
    time_stamp = time.mktime(time_array)
    time_stamp_0 = time.mktime(time.strptime("Mon Jan 1 00:00:00 2019", '%a %b %d %H:%M:%S %Y'))
    return n_tweets/(time_stamp_0 - time_stamp)

In [76]:
frequency = []
for index, row in df.iterrows():
    n_tweet = row["statuses_count"]
    created_date = row["created_time"]
    frequency.append(ratio(n_tweet, created_date)*100)

### Generate a dataframe with 4 features

In [77]:
features = pd.DataFrame()
features["vis"] = visibilities
features["r_fri_follow"] = r_fri_follow
features["avg_fav"] = avg_fav
features["frequency"] = frequency

In [78]:
features.to

Unnamed: 0,vis,r_fri_follow,avg_fav,frequency
0,0.082857,2.437204,0.064328,0.009781
1,0.082857,0.290210,0.008942,0.000306
2,0.000000,1.005658,0.175127,0.003040
3,0.122857,1.053957,0.058894,0.019947
4,0.164286,3.666667,1.400000,0.015753
...,...,...,...,...
267462,0.165714,0.377778,0.009201,0.018533
267463,0.082857,1.935252,1.062810,0.015114
267464,0.055238,0.369650,0.794212,0.001797
267465,0.411429,0.554404,3.989836,0.000658


## PCA analyse

In [5]:
pca = PCA(n_components='mle')
pca.fit(features)

PCA(n_components='mle')

In [11]:
pca.n_components_

4

In [9]:
pca.components_

array([[ 0.00245349,  0.00236398,  0.00003716,  0.02813996,  0.99959819],
       [ 0.01771169,  0.00086559,  0.18122699,  0.98289054, -0.02772187],
       [-0.00695963,  0.00023663, -0.98338988,  0.18130181, -0.00505079],
       [ 0.99981191,  0.00280804, -0.01005555, -0.01622154, -0.00200363]])

In [10]:
print(pca.explained_variance_ratio_)

print(pca.singular_values_)

[0.900734   0.08328725 0.01516849 0.00080419]
[94024482.2375293  28591170.62323371 12201515.7587858   2809460.07852549]


## KMeans

In [7]:
from sklearn.cluster import KMeans

In [8]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(features_scaled)

In [26]:
np.unique(kmeans.labels_, return_counts=True)

(array([0, 1]), array([151499, 115968], dtype=int64))

In [9]:
kmeans.cluster_centers_

array([[0.21890709, 0.00112238, 0.70084102, 0.43919502, 0.13412476],
       [0.10003199, 0.00174393, 0.18922492, 0.92186614, 0.1052733 ]])

In [14]:
kmeans1 = KMeans(n_clusters=2).fit(features)
# np.unique(kmeans1.labels_, return_counts=True)

In [15]:
kmeans1.cluster_centers_

array([[1.09921808e+03, 4.14603220e+01, 9.11012590e+03, 2.21766168e+04,
        5.82618084e+03],
       [1.52953846e+04, 2.30896154e+04, 1.75617308e+03, 1.64511654e+05,
        1.03118794e+07]])

In [16]:
pd.DataFrame(features).describe()

Unnamed: 0,friends_count,listed_count,favourites_count,statuses_count,followers_count
count,267467.0,267467.0,267467.0,267467.0,267467.0
mean,1101.978053,45.941264,9108.696,22204.29,7829.848
std,5539.322982,640.463954,25271.81,54745.63,181738.8
min,0.0,0.0,0.0,1.0,0.0
25%,186.0,0.0,380.0,1329.0,124.0
50%,408.0,3.0,1946.0,6275.0,363.0
75%,906.0,14.0,7547.0,21750.5,1019.0
max,544830.0,114627.0,1718515.0,8898917.0,30755370.0
