In [1]:
from __future__ import annotations
from typing import List
from pathlib import Path
from datetime import datetime, timezone

import pandas as pd

from scraping_kit.graph_follow import GraphFollows, KWClusters
from scraping_kit.utils_loader import load_db_and_bots


def load_profiles(path_data: Path):
    """Return the screen names to study, most-followed accounts first.

    Reads ``twitter_accounts.xlsx`` from ``path_data``, orders its rows by
    the ``followersCount`` column in descending order, and returns the
    ``screenName`` column as a plain Python list.

    TODO: Ask Vera how she wants to import the users to study.
    """
    accounts = pd.read_excel(path_data / "twitter_accounts.xlsx")
    ranked = accounts.sort_values("followersCount", ascending=False)
    return ranked["screenName"].to_list()


# Open the database handle and the scraper bots; both are reused by the cells below.
db_tw, bots = load_db_and_bots()
# Screen names to study, read from the spreadsheet in the database's data directory.
profiles = load_profiles(db_tw.path_data)

Collection Names: ['tweet_user', 'raw', 'cursors', 'trends', 'search', 'user', 'user_suspended', 'topics', 'follows']
Bots: bots=[BotScraper(acc_name='Vera')]


In [None]:
# Date window (UTC) in which tweets are considered.
date_i = datetime(2024, 1, 13, tzinfo=timezone.utc)
# The constructor keyword is `tzinfo`; `tz=` raises TypeError.
date_f = datetime(2024, 1, 20, tzinfo=timezone.utc)
N_BESTS_USERS = 40                  # TOP users (by followers) used in the graph analysis.
MAX_WORKERS = 40                    # Parallel requests; recommended range is 10~40.
DAYS_TO_UPDATE_TWEETS = 14          # Cheap to refresh; a low value (7~14) is fine.
DAYS_TO_UPDATE_FOLLOWS_LINK = 120   # Expensive to refresh; keep this high (>120).
WITH_UPDATE = False      # If it is false, it will do the analysis with what you have downloaded.

- `date_i:` Initial date where tweets are considered.
- `date_f:` Final date where tweets are considered.
- `N_BESTS_USERS:` Number of TOP users (those with the most followers) included in the graph analysis. <span style="color:crimson;">IMPORTANT!!! Read the warning below.</span>
- `MAX_WORKERS:` Number of requests that can be made in parallel, use between `10~40`. The higher the faster, but you can saturate the API server and get blocked.
- `DAYS_TO_UPDATE_TWEETS:` Number of days after which a user's tweets are collected again. It is not expensive, so you can choose a low number, `7~14`.
- `DAYS_TO_UPDATE_FOLLOWS_LINK:` Number of days after which follower links are updated. Use a high number (`>120` or `>180`), because they are expensive to collect.
- `WITH_UPDATE:` Use `False` if you only want to view information `and not collect` it.


##### <span style="color:crimson;">WARNING: (N_BESTS_USERS)</span>
1. ---> `Remember that you have a quota of 100k requests per month.`
2. For every account in the list you send, the latest Tweets are always collected, but follower links are only looked up many-to-many among the TOP accounts you select.
3. The system then makes up to `N*(N-1)` requests, where `N=N_BESTS_USERS`.
4. If the link between `User_A` and `User_B` was already collected, it does `not make the request`.
5. If you choose a very high number, you can consume all the monthly requests. The table below shows the number of requests for each value.
6. **<span style="color:green;">Note: I would use a low value and increase it little by little, collecting information and seeing how the graph changes.</span>**

|N_BESTS_USERS|N_REQUESTS|
|-------------|----------|
|20           |<= 380    |
|40           |<= 1560   |
|60           |<= 3540   |
|80           |<= 6320   |
|100          |<= 9900   |
|120          |<= 14280  |
|140          |<= 19460  |
|160          |<= 25440  |
|180          |<= 32220  |
|200          |<= 39800  |
- The `graph will be plotted` with the `TOP` of `N_BESTS_USERS`


### Collect and obtain the users with the most followers.

In [None]:
# Collect (only when WITH_UPDATE is true, per the settings above) and fetch the
# users to analyse, limited to the N_BESTS_USERS with the most followers.
users = db_tw.collect_and_get_users(
    profiles=profiles,
    bots=bots,
    date_i=date_i,
    date_f=date_f,
    n_bests_users=N_BESTS_USERS,
    days_to_update_tweets=DAYS_TO_UPDATE_TWEETS,
    days_to_update_follow_link=DAYS_TO_UPDATE_FOLLOWS_LINK,
    with_update=WITH_UPDATE,
    max_workers=MAX_WORKERS,
)
# Build the follow graph from those users over the same date window.
graph_follow = GraphFollows.from_users_db(db_tw, users, date_i, date_f)

In [None]:
# (value, hex colour) pairs passed to the plot.
# NOTE(review): presumably each value is the lower bound at which the colour applies — confirm in GraphFollows.plot.
colors_ranges = [(0, "#666"), (1, "#E6BE37"), (2, "#ED851D"), (3, "#FF0000")]
MODE_PLOT_GRAPH = 'in'  # Can use 'in', 'out' or 'sum'.
WITH_SAVE = True

cairo_plot = graph_follow.plot(
    with_save=WITH_SAVE,
    colors_ranges=colors_ranges,
    mode=MODE_PLOT_GRAPH,
)
# Bare last expression so the notebook renders the plot.
cairo_plot

In [None]:
# Second argument to KWClusters.from_graph_follow.
# NOTE(review): presumably clusters with fewer users than this are dropped — confirm.
MIN_USERS_PER_CLUSTER = 2
kw_clusters = KWClusters.from_graph_follow(
    graph_follow,
    MIN_USERS_PER_CLUSTER,
)
# Bare last expression so the notebook displays the clusters.
kw_clusters