In [None]:
from __future__ import annotations
from scraping_kit.utils import get_datetime
from scraping_kit.graph_follow import GraphFollows, KWClusters, GraphPlotStyle
from scraping_kit.utils_loader import load_db_and_bots, load_profiles

# Spreadsheet that lists the Twitter accounts to analyse.
PROFILES_FILE = "twitter_accounts.xlsx"

# Database handle and bots used by every collection step below.
db_tw, bots = load_db_and_bots()

# Profiles read from the spreadsheet; they are the input of the collection step.
profiles = load_profiles(file_name=PROFILES_FILE)

In [None]:
# Main parameters of the analysis.
# date_i / date_f: initial and final dates between which tweets are considered.
date_i = get_datetime(year=2024, month=1, day=13)
date_f = get_datetime(year=2024, month=1, day=20)
# Number of top users (by followers) used in the graph analysis.
# The follow-link collection makes up to N*(N-1) requests for N = N_BESTS_USERS,
# so raise this value gradually (see the warning in the notebook).
N_BESTS_USERS = 40

# Secondary variables.
# Number of requests that can be made in parallel (suggested range: 10~40).
MAX_WORKERS = 40
# Update interval, in days, for tweets (cheap to collect, 7~14 suggested).
DAYS_TO_UPDATE_TWEETS = 14
# Update interval, in days, for follow links (expensive to collect, >120 suggested).
DAYS_TO_UPDATE_FOLLOWS_LINK = 120
WITH_UPDATE = True      # If False, nothing is collected and the analysis uses only what is already downloaded.

- `date_i:` Initial date where tweets are considered.
- `date_f:` Final date where tweets are considered.
- `N_BESTS_USERS:` Number of top users (ranked by followers) to include in the graph analysis. <span style="color:crimson;">IMPORTANT: read the warning below before changing it.</span>
- `MAX_WORKERS:` Number of requests that can be made in parallel, use between `10~40`. The higher the faster, but you can saturate the API server and get blocked.
- `DAYS_TO_UPDATE_TWEETS:` Number of days after which a user's tweets are collected again. Collecting tweets is not expensive, so you can choose a low number, `7~14`.
- `DAYS_TO_UPDATE_FOLLOWS_LINK:` Number of days after which followers are updated. Use a high number (`>120 or >180`), because followers are expensive to collect.
- `WITH_UPDATE:` Use `False` if you only want to view the information that is already downloaded `and not collect` anything new.


##### <span style="color:crimson;">WARNING: (N_BESTS_USERS)</span>
1. ---> `Remember that you have a quota of 100k requests per month.`
2. The latest tweets are always collected for every user in the list you send, but follow links are only looked up many-to-many among the TOP users you choose with `N_BESTS_USERS`.
3. The system therefore makes at most `N*(N-1)` requests, where `N=N_BESTS_USERS`.
4. If the link between `User_A` and `User_B` was already collected, it does `not make the request`.
5. If you choose a very high number, you can consume all the monthly requests. The table below shows the maximum number of requests for each value.
6. **<span style="color:green;">Recommendation: start with a low value and increase it little by little, collecting information and seeing how the graph changes.</span>**

|N_BESTS_USERS|N_REQUESTS|
|-------------|----------|
|20           |<= 380    |
|40           |<= 1560   |
|60           |<= 3540   |
|80           |<= 6320   |
|100          |<= 9900   |
|120          |<= 14280  |
|140          |<= 19460  |
|160          |<= 25440  |
|180          |<= 32220  |
|200          |<= 39800  |
- The `graph will be plotted` with the `TOP` `N_BESTS_USERS` users.


### Collect and obtain the users with the most followers.

In [None]:
# Collect the data for `profiles` and obtain the N_BESTS_USERS users with the
# most followers. WITH_UPDATE controls whether anything new is downloaded.
users = db_tw.collect_and_get_users(
    profiles=profiles,
    bots=bots,
    date_i=date_i,
    date_f=date_f,
    n_bests_users=N_BESTS_USERS,
    days_to_update_tweets=DAYS_TO_UPDATE_TWEETS,
    days_to_update_follow_link=DAYS_TO_UPDATE_FOLLOWS_LINK,
    with_update=WITH_UPDATE,
    max_workers=MAX_WORKERS,
)

# Follow graph of the selected users for the [date_i, date_f] interval.
graph_follow = GraphFollows.from_users_db(db_tw, users, date_i, date_f)

1. `colors_ranges`
- You can add other ranges, and take colors from here: https://htmlcolorcodes.com/

2. `MODE_PLOT_GRAPH`
- The color of the nodes depends on this variable; it is purely aesthetic.
- Choose it according to what you want to show:
    - `'in'` Count the arrows pointing to the node `(followers)`.
    - `'out'` Count the arrows coming out of the node `(following)`.
    - `'sum'` the sum of the previous 2 `(followers + following)`.

In [None]:
# (value, hex colour) pairs used to colour the nodes; more ranges can be added.
# NOTE(review): the exact meaning of the first element is defined by
# GraphFollows.plot -- confirm before documenting it further.
colors_ranges = [(0, "#666666"), (1, "#E6BE37"), (2, "#ED851D"), (3, "#FF0000")]

# Which arrows are counted for the node colour: 'in', 'out' or 'sum'.
MODE_PLOT_GRAPH = 'in'
WITH_SAVE = True

# Optional: add `plot_style=GraphPlotStyle(background="#fff")` to the call
# below to customise the look of the plot.
cairo_plot = graph_follow.plot(
    with_save=WITH_SAVE,
    colors_ranges=colors_ranges,
    mode=MODE_PLOT_GRAPH,
)
cairo_plot

### Keywords
- You will be able to see all the keywords inside:
    - `data/reports/graph_follows/from_yyyy_mm_dd_to_yyyy_mm_dd/keywords/`
    - Within this folder the following will be generated:
        1. `keywords/clusters`: Keywords for `each cluster` `(user group)`.
        2. `keywords/users`: Keywords for the `N_BESTS_USERS` within the list `individually`.
        3. `keywords/all_users_list`: Keywords for `all users` within the list `individually`.
    - You can change the `time interval`, and `obtain the tweets` that are `saved in the DB`, and `generate the keywords` of those users in that time interval.
    - If you run the same script at different intervals, you will get different folders.

- `MIN_USERS_PER_CLUSTER` must be at least 2. Keywords are only exported for clusters that have at least that number of users.

In [None]:
# Minimum number of users a cluster needs for its keywords to be exported.
MIN_USERS_PER_CLUSTER = 2

# Keywords per cluster (user group).
kw_clusters = KWClusters.from_graph_follow(graph_follow, MIN_USERS_PER_CLUSTER)
kw_clusters.save_keywords_clusters()

# Keywords for the users in the graph, individually.
graph_follow.save_keywords_users()

# Keywords for every profile in the list, for the same time interval.
graph_follow.save_keywords_from_profiles(
    db_tw=db_tw,
    date_i=date_i,
    date_f=date_f,
    profiles=profiles,
)

kw_clusters

### Topics Users
- To collect topics related to users you must run `collect_topics_users.ipynb`.
- They will be stored within: `data/reports/graph_follows/from_yyyy_mm_dd_to_yyyy_mm_dd/topics_user_i`.

In [None]:
# Location of the topics collected by `collect_topics_users.ipynb`.
topics_dir = graph_follow.path_topics_users

# Table of the topics related to the selected users.
df_topics_users = users.df_topics_users(topics_dir)
df_topics_users