In [45]:
import logging
import pandas as pd
import os
import numpy as np

In [46]:
from retrieve_post_by_userid import RetrievePostByUserId
t = RetrievePostByUserId(sessionid="")

In [47]:
def get_post_data(data):
    """Flatten a thread-feed response into a list of flat post records.

    Walks ``data["data"]["mediaData"]["edges"]``; for every edge it reads
    ``edge["node"]["thread_items"]`` and, for every item, its ``"post"``.
    Posts whose ``caption``, ``user`` or ``text_post_app_info`` is missing or
    ``None`` are skipped.

    Args:
        data: Parsed response (nested dicts/lists), or ``None``.

    Returns:
        A list of dicts with the keys ``id``, ``post_pk``, ``caption_text``,
        ``like_count``, ``taken_at``, ``username``, ``user_pk``,
        ``quote_count``, ``repost_count`` and ``reshare_count``. An empty list
        is returned when any level on the way to ``edges`` is absent or
        ``None``.

    Raises:
        RuntimeError: If the payload is shaped in an unexpected way (for
            example an edge without ``node``/``thread_items``). The original
            exception is chained as ``__cause__``.
    """
    try:
        # Use .get() at every level: a level that is *absent* must behave the
        # same as one that is present but None, instead of raising KeyError.
        payload = (data or {}).get("data") or {}
        media = payload.get("mediaData") or {}
        edges = media.get("edges")
        if edges is None:
            return []

        post_list = []
        for edge in edges:
            for thread_item in edge["node"]["thread_items"]:
                post = thread_item.get("post") or {}
                caption = post.get("caption")
                user = post.get("user")
                app_info = post.get("text_post_app_info")
                # Incomplete posts are skipped rather than failing the whole user.
                if caption is None or user is None or app_info is None:
                    continue
                post_list.append({
                    "id": post.get("id", ""),
                    "post_pk": post.get("pk", ""),
                    "caption_text": caption.get("text", ""),
                    "like_count": post.get("like_count", ""),
                    "taken_at": post.get("taken_at", ""),
                    "username": user.get("username", ""),
                    "user_pk": user.get("pk", ""),
                    "quote_count": app_info.get("quote_count", 0),
                    "repost_count": app_info.get("repost_count", 0),
                    "reshare_count": app_info.get("reshare_count", 0),
                })
        return post_list
    except Exception as e:
        # NOTE(review): the message says "follower" although this parses posts;
        # kept verbatim in case anything matches on the text.
        raise RuntimeError(f"Errore durante l'elaborazione dei dati dei follower: {e}") from e
        

def get_all_post(userid: str):
    """Fetch and flatten the posts of one user.

    Calls ``t.retrieve_thread_by_userid(userid)`` on the module-level client
    ``t`` and passes the response through ``get_post_data``.

    Args:
        userid: User identifier handed to the client as-is.

    Returns:
        A list with the post records produced by ``get_post_data``.

    Raises:
        RuntimeError: If the fetch or the parsing fails; the original
            exception is chained as ``__cause__``.
    """
    results = []
    # Bound before the try so the except block can refer to it even when the
    # fetch itself raised and nothing was ever assigned.
    response = None
    try:
        response = t.retrieve_thread_by_userid(userid)
        d_out = get_post_data(response)
        results.extend(d_out)
    except Exception as e:
        if response is not None:
            # Dump the raw payload to help diagnose parsing failures.
            print(response)
        raise RuntimeError(f"Errore generale nel recupero dei follower per l'utente {userid}: {e}") from e
    return results


In [48]:
# Load the first account list and normalise its IDs to strings in one step,
# so they compare equal to IDs coming from the other CSV files.
df = pd.read_csv("data_parte2.csv").astype({"user_pk": str})
unique_user_pk = df["user_pk"].unique()
# Bare last expression: the notebook displays the number of distinct IDs.
len(unique_user_pk)

293

In [49]:
# Load the follower list and normalise the follower IDs to strings in one
# step, matching the string IDs used for the other account list.
df2 = pd.read_csv("followers_data2.csv").astype({"follower_pk": str})
unique_user_pk2 = df2["follower_pk"].unique()
# Bare last expression: the notebook displays the number of distinct followers.
len(unique_user_pk2)

88258

In [50]:
# Merge both ID lists. Each input is unique on its own, but the same ID can
# appear in both, so de-duplicate after concatenating; otherwise that user
# would be fetched twice and its posts stored twice. pd.unique keeps
# first-seen order.
unique_user = pd.unique(np.concatenate([unique_user_pk, unique_user_pk2]))
len(unique_user)

88551

In [51]:
def load_processed_users(file_path):
    """Return the set of ``user_pk`` values already stored in a CSV file.

    Args:
        file_path: Path of the CSV file; it must have a ``user_pk`` column
            when it exists and is not empty.

    Returns:
        A set of ``user_pk`` strings. The set is empty when the file does not
        exist or has no content. Rows with a blank ``user_pk`` are ignored.

    Raises:
        KeyError: If the file has data but no ``user_pk`` column.
    """
    if not os.path.exists(file_path):
        return set()

    logging.info(f"Loading processed users from {file_path}")
    try:
        # Read the IDs as text. Parsing them as numbers and converting back
        # turns the whole column into floats ("123.0") as soon as one row has
        # a blank user_pk, and those strings never match the real IDs.
        df = pd.read_csv(file_path, dtype={'user_pk': str})
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. left by an interrupted run) means no progress.
        return set()
    return set(df['user_pk'].dropna())

def save_to_csv(df, file_path):
    """Append a DataFrame to a CSV file, writing the header only once.

    Failures are logged and not re-raised, so one bad batch does not stop the
    caller.

    Args:
        df: Frame to persist. An empty frame is ignored.
        file_path: Destination CSV; created when missing.
    """
    try:
        if df.empty:
            # Writing an empty frame to a new file would create it without a
            # header row, and every later append would then skip the header.
            return
        # The header is needed when the file is new or exists with no content.
        needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
        df.to_csv(file_path, mode='a', header=needs_header, index=False)
        logging.info(f"Saved {len(df)} records to {file_path}")
    except Exception as e:
        logging.error(f"Error saving data to {file_path}: {e}")
        
# Output CSV that collected posts are appended to.
csv_file = "post_data2.csv"
# user_pk values already present in that file (an empty set when the file does
# not exist yet); main() skips these users.
processed_users = load_processed_users(csv_file)

In [None]:
import threading
import pandas as pd
from queue import Queue

# Alias for the merged ID array built in an earlier cell; main() iterates over
# this name.
unique_users = unique_user 

def process_user(user, queue):
    """Thread worker: fetch one user's posts and queue them as a DataFrame.

    Args:
        user: User ID passed to ``get_all_post``.
        queue: Queue that receives the resulting ``pandas.DataFrame``.
    """
    queue.put(pd.DataFrame(get_all_post(user)))

def _flush_batch(threads, queue):
    """Join every thread in ``threads``, then append what they queued to the CSV."""
    for worker in threads:
        worker.join()
    threads.clear()

    # All producers have been joined, so polling empty() cannot race with a put().
    frames = []
    while not queue.empty():
        frames.append(queue.get())

    # Users without posts yield empty frames; drop them so they neither reach
    # the CSV nor influence the dtypes of the concatenated result.
    frames = [frame for frame in frames if not frame.empty]
    if frames:
        save_to_csv(pd.concat(frames, ignore_index=True), csv_file)


def main():
    """Fetch posts for every not-yet-processed user, in batches of threads.

    Iterates over the module-level ``unique_users`` and skips IDs contained in
    ``processed_users``. One thread per user runs ``process_user``. Whenever
    ``batch_size`` threads are in flight they are joined and their results are
    appended to ``csv_file`` via ``save_to_csv``. Whatever is left after the
    loop is flushed once at the end.
    """
    batch_size = 20
    queue = Queue()
    threads = []
    total = len(unique_users)

    # i counts every user seen, including skipped ones, for the progress line.
    for i, user in enumerate(unique_users, start=1):
        if user in processed_users:
            continue

        thread = threading.Thread(target=process_user, args=(user, queue))
        threads.append(thread)
        thread.start()

        if len(threads) >= batch_size:
            _flush_batch(threads, queue)
            print(f"Processati {i} utenti su {total}")

    # Wait for any remaining threads. This must run once, after the loop:
    # running it on every iteration joins each thread immediately and
    # re-saves frames that were already written.
    _flush_batch(threads, queue)

    print("Processo completato!")

# Start the crawl when this code runs as the top-level module, which is the
# case when the cell is executed in a notebook kernel.
if __name__ == "__main__":
    main()


Processati 374 utenti su 82176
Processati 427 utenti su 82176


In [None]:
# Reload the collected posts from disk to inspect what has been gathered.
df1 = pd.read_csv("post_data2.csv")
print(df1.shape)
# Bare last expression so the notebook renders the frame.
df1

(156589, 10)


Unnamed: 0,id,post_pk,caption_text,like_count,taken_at,username,user_pk,quote_count,repost_count,reshare_count
0,3558466520929870405_192592722,3558466520929870405,Artist exploring a theme: “I leaned on my life...,19,1738422287,thebrianpenny,192592722,0.0,2.0,
1,3558107406301138069_192592722,3558107406301138069,This year’s championship teams will go to the ...,8,1738379477,thebrianpenny,192592722,0.0,2.0,
2,3558036359623693731_192592722,3558036359623693731,I can’t imagine earning at least $4 billion a ...,20,1738371008,thebrianpenny,192592722,0.0,0.0,
3,3558036885748607372_192592722,3558036885748607372,"On the bright side, we are that much closer to...",2,1738371071,thebrianpenny,192592722,0.0,0.0,
4,3557844724549927392_192592722,3557844724549927392,Even with this week’s clarification from the U...,12,1738348163,thebrianpenny,192592722,0.0,1.0,
...,...,...,...,...,...,...,...,...,...,...
156584,3411034357638533503_262659671,3411034357638533503,Hdsptm las chibastardas 🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣,1,1720847004,unbreakable47,262659671,0.0,0.0,
156585,3402708120315932181_262659671,3402708120315932181,🤣🤣🤣🤣🤣🤣,0,1719854439,unbreakable47,262659671,0.0,0.0,
156586,3402696598193881872_262659671,3402696598193881872,A como me salen de estas madres para benepláci...,0,1719853066,unbreakable47,262659671,0.0,0.0,
156587,3402694987899784137_262659671,3402694987899784137,🤣🤣🤣🤣🤣🤣🤣🤣🤣,0,1719852874,unbreakable47,262659671,0.0,0.0,
