In [52]:
import logging
import pandas as pd
import os
import numpy as np

In [53]:
from retrieve_post_by_userid import RetrievePostByUserId

# Client used by get_all_post() to fetch a user's posts.
# The session id is read from the THREADS_SESSIONID environment variable so the
# credential never has to be pasted into (and saved with) the notebook; when the
# variable is not set, the original empty string is used.
t = RetrievePostByUserId(sessionid=os.environ.get("THREADS_SESSIONID", ""))

In [54]:
def get_post_data(data):
    """Flatten a post-listing response into a list of per-post records.

    Walks ``data["data"]["mediaData"]["edges"]``, then every entry of each
    edge's ``node["thread_items"]``, and builds one flat dict per post.

    A level that is missing or ``None`` is treated as "nothing to read", so an
    incomplete response yields an empty (or shorter) list instead of a
    ``KeyError``. Posts whose ``caption``, ``user`` or ``text_post_app_info``
    is missing or ``None`` are skipped.

    Args:
        data: Decoded response (nested dicts/lists) or ``None``.

    Returns:
        list[dict]: One record per usable post with the keys ``id``,
        ``post_pk``, ``caption_text``, ``like_count``, ``taken_at``,
        ``username``, ``user_pk``, ``quote_count``, ``repost_count`` and
        ``reshare_count``.

    Raises:
        RuntimeError: If the payload has an unexpected structure (for example
            a non-iterable ``edges`` value); the original exception is chained.
    """
    if data is None:
        return []

    post_list = []
    try:
        media_data = (data.get("data") or {}).get("mediaData") or {}
        edges = media_data.get("edges") or []
        for edge in edges:
            node = (edge or {}).get("node") or {}
            for thread_item in node.get("thread_items") or []:
                post = (thread_item or {}).get("post")
                if post is None:
                    continue
                caption = post.get("caption")
                user = post.get("user")
                app_info = post.get("text_post_app_info")
                # A missing key is handled exactly like an explicit None value.
                if caption is None or user is None or app_info is None:
                    continue
                post_list.append({
                    "id": post.get("id", ""),
                    "post_pk": post.get("pk", ""),
                    "caption_text": caption.get("text", ""),
                    "like_count": post.get("like_count", ""),
                    "taken_at": post.get("taken_at", ""),
                    "username": user.get("username", ""),
                    "user_pk": user.get("pk", ""),
                    "quote_count": app_info.get("quote_count", 0),
                    "repost_count": app_info.get("repost_count", 0),
                    "reshare_count": app_info.get("reshare_count", 0),
                })
    except Exception as e:
        # The message used to mention "follower" data (copy-paste); this function handles posts.
        raise RuntimeError(f"Errore durante l'elaborazione dei dati dei post: {e}") from e
    return post_list
        

def get_all_post(userid: str):
    """Fetch the posts of one user and return them as flat records.

    Makes a single ``t.retrieve_thread_by_userid(userid)`` call and passes the
    response to ``get_post_data``; no pagination is performed here.

    Args:
        userid: Identifier of the user whose posts are requested.

    Returns:
        list[dict]: The records produced by ``get_post_data`` (possibly empty).

    Raises:
        RuntimeError: If the request or the parsing fails; the original
            exception is chained.
    """
    results = []
    # Bound before the try block so the diagnostic print below cannot raise
    # UnboundLocalError (and hide the real error) when the request itself fails.
    response = None
    try:
        response = t.retrieve_thread_by_userid(userid)
        results.extend(get_post_data(response))
    except Exception as e:
        print(response)
        # The message used to mention "follower" (copy-paste); this function retrieves posts.
        raise RuntimeError(f"Errore generale nel recupero dei post per l'utente {userid}: {e}") from e
    return results


In [55]:
# Load data_parte3.csv with user_pk cast to str, then collect the distinct ids.
df = pd.read_csv("data_parte3.csv").astype({"user_pk": str})
unique_user_pk = df["user_pk"].unique()
len(unique_user_pk)

378

In [56]:
# Load followers_data3.csv with follower_pk cast to str, then collect the distinct ids.
df2 = pd.read_csv("followers_data3.csv").astype({"follower_pk": str})
unique_user_pk2 = df2["follower_pk"].unique()
len(unique_user_pk2)

97446

In [57]:
# Combine both id arrays. An id can occur in both sources, so duplicates are
# dropped (pd.unique keeps first-seen order) to avoid fetching a user twice.
unique_user = pd.unique(np.concatenate([unique_user_pk, unique_user_pk2]))
len(unique_user)

97824

In [58]:
def load_processed_users(file_path):
    """Return the set of ``user_pk`` values already stored in a results CSV.

    Only the ``user_pk`` column is parsed, and it is read as text. Letting
    pandas infer the dtype turns the column into floats as soon as one cell is
    empty, which produced strings such as ``"123.0"`` and ``"nan"`` that never
    match the ids being crawled.

    Args:
        file_path: Path of the CSV file; it must contain a ``user_pk`` column.

    Returns:
        set[str]: Distinct non-missing ``user_pk`` values, or an empty set when
        the file does not exist or contains no data at all.
    """
    if not os.path.exists(file_path):
        return set()

    logging.info(f"Loading processed users from {file_path}")
    try:
        user_pks = pd.read_csv(
            file_path, usecols=["user_pk"], dtype={"user_pk": str}
        )["user_pk"]
    except pd.errors.EmptyDataError:
        # The file exists but holds no header or rows: nothing was processed yet.
        return set()
    return set(user_pks.dropna().tolist())

def save_to_csv(df, file_path):
    """Append ``df`` to a CSV file, writing the header only once.

    The header is written when the target does not exist yet or is a
    zero-byte file; otherwise the rows are appended without a header. Empty
    frames are skipped, because writing one first would create a header-less
    file and every later append would then omit the header too.

    Errors raised while writing are logged and not re-raised (best effort).

    Args:
        df: DataFrame holding the rows to store.
        file_path: Destination CSV path.
    """
    if df is None or df.empty:
        return

    try:
        has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
        df.to_csv(
            file_path,
            mode='a' if has_content else 'w',
            header=not has_content,
            index=False,
        )
        logging.info(f"Saved {len(df)} records to {file_path}")
    except Exception as e:
        logging.error(f"Error saving data to {file_path}: {e}")
        
# CSV that main() appends collected posts to. The user_pk values already stored
# in it are loaded once here and used by main() to skip those users.
csv_file = "post_data3.csv"
processed_users = load_processed_users(csv_file)

In [None]:
import threading
import pandas as pd
from queue import Queue

# Alias read by main(): the combined user-id array built in an earlier cell.
unique_users = unique_user 

def process_user(user, queue):
    """Fetch one user's posts and enqueue them as a DataFrame.

    Used as the thread target in ``main``. An exception raised by
    ``get_all_post`` propagates, in which case nothing is put on ``queue``.

    Args:
        user: Id of the user to fetch.
        queue: Queue that receives the resulting ``pd.DataFrame``.
    """
    user_posts = pd.DataFrame(get_all_post(user))
    queue.put(user_posts)

def _drain_and_save(workers, queue):
    """Join ``workers``, empty ``queue`` and append the non-empty frames to ``csv_file``."""
    for worker in workers:
        worker.join()

    # Every producer has finished, so the queue can be drained without racing.
    frames = []
    while not queue.empty():
        frame = queue.get()
        if not frame.empty:
            frames.append(frame)

    if frames:
        save_to_csv(pd.concat(frames, ignore_index=True), csv_file)


def main(batch_size=20):
    """Fetch the posts of every not-yet-processed user and append them to ``csv_file``.

    Users are handled in batches: one thread per user is started and, once
    ``batch_size`` threads are running, they are joined and their results are
    written out before the next batch begins. A final flush handles the last,
    partial batch.

    The final flush used to sit inside the ``for`` loop. That joined every
    thread right after it was started (no concurrency) and re-saved frames
    that had already been written, producing duplicate rows in the CSV.

    Args:
        batch_size: Number of threads started before their results are
            collected and saved. Defaults to 20, the value previously
            hard-coded.

    NOTE: only users with at least one saved post appear in ``csv_file``, so
    users without posts are requested again on the next run.
    """
    queue = Queue()
    workers = []
    seen = set()  # ids already dispatched in this run (unique_users may repeat an id)
    total = len(unique_users)

    # `i` counts every user visited, skipped ones included, as before.
    for i, user in enumerate(unique_users, start=1):
        if user in processed_users or user in seen:
            continue
        seen.add(user)

        worker = threading.Thread(target=process_user, args=(user, queue))
        workers.append(worker)
        worker.start()

        # Once a full batch of threads is running, wait for it and persist its results.
        if len(workers) >= batch_size:
            _drain_and_save(workers, queue)
            workers = []
            print(f"Processati {i} utenti su {total}")

    # Wait for any remaining threads of the last, partial batch.
    _drain_and_save(workers, queue)

    print("Processo completato!")

# Start the crawl when this cell is executed; inside a notebook kernel (or when
# run as a script) __name__ is "__main__".
if __name__ == "__main__":
    main()


Processati 397 utenti su 89611


In [51]:
# Reload the CSV written by the crawl, print its shape and display the frame.
df1 = pd.read_csv("post_data3.csv")
print(df1.shape)
df1

(274314, 10)


Unnamed: 0,id,post_pk,caption_text,like_count,taken_at,username,user_pk,quote_count,repost_count,reshare_count
0,3558539960476269914_1822638,3558539960476269914,US Democratic lawmakers question Trump's block...,17,1738431064,reuters,1822638,1.0,3.0,
1,3558524664562754819_1822638,3558524664562754819,McDonald's settles lawsuit challenging Latino ...,5,1738429241,reuters,1822638,0.0,4.0,
2,3558509530347863285_1822638,3558509530347863285,Crashed US Army Black Hawk unit was responsibl...,31,1738427437,reuters,1822638,5.0,10.0,7.0
3,3558494693358050745_1822638,3558494693358050745,Trump administration seeks access to database ...,46,1738425668,reuters,1822638,2.0,22.0,3.0
4,3558479502042083617_1822638,3558479502042083617,"Horst Koehler, former German president and IMF...",23,1738423857,reuters,1822638,2.0,4.0,
...,...,...,...,...,...,...,...,...,...,...
274309,3475090597859236450_2487,3475090597859236450,Exakt så.,6,1728483103,pierrea,2487,0.0,0.0,
274310,3467073560831882157_2487,3467073560831882157,Ut på tur i finvädret för att testa nya telefo...,10,1727527397,pierrea,2487,0.0,0.0,
274311,3465679352019676474_2487,3465679352019676474,Skakat liv i ett gammalt amerikanskt Apple-kon...,4,1727361195,pierrea,2487,0.0,0.0,
274312,3465361112198300610_2487,3465361112198300610,Om du inte prenumererar på nyhetsbrevet som Af...,7,1727323257,pierrea,2487,0.0,1.0,
