Objectifs du notebook :

- Nettoyer les données pour obtenir un seul fichier contenant l'échantillon qui sera utilisé.
- Vérifier la présence de valeurs manquantes ou erronées.
- S'assurer de réunir uniquement les données nécessaires pour la base de données.
- Vérifier le typage des données correspondant à la base de données prévue.

[Lien DrawDB](https://drawdb.vercel.app/editor?shareId=e6c18b8ae53063fa1dfa9cc8a849605f)

In [81]:
import matplotlib.pyplot as plt
from rich import print_json, print
from tqdm import tqdm
import pandas as pd
import glob
import os

Récupération de la liste des fichiers du dataset disponible

In [82]:
def get_csv_list(part_number):
    """Return the CSV paths found under ``./data/part_<n>/`` for the given part(s).

    Args:
        part_number: Either a single part identifier (e.g. ``1``) or a
            list, tuple or range of identifiers (e.g. ``range(1, 23)``).

    Returns:
        A list of matching paths. Parts are visited in the order given and
        the files of each part are sorted, because ``glob`` itself returns
        results in arbitrary order. The list is empty when nothing matches.
    """
    # isinstance (instead of ``type(...) == list``) also accepts tuples and
    # ranges, which previously were formatted into a single bogus pattern.
    if isinstance(part_number, (list, tuple, range)):
        parts = part_number
    else:
        parts = [part_number]

    csv_files = []
    for part in parts:
        csv_files.extend(sorted(glob.glob(f"./data/part_{part}/*.csv")))
    return csv_files

Création des DataFrames finaux

In [83]:
# Empty, typed frame for the posts table: one zero-length Series per column
# so the dtypes are fixed before any row is added.
x_post_df = pd.DataFrame({
    column: pd.Series(dtype=kind)
    for column, kind in (
        ("post_id", "int"),
        ("user_id", "int"),
        ("lang", "str"),
        ("text", "str"),
        ("date", "datetime64[ns]"),
        ("like_count", "int"),
        ("reply_count", "int"),
        ("retweet_count", "int"),
        ("quote_count", "int"),
    )
})

# Show the resulting dtypes, then (re)create the parquet file with no rows.
print(x_post_df.dtypes)
x_post_df.to_parquet("x_post.parquet", index=False)

# Same approach for the users table.
x_user_df = pd.DataFrame({
    column: pd.Series(dtype=kind)
    for column, kind in (
        ("user_id", "int"),
        ("name", "str"),
        ("bio", "str"),
        ("followers_count", "int"),
        ("follows_count", "int"),
    )
})

print(x_user_df.dtypes)
x_user_df.to_parquet("x_user.parquet", index=False)

Clean function
- input : chemin du CSV à ajouter, les deux DataFrames existants (posts et utilisateurs)
- output : les deux DataFrames complétés avec les nouvelles lignes

In [84]:
def _flag_is_set(value):
    """Tell whether a ``quotedTweet`` / ``retweetedTweet`` cell marks a quote or retweet.

    Null cells and the usual textual spellings of "no" count as unset; a
    boolean or number is taken at face value; any other non-empty content is
    treated as set.

    NOTE(review): this assumes the columns hold either a boolean or the
    serialized quoted/retweeted tweet (null when absent) - confirm against
    the dataset. The previous ``== True`` test only matched a literal
    boolean, so a serialized payload was never filtered out.
    """
    if isinstance(value, str):
        return value.strip().lower() not in {"", "false", "0", "none", "nan", "null"}
    if pd.isna(value):
        return False
    return bool(value)


def clean_file(path, x_post_df, x_user_df):
    """Append the valid rows of one CSV file to the post and user frames.

    A row is kept only when none of the required columns is null, its
    ``lang`` is ``"fr"``, it is neither a quote nor a retweet, and its post
    id is not already known. The author of a kept post is added to the user
    frame unless that user id is already known. Rows that raise during
    conversion are reported and skipped.

    Args:
        path: Path of the CSV file to read.
        x_post_df: Existing posts frame; must contain a ``post_id`` column.
        x_user_df: Existing users frame; must contain a ``user_id`` column.

    Returns:
        A ``(x_post_df, x_user_df)`` tuple. Each element is the input frame
        itself when nothing was added, otherwise a new frame with the new
        rows appended.

    Raises:
        KeyError: If ``x_post_df`` lacks ``post_id`` or ``x_user_df`` lacks
            ``user_id``.
    """
    # The "user" cells are evaluated below and may contain
    # datetime.datetime(...) calls, so the name must be in scope for eval().
    import datetime  # noqa: F401

    required_columns = ["id", "lang", "text", "date", "likeCount", "replyCount", "retweetCount", "quoteCount", "user"]
    filter_columns = ["quotedTweet", "retweetedTweet"]
    file_name = os.path.basename(path)

    current_df = pd.read_csv(path)

    # Without these columns no row can pass the checks below, so report the
    # problem once instead of skipping (or failing on) every single row.
    missing = [col for col in required_columns + filter_columns if col not in current_df.columns]
    if missing:
        print(f"Skipping {file_name}: missing column(s) {', '.join(missing)}")
        return (x_post_df, x_user_df)

    # Sets give constant-time duplicate checks; new rows are buffered and
    # concatenated once at the end instead of rebuilding the frames per row.
    seen_post_ids = set(x_post_df["post_id"].tolist())
    seen_user_ids = set(x_user_df["user_id"].tolist())
    new_posts = []
    new_users = []

    for i, row in tqdm(current_df.iterrows(), total=len(current_df), desc=f"Processing {file_name}"):
        try:
            # Keep only complete rows, in French, that are original tweets.
            if any(pd.isna(row[col]) for col in required_columns):
                continue
            if row["lang"] != "fr":  # Change this to select another language
                continue
            if _flag_is_set(row["quotedTweet"]) or _flag_is_set(row["retweetedTweet"]):
                continue

            ######### USER PART #########
            # SECURITY: eval() executes whatever code the cell contains. It is
            # kept because the cells embed datetime.datetime(...) calls that
            # ast.literal_eval cannot parse; only run this on trusted files.
            user_dict = eval(row["user"])
            user_id = int(user_dict["id"])
            user_name = user_dict["username"]
            user_bio = user_dict["rawDescription"]
            user_followers_count = int(user_dict["followersCount"])
            user_follows_count = int(user_dict["friendsCount"])

            ######### POST PART #########
            post_id = int(row["id"])
            lang = str(row["lang"])
            text = str(row["text"])
            date = str(row["date"])
            like_count = int(row["likeCount"])
            reply_count = int(row["replyCount"])
            retweet_count = int(row["retweetCount"])
            quote_count = int(row["quoteCount"])

            ######### APPEND PART #########
            # A post that is already stored is skipped together with its user.
            if post_id in seen_post_ids:
                continue
            new_posts.append({
                "post_id": post_id,
                "user_id": user_id,
                "lang": lang,
                "text": text,
                "date": pd.to_datetime(date),
                "like_count": like_count,
                "reply_count": reply_count,
                "retweet_count": retweet_count,
                "quote_count": quote_count,
            })
            seen_post_ids.add(post_id)

            # Store the author only the first time the user id is seen.
            if user_id not in seen_user_ids:
                new_users.append({
                    "user_id": user_id,
                    "name": user_name,
                    "bio": user_bio,
                    "followers_count": user_followers_count,
                    "follows_count": user_follows_count,
                })
                seen_user_ids.add(user_id)

        except Exception as e:
            # Best effort: one malformed row must not abort the whole file.
            print(f"Error processing row {i} from file {os.path.basename(path)}: {str(e)}")
            continue

    if new_posts:
        x_post_df = pd.concat([x_post_df, pd.DataFrame(new_posts)], ignore_index=True)
    if new_users:
        x_user_df = pd.concat([x_user_df, pd.DataFrame(new_users)], ignore_index=True)

    return (x_post_df, x_user_df)
        

Cellule à utiliser pour actualiser x_post.parquet et x_user.parquet

In [85]:
# Reload the frames saved by previous runs.
x_post_df = pd.read_parquet("x_post.parquet")
x_user_df = pd.read_parquet("x_user.parquet")

# Pass a part number or a list of part numbers here; the original note
# suggests [i for i in range(1, 23)] for the complete dataset.
for path in get_csv_list([1]):
    x_post_df, x_user_df = clean_file(path, x_post_df, x_user_df)

# Summarize each frame, then write it back: posts first, users second.
for label, frame in (("x_post", x_post_df), ("x_user", x_user_df)):
    print(f"[bold yellow]{label}_df describe:[/bold yellow]")
    print(frame.describe())
    print(f"[bold yellow]{label}_df head(5):[/bold yellow]")
    print(frame.head())
    frame.to_parquet(f"{label}.parquet", index=False)
    print(f"[bold yellow]Sucessfuly saved to {label}.parquet[/bold yellow]")

Processing may_july_chunk_16.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 49984/49984 [00:03<00:00, 14416.59it/s]
Processing may_july_chunk_13.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:03<00:00, 15666.04it/s]
Processing may_july_chunk_1.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:03<00:00, 15759.97it/s]
Processing may_july_chunk_17.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:03<00:00, 15074.20it/s]
Processing may_july_chunk_3.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:03<00:00, 14004.25it/s]
Processing may_july_chunk_20.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:04<00:00, 12439.86it/s]
Processing may_july_chunk_15.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:03<00:00, 16461.65it/s]
Processing may_july_chunk_7.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 49998/49998 [00:03<00:00, 16289.45it/s]
Processing may_july_chunk_5.csv: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:03<00:00, 15723.05it/s]
Proce