Objectifs du notebook :

- Nettoyer les données pour obtenir un seul fichier contenant l'échantillon qui sera utilisé.
- Vérifier la présence de valeurs manquantes ou erronées.
- S'assurer de réunir uniquement les données nécessaires pour la base de données.
- Vérifier le typage des données correspondant à la base de données prévue.

[Lien DrawDB](https://drawdb.vercel.app/editor?shareId=e6c18b8ae53063fa1dfa9cc8a849605f)

In [56]:
import matplotlib.pyplot as plt
from rich import print_json, print
from tqdm import tqdm
import pandas as pd
import glob
import os

Récupération de la liste des fichiers du dataset disponible

In [57]:
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# makes the list (and any slice taken from it later) reproducible across runs
# and machines.
csv_files = sorted(glob.glob("./data/part_*/*.csv"))
# NOTE(review): `dataframes` is not referenced by the cells visible here;
# confirm it is still needed.
dataframes = []
print(csv_files)

Création du DataFrame final

In [58]:
# Empty, typed frame describing the target layout: one (column, dtype) pair per
# output column, in the order the columns should appear.
x_post_df = pd.DataFrame({
    column: pd.Series(dtype=dtype)
    for column, dtype in (
        ("user_id", "int"),
        ("lang", "str"),
        ("text", "str"),
        ("date", "datetime64[ns]"),
        ("like_count", "int"),
        ("reply_count", "int"),
        ("retweet_count", "int"),
        ("quote_count", "int"),
    )
})

# Show the resulting dtypes so they can be compared with the planned schema.
print(x_post_df.dtypes)

In [None]:
def clean_file(path, x_post_df):
    """Append the usable French tweets found in one CSV file to ``x_post_df``.

    A row is kept only when every required column holds a non-null value,
    ``lang`` equals ``"fr"``, and neither ``quotedTweet`` nor
    ``retweetedTweet`` compares equal to ``True``.

    Args:
        path: Path of the CSV file, read with ``pd.read_csv``.
        x_post_df: Frame the kept rows are appended to. It is not modified
            in place.

    Returns:
        ``x_post_df`` followed by the kept rows, re-indexed from 0. When no
        row is kept (including when the file lacks one of the expected
        columns), ``x_post_df`` itself is returned.
    """
    # Loop-invariant values, computed once instead of once per row.
    required_columns = ["id", "lang", "text", "date", "likeCount", "replyCount", "retweetCount", "quoteCount"]
    flag_columns = ["quotedTweet", "retweetedTweet"]
    file_name = os.path.basename(path)

    current_df = pd.read_csv(path)

    # Without these columns no row can pass the checks below, so report the
    # problem once rather than once per row and leave the frame untouched.
    missing_columns = [col for col in required_columns + flag_columns if col not in current_df.columns]
    if missing_columns:
        print(f"Skipping file {file_name}: missing columns {', '.join(missing_columns)}")
        return x_post_df

    # Kept rows are collected here and concatenated once at the end; calling
    # pd.concat for every row copies the whole frame each time (quadratic).
    records = []
    for i, row in tqdm(current_df.iterrows(), total=len(current_df), desc=f"Processing {file_name}"):
        try:
            # Data checks. We want:
            # - rows without null values in the required columns
            # - tweets written in French
            # - no retweets and no quote tweets
            if any(pd.isna(row[col]) for col in required_columns):
                continue
            if row["lang"] != "fr":
                continue
            if row["quotedTweet"] == True:
                continue
            if row["retweetedTweet"] == True:
                continue

            records.append({
                # NOTE(review): the source column is `id`; confirm it really
                # identifies the user and not the tweet.
                "user_id": int(row["id"]),
                "lang": row["lang"],
                "text": row["text"],
                "date": pd.to_datetime(row["date"]),
                "like_count": row["likeCount"],
                "reply_count": row["replyCount"],
                "retweet_count": row["retweetCount"],
                "quote_count": row["quoteCount"],
            })

        except Exception as e:
            # Best effort: a row that cannot be converted is reported and skipped.
            print(f"Error processing row {i} from file {os.path.basename(path)}: {str(e)}")
            continue

    if not records:
        return x_post_df

    return pd.concat([x_post_df, pd.DataFrame(records)], ignore_index=True)
        

In [60]:
# Start from an empty frame with the same columns so that re-running this cell
# does not append the same file(s) a second time.
x_post_df = x_post_df.iloc[0:0]

# csv_files[:1] limits the run to the first listed file; widen the slice to
# process more files.
for path in csv_files[:1]:
    print(f"Processing file: {path}")
    x_post_df = clean_file(path, x_post_df)

# Quick sanity check of the result: summary statistics, then the first rows.
print("[bold]x_post_df describe:[/bold]")
print(x_post_df.describe())
print("[bold]x_post_df head(5):[/bold]")
print(x_post_df.head())

  current_df = pd.read_csv(path)
Processing may_july_chunk_172.csv: 100%|██████████| 50000/50000 [00:02<00:00, 18921.48it/s]


In [61]:
# Write the cleaned frame to "x_post.parquet" (path relative to the working
# directory); index=False leaves the row index out of the file.
x_post_df.to_parquet("x_post.parquet", index=False)