In [1]:
import pandas as pd
from pytz import timezone

In [8]:
# Raw CSV exports, one per electoral event
files = [
    "../data/raw/1er_debate.csv",
    "../data/raw/2do_debate.csv",
    "../data/raw/elecciones_generales.csv",
    "../data/raw/debate_balotaje.csv",
    "../data/raw/elecciones_balotaje.csv",
]

# Event label for each entry of `files`, in the same order
events = [
    "1er debate",
    "2do debate",
    "elecciones generales",
    "debate balotaje",
    "elecciones balotaje",
]

# zip() silently drops unmatched trailing entries, so fail loudly if the two
# parallel lists ever drift out of sync.
if len(files) != len(events):
    raise ValueError("files and events must have the same length")

# Read each CSV and tag its rows with the event it belongs to
dfs = [
    pd.read_csv(path).assign(event=event_name)
    for path, event_name in zip(files, events)
]

# Stack all events into one DataFrame. ignore_index=True builds a fresh
# 0..n-1 index; without it every file contributes its own 0..k-1 labels and
# the combined frame ends up with duplicate index values.
df = pd.concat(dfs, ignore_index=True)

# Report the number of rows (tweets) in the combined DataFrame
print(f"Database with {len(df)} tweets")

Database with 84648 tweets


In [9]:
# Replace every "T" in the raw date strings with a space, then parse the
# result as timezone-aware UTC timestamps. Values that cannot be parsed
# become NaT (errors="coerce") instead of raising.
df["date_cleansed"] = pd.to_datetime(
    df["date"].str.replace("T", " "),
    errors="coerce",
    utc=True,
)

# preview the parsed timestamps
df["date_cleansed"].head(3)

0   2023-10-02 22:34:22+00:00
1   2023-10-02 11:42:30+00:00
2   2023-10-02 11:46:58+00:00
Name: date_cleansed, dtype: datetime64[ns, UTC]

In [10]:
# pytz timezone object for Buenos Aires
buenos_aires_tz = timezone("America/Argentina/Buenos_Aires")

# express the UTC timestamps in Buenos Aires local time
df["date_local"] = df["date_cleansed"].dt.tz_convert(tz=buenos_aires_tz)

# preview the converted timestamps
df["date_local"].head(n=3)

0   2023-10-02 19:34:22-03:00
1   2023-10-02 08:42:30-03:00
2   2023-10-02 08:46:58-03:00
Name: date_local, dtype: datetime64[ns, America/Argentina/Buenos_Aires]

In [11]:
# Calendar day of each tweet in Buenos Aires local time, stored as a
# timezone-naive datetime column set to midnight.
# tz_localize(None) drops the timezone while keeping the local wall-clock
# time, and normalize() zeroes the time-of-day. This stays vectorized instead
# of building Python `date` objects with .dt.date and re-parsing them.
df["dt_date"] = df["date_local"].dt.tz_localize(None).dt.normalize()

# show the first 3 rows
df["dt_date"].head(3)

0   2023-10-02
1   2023-10-02
2   2023-10-02
Name: dt_date, dtype: datetime64[ns]

In [12]:
# Try to read every 'text' value as a number; anything non-numeric becomes NaN.
text_as_number = pd.to_numeric(df["text"], errors="coerce")

# Keep only the rows whose 'text' is NOT a number, i.e. drop rows where the
# text column holds a purely numeric value.
# NOTE(review): rows with a missing 'text' also coerce to NaN and are
# therefore kept by this mask; confirm that is intended.
filtered_df = df[text_as_number.isna()]

print(filtered_df.shape)

(84622, 22)


In [13]:
# Write the combined, filtered tweets to a single CSV, without the index column
output_path = "../data/raw/elecciones_argentina.csv"
filtered_df.to_csv(output_path, index=False)