This notebook converts our annotations into train and test datasets. Its last section also builds the subset of tweets that mention the `vaccin` keyword.

In [None]:
import pandas as pd
import numpy as np
import re
import string

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

import torch
# Select the first GPU only when CUDA is actually usable, so that the import
# cell does not raise on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.set_device('cuda:0')

In [None]:
# Load the pipe-separated tweet dataset; column 0 is read as int and the
# "timestamp" column is parsed as datetime.
df_new = pd.read_csv(
    "sentiment_dataset/dataset_vaccin.csv",
    sep="|",
    dtype={0: int},
    parse_dates=["timestamp"],
)

# Add one empty (all-None) placeholder column per sentiment label.
for label_column in ("positif", "negatif", "neutre"):
    df_new[label_column] = [None] * len(df_new)

In [None]:
# Remove the placeholder label columns. `errors="ignore"` keeps this cell
# idempotent: running it a second time (columns already gone) no longer
# raises KeyError.
df_new = df_new.drop(columns=["positif", "negatif", "neutre"], errors="ignore")

# Labels

In [None]:
# Load the manual annotations (semicolon-separated, latin-1 encoded).
df_cl = pd.read_csv("sentiment_dataset/classification.csv", sep=";", engine="python", encoding='latin-1')

# Discard the first "Unnamed: 0" column and promote "Unnamed: 0.1" to that
# name, since "Unnamed: 0" is the key used for the merge with the tweets.
df_cl = (
    df_cl
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"Unnamed: 0.1": "Unnamed: 0"})
)

# Collapse the three one-hot label columns into a single class id:
# positif -> 0, negatif -> 1, neutre -> 2. Rows without any flag stay None.
df_cl["target"] = [None] * len(df_cl)

# Assign through a single `.loc[rows, column]` call on the frame itself.
# The previous chained form (`df_cl["target"].loc[mask] = value`) writes to an
# intermediate Series, which triggers SettingWithCopyWarning and leaves the
# frame unchanged when pandas copy-on-write is active.
# The order is preserved so that a later flag still overrides an earlier one.
for column, class_id in (("positif", 0), ("negatif", 1), ("neutre", 2)):
    df_cl.loc[df_cl[column] == 1.0, "target"] = class_id

# Merge

In [None]:
# Attach the annotated class id to every tweet. The left join keeps all
# tweets; the ones without an annotation end up with a missing "target".
annotated_targets = df_cl[["Unnamed: 0", "target"]]
df = pd.merge(df_new, annotated_targets, how="left", on="Unnamed: 0")

In [None]:
# Use the row identifier as index. The guard makes the cell safe to re-run:
# once the column has become the index it is no longer in `df.columns`.
if "Unnamed: 0" in df.columns:
    df = df.set_index("Unnamed: 0", drop=True)

# Count each class by direct comparison instead of indexing `value_counts()`,
# which raises KeyError as soon as one class (or the unlabeled group) is
# absent from the data.
target = df["target"]
print(
    "Positifs: {}\nNegatifs: {}\nNeutres:  {}\nAutres:   {}".format(
        int((target == 0).sum()),
        int((target == 1).sum()),
        int((target == 2).sum()),
        int(target.isna().sum()),
    )
)

## Target processing

In [None]:
# Boolean mask of the rows that have no annotation.
unlabeled = df["target"].isna()

# Labeled rows form the train set, unlabeled rows the test set. Explicit
# copies are taken so the assignment below modifies an independent frame
# rather than a slice of `df` (which raises SettingWithCopyWarning).
df_train = df.loc[~unlabeled].copy()
df_test = df.loc[unlabeled].copy()

# All remaining targets are defined, so they can be stored as plain integers.
df_train["target"] = df_train["target"].astype(int)

In [None]:
# Write the labeled rows with every column, target included.
header_train = df_train.columns.tolist()
df_train.to_csv("sentiment_dataset/train_new.csv", sep=";", header=header_train)

# The test file has the same columns minus the label.
header_train.remove("target")
test_without_target = df_test.drop(columns=["target"])
test_without_target.to_csv("sentiment_dataset/test_new.csv", sep=";", header=header_train)

## Dataset focused on the `vaccin` keyword

In [None]:
import pandas as pd

In [None]:
# The raw tweet dump is read with explicit column names (passed via `names`).
tweet_columns = ["id", "timestamp", "id_2", "nb_1", "nb_2", "lang", "text"]
origin = pd.read_csv("dataset_tweet.csv", sep="|", names=tweet_columns)

In [None]:
# Find the positional indices of all tweets whose text contains the keyword.
key = 'vaccin'

# Vectorised replacement for the row-by-row `re.findall` loop. Values are
# cast to str first (as the loop did with `str(...)`), and `na=False` makes
# sure a missing text can never count as a match.
contains_key = origin['text'].astype(str).str.contains(key, regex=True, na=False)

# Positions (not index labels) of the matching rows, for use with `.iloc`.
idx_vaccin_origin = contains_key.to_numpy(dtype=bool).nonzero()[0].tolist()
found = len(idx_vaccin_origin)
print(found)

In [None]:
# Keep only the rows at the matching positions, with all columns.
origin_vaccin = origin.iloc[idx_vaccin_origin, :]

In [None]:
# Save the keyword subset as a pipe-separated file. The index is written too
# (pandas default), as an extra leading column.
# NOTE(review): the labeling section reads "sentiment_dataset/dataset_vaccin.csv",
# while this writes under "data/03_primary/" - confirm which path is intended.
vaccin_header = ['id', 'timestamp', 'id_2', 'nb_1', 'nb_2', 'lang', 'text']
origin_vaccin.to_csv("data/03_primary/dataset_vaccin.csv", header=vaccin_header, sep="|")