In [None]:
import os

# numexpr sizes its thread pool from NUMEXPR_MAX_THREADS when it is first
# imported, and pandas imports numexpr when it is installed. The variable is
# therefore set before `import pandas` so that it can actually take effect.
os.environ['NUMEXPR_MAX_THREADS'] = '45'

import pandas as pd
import numpy as np
import gensim
from tqdm import tqdm
import re
import emoji
import swifter
import pickle

____

# Importations

Concatenate the original tweets into a single dataframe, then apply some light preprocessing.

In [None]:
header = ["id","timestamp","id_2", "nb_1", "nb_2", "lang", "text"]
data_dir = "../pancealabCleanFR/"

# Read every file first and concatenate once at the end: calling pd.concat
# inside the loop copies all previously loaded rows again at each iteration.
frames = []
for filename in tqdm(os.listdir(data_dir)):
    # Plain text mode already applies universal-newline translation; the
    # explicit 'U' flag is rejected by Python 3.11+. The `with` block makes
    # sure each file handle is closed once it has been parsed.
    with open(data_dir + filename, 'r') as tweets_file:
        frames.append(
            pd.read_csv(tweets_file, sep='\t', names=header, parse_dates=["timestamp"])
        )

# pd.concat refuses an empty list, so fall back to an empty frame that still
# carries the expected columns when the directory holds no file.
df = pd.concat(frames) if frames else pd.DataFrame(columns=header)

In [None]:
# Rebuild a contiguous index, then discard every row that has a missing value.
df = df.reset_index(drop=True)
df = df.dropna()

# Flag the tweets whose text contains a tab character, and count them.
df['has_t'] = df['text'].swifter.apply(lambda x: "\t" in str(x))
nb_has_t = int((df['has_t'] == True).sum())

In [None]:
# Some tweets contain tabs that interfere with the csv separator. This cell corrects that.

# Index labels of the flagged rows, computed once. The loop never modifies
# `has_t`, so filtering the whole frame again on every iteration is wasted work.
# NOTE: relies on the index being unique (it is reset before the dropna above).
tab_rows = df.index[df['has_t'] == True]

to_add = []
for row_label in tqdm(tab_rows):
    content = df.at[row_label, 'text'].split("\n")

    # The first line is kept as the text of the current row.
    df.at[row_label, 'text'] = content[0]

    # Every following line is treated as a separate tab-separated record;
    # the trailing None fills the last column of `df` (`has_t`).
    for l in content[1:]:
        to_add.append(l.split("\t") + [None])

df = pd.concat([df, pd.DataFrame(to_add, columns=df.columns.values)])
print(f"{len(to_add)} tweets rajoutés")

In [None]:
df = df.reset_index(drop=True)

# "|" is the separator used when the dataset is written to CSV, so it is
# replaced inside the tweets.
df["text"] = df["text"].swifter.apply(lambda x: x.replace("|", " "))

# Snapshot of the text before the normalisation steps below.
df["original_text"] = df["text"]
df["timestamp"] = df["timestamp"].swifter.apply(pd.to_datetime)

df['has_t'] = df['text'].swifter.apply(lambda x: "\t" in x)


def _normalise_text(x):
    """Flatten newlines and decode the HTML-escaped apostrophe and ampersand."""
    x = str(x).replace("\n", " ")
    x = x.replace("&amp;#39;", "'")
    # The optional ";" consumes the whole "&amp;" entity; matching only "&amp"
    # used to leave a stray semicolon behind ("&;").
    return re.sub(r"&amp;?", "&", x)


# Single pass over the column instead of one pass per substitution.
df['text'] = df['text'].swifter.apply(_normalise_text)

In [None]:
# Keep only the rows that have a timestamp, then cast every id to a Python int.
df = df.dropna(subset=['timestamp'])
df["id"] = df["id"].apply(int)

In [None]:
# Remove every http(s) link from the tweets.
_url_pattern = re.compile(r'https?://\S+')
df['text'] = df['text'].swifter.apply(lambda x: _url_pattern.sub('', str(x)))

# Spelling Correction

To remove some of the noise in the tweets, the most common spelling errors are manually corrected. The pickle objects contain dictionaries mapping each error to its correction.

In [None]:
def transf_modifs(text):
    """Clean a tweet and replace known misspellings using the `modifs` mapping.

    The input is converted to `str`, "<url>" placeholders are removed,
    non-breaking spaces become regular spaces, and every character that is
    not a word character, whitespace or "@" is replaced by a space. The text
    is then lowercased and split on single spaces; each non-empty token found
    in the module-level `modifs` mapping is replaced by its mapped value.

    Parameters
    ----------
    text : object
        Value to clean; it is passed through `str()` first.

    Returns
    -------
    str
        The tokens joined by single spaces.

    Raises
    ------
    NameError
        If `modifs` has not been loaded yet.
    """
    text = str(text)
    text = text.replace("<url>", "").replace("\xa0", " ")
    text = re.sub(r'[^\w\s@]', ' ', text)

    new_text = []
    for word in text.lower().split(" "):
        if word == '':
            continue
        try:
            new_text.append(modifs[word])
        except KeyError:
            # Only a missing entry means "keep the word as is". Any other
            # error (e.g. `modifs` not loaded) must surface instead of
            # silently disabling the corrections.
            new_text.append(word)
    return " ".join(new_text)

def transf_modifs_vaccins(text):
    """Replace tokens of a tweet using the `modifs_vaccins` mapping.

    The input is converted to `str`, lowercased and split on single spaces;
    each non-empty token found in the module-level `modifs_vaccins` mapping
    is replaced by its mapped value, the others are kept unchanged.

    Parameters
    ----------
    text : object
        Value to process; it is passed through `str()` first.

    Returns
    -------
    str
        The tokens joined by single spaces.

    Raises
    ------
    NameError
        If `modifs_vaccins` has not been loaded yet.
    """
    new_text = []
    for word in str(text).lower().split(" "):
        if word == '':
            continue
        try:
            new_text.append(modifs_vaccins[word])
        except KeyError:
            # Only a missing entry means "keep the word as is"; other errors
            # must not be hidden.
            new_text.append(word)
    return " ".join(new_text)

In [None]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
# The `with` blocks close the file handles once the objects are loaded.
with open("modifs", 'rb') as modifs_file:
    modifs = pickle.load(modifs_file)

with open("modifs_vaccins", 'rb') as modifs_vaccins_file:
    modifs_vaccins = pickle.load(modifs_vaccins_file)

In [None]:
# Run the general corrections first, then the vaccine-specific ones.
for _correct in (transf_modifs, transf_modifs_vaccins):
    df["text"] = df["text"].swifter.apply(lambda x, _fn=_correct: _fn(x))

# Save

In [None]:
# Write the dataset without the helper `has_t` column, using "|" as separator.
export_header = header + ['original_text']
export_df = df.drop(columns=["has_t"])
export_df.to_csv('dataset_tweet.csv', sep="|", header=export_header)