In [44]:
import pandas as pd
import re
import emoji
import unidecode
from nltk.stem.snowball import SnowballStemmer    

In [45]:
# Normalised stop-word sets, keyed by (stop words, punctuation), so the list is
# only transliterated once instead of once per tweet.
_STOP_WORD_CACHE = {}


def _strip_elisions_and_punctuation(text, punctuations):
    """Drop word-initial elisions such as l' or d', then every character found in punctuations."""
    # \b limits the match to the start of a word, so "aujourd'hui" keeps its "d".
    text = re.sub(r"\b[cdjlmsty]['’]", '', text)
    return ''.join(char for char in text if char not in punctuations)


def _normalized_stop_words(stop_words, punctuations):
    """Return stop_words as a set, put through the same normalisation as the tweet text."""
    key = (tuple(stop_words), tuple(punctuations))
    if key not in _STOP_WORD_CACHE:
        folded = unidecode.unidecode(' '.join(key[0]).lower())
        _STOP_WORD_CACHE[key] = frozenset(
            _strip_elisions_and_punctuation(folded, punctuations).split()
        )
    return _STOP_WORD_CACHE[key]


def clean_text(string, punctuations, stop_words):
    """Turn a raw French tweet into a space-separated string of stemmed tokens.

    Steps, in order: remove HTML tags, URLs and bare ``.fr`` domains; lowercase;
    transliterate to ASCII with ``unidecode``; drop word-initial elisions
    (c', d', j', l', m', s', t', y') and punctuation; discard stop words; stem
    the remaining words with the French Snowball stemmer.

    Args:
        string: Raw text. Any value that is not a ``str`` (for example the NaN
            pandas uses for an empty cell) is treated as empty text.
        punctuations: Container of single characters to delete from the text.
        stop_words: Iterable of words to discard. Each entry is lowercased,
            transliterated and stripped of elisions/punctuation before the
            comparison, so accented or apostrophised entries still match.

    Returns:
        The cleaned text: stemmed tokens joined by single spaces, with no
        leading or trailing whitespace. Empty when nothing survives.
    """
    if not isinstance(string, str):
        return ''

    # Tags go first: a URL inside an attribute would otherwise swallow the
    # closing ">" and leave a dangling "<a href=" behind.
    string = re.sub(r'<.*?>', '', string)

    # Full URLs before bare domains, so "www.site.fr" is not reduced to "www.".
    string = re.sub(r'https?://\S+|www\.\S+', '', string, flags=re.IGNORECASE)

    # The trailing \b keeps "bien.franchement" (missing space) intact.
    string = re.sub(r'\w*\.fr\b', '', string, flags=re.IGNORECASE)

    # Lowercase, then fold accents and typographic characters to ASCII.
    string = unidecode.unidecode(string.lower())

    # Elisions and punctuation must go before filtering and stemming, otherwise
    # tokens such as "le," or "l'homme" are neither filtered nor stemmed cleanly.
    string = _strip_elisions_and_punctuation(string, punctuations)

    ignored = _normalized_stop_words(stop_words, punctuations)
    french_stemmer = SnowballStemmer("french")

    # split()/join also collapses the gaps left behind by the removals above.
    return ' '.join(
        french_stemmer.stem(word) for word in string.split() if word not in ignored
    )

# Elided articles/pronouns, each with a straight (') and a typographic (’) apostrophe.
apostrophes = [
    "l'", "l’", "j'", "j’", "d'", "d’", "s'", "s’",
    "c'", "c’", "t'", "t’", "m'", "m’", "y'", "y’",
]

# Every emoji character known to the `emoji` package (the keys of its lookup table).
try:
    list_emojies = list(emoji.UNICODE_EMOJI['fr'])
except AttributeError:
    # NOTE(review): newer emoji releases appear to have dropped UNICODE_EMOJI in
    # favour of EMOJI_DATA, whose keys are also emoji characters - confirm
    # against the installed version.
    list_emojies = list(emoji.EMOJI_DATA)

# Characters deleted from the text by clean_text.
punctuations = r'''!()-[]{};:'"\,<>./?$%^&*_~’#@“”‘’«»'''

# Entries are written without accents because clean_text transliterates the
# text with unidecode before comparing it to this list.
_base_stop_words = [
    'un', 'une', 'la', 'le', 'les', 'des', 'et', "au", "a", "son", "sa", "ses", "ce",
    "te", "se", "que", "quel", "quelle", "en", "je", "tu", "ici", "jusqu'a", "etais", "etait",
    "etaient", "etions", "est", "mon", "ma", "mes", "ton", "ta", "tes", "d'",
]

_extra_stop_words = [
    "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "en", "et", "eux", "je", "la", "le", "leur", "lui",
    "ma", "mais", "me", "meme", "mes", "moi", "mon", "nos", "notre", "nous", "on", "ou", "par", "pour", "qu",
    "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre",
    "vous", "etee", "etees", "etant", "suis", "es", "sommes", "sont", "serai", "seras", "sera", "serons", "serez",
    "seront", "serais", "serait", "serions", "seriez", "seraient", "etais", "etait", "etions", "etiez", "etaient", "fus",
    "fut", "fumes", "futes", "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "fussions", "fussiez",
    "fussent", "ayant", "eu", "eue", "eues", "eus", "ai", "avons", "avez", "ont", "aurai", "aurons", "aurez", "auront",
    "aurais", "aurait", "aurions", "auriez", "auraient", "avais", "avait", "aviez", "avaient", "eut", "eumes", "eutes",
    "eurent", "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "eut", "eussions", "eussiez", "eussent",
    "ceci", "cela", "cet", "cette", "ici", "les", "leurs", "quel", "quels", "quelle", "quelles", "sans", "soi",
]

# dict.fromkeys removes duplicates while keeping first-seen order; the result stays a list.
stop_words = list(dict.fromkeys(
    _base_stop_words + list_emojies + apostrophes + _extra_stop_words
))

In [46]:
# Load the raw dataset, then name its two columns.
# NOTE(review): read_csv consumes the first row as a header before the rename
# below - confirm the file really starts with a header line.
column_names = ['tweet', 'class']
data = pd.read_csv(filepath_or_buffer="my_csv.csv", sep=',')
data.columns = column_names

In [47]:
# Clean every tweet and keep its class label alongside it.
# Iterating over the columns directly (instead of data['tweet'][i] with
# i in range(len(data))) avoids label lookups that raise KeyError whenever the
# frame's index is not exactly 0..n-1, e.g. after filtering rows.
list_tweets = [
    [clean_text(tweet, punctuations, stop_words), label]
    for tweet, label in zip(data['tweet'], data['class'])
]

# Spot check: compare data['tweet'].iloc[0] (raw) with list_tweets[0][0] (cleaned).

In [48]:
# Assemble the cleaned [tweet, class] pairs into a two-column frame.
dataFrame = pd.DataFrame(data=list_tweets, columns=['tweet', 'class'])

In [49]:
# Write the cleaned data to a new CSV, without the index column or a header row.
dataFrame.to_csv("my_csv_clean.csv", header=False, index=False)