In [None]:
!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json




In [None]:
# Download the thoughtvector/customer-support-on-twitter dataset archive with the Kaggle CLI.
!kaggle datasets download -d thoughtvector/customer-support-on-twitter


Downloading customer-support-on-twitter.zip to /content
 99% 167M/169M [00:02<00:00, 91.6MB/s]
100% 169M/169M [00:02<00:00, 67.8MB/s]


In [None]:
!unzip /content/customer-support-on-twitter.zip

Archive:  /content/customer-support-on-twitter.zip
  inflating: sample.csv              
  inflating: twcs/twcs.csv           


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import re
import spacy
from nltk.tokenize import word_tokenize


In [None]:
# Read the sample CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='/content/sample.csv')
df.head(n=5)

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


In [None]:
# Download the stop-word list and the other NLTK resources needed for English
# preprocessing. 'punkt_tab' is the tokenizer data that word_tokenize loads on
# recent NLTK releases; 'punkt' is kept for older ones. A resource unknown to
# the installed version is expected to be reported as an error by
# nltk.download rather than raised (NOTE(review): confirm on the target NLTK).
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)


                                                 text  \
0   @AppleSupport causing the reply to be disregar...   
1   @105835 Your business means a lot to us. Pleas...   
2   @76328 I really hope you all change but I'm su...   
3   @105836 LiveChat is online at the moment - htt...   
4   @VirginTrains see attached error message. I've...   
..                                                ...   
88  @105860 I wish Amazon had an option of where I...   
89  They reschedule my shit for tomorrow https://t...   
90  @105861 Hey Sara, sorry to hear of the issues ...   
91  @Tesco bit of both - finding the layout cumber...   
92  @105861 If that doesn't help please DM your fu...   

                                         tokenization  \
0   [@, AppleSupport, causing, the, reply, to, be,...   
1   [@, 105835, Your, business, means, a, lot, to,...   
2   [@, 76328, I, really, hope, you, all, change, ...   
3   [@, 105836, LiveChat, is, online, at, the, mom...   
4   [@, VirginTrains, see, att

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Create the NLP helpers used by the preprocessing function
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# English stop words, kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))


In [None]:
# Preprocessing function

# Translation table that deletes every ASCII punctuation character; built once
# so it is not recomputed for every row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Clean one raw text and return it as a single space-joined string.

    Steps, in order: strip ASCII punctuation, tokenize, drop all-digit tokens,
    drop English stop words (compared case-insensitively), stem, lemmatize.

    Relies on the notebook-level ``stop_words``, ``stemmer`` and ``lemmatizer``
    objects and on ``word_tokenize``.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            ``None``/``NaN`` that pandas uses for missing cells) is treated
            as empty.

    Returns:
        The processed tokens joined by single spaces; ``''`` when the input
        is not a string.
    """
    # Missing cells reach this function as None / float NaN, which have no
    # .translate(); return an empty result instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Punctuation removal.
    # NOTE: because this runs before tokenization, the separators inside URLs
    # and the leading "@" of handles are deleted too, so "https://t.co/x"
    # survives as the single token "httpstcox".
    text = text.translate(_PUNCT_TABLE)

    # Tokenization
    tokens = word_tokenize(text)

    # Drop all-digit tokens and stop words in a single pass
    kept_tokens = [
        word for word in tokens
        if not word.isdigit() and word.lower() not in stop_words
    ]

    # Stem each token, then lemmatize the resulting stem
    processed_tokens = [
        lemmatizer.lemmatize(stemmer.stem(word)) for word in kept_tokens
    ]

    return ' '.join(processed_tokens)

In [None]:
# Run the preprocessing on every entry of 'text' and keep the output in a new 'clean_text' column
df['clean_text'] = df['text'].map(preprocess_text)

In [None]:
# Add a column holding the raw token list of each text
df['tokenization'] = df['text'].map(word_tokenize)

In [None]:
# Show the original text next to its tokens and its cleaned version.
# A bare last expression lets the notebook render the frame as a table instead
# of plain text; head() keeps the preview short.
df[['text', 'tokenization', 'clean_text']].head(10)

                                                 text  \
0   @AppleSupport causing the reply to be disregar...   
1   @105835 Your business means a lot to us. Pleas...   
2   @76328 I really hope you all change but I'm su...   
3   @105836 LiveChat is online at the moment - htt...   
4   @VirginTrains see attached error message. I've...   
..                                                ...   
88  @105860 I wish Amazon had an option of where I...   
89  They reschedule my shit for tomorrow https://t...   
90  @105861 Hey Sara, sorry to hear of the issues ...   
91  @Tesco bit of both - finding the layout cumber...   
92  @105861 If that doesn't help please DM your fu...   

                                         tokenization  \
0   [@, AppleSupport, causing, the, reply, to, be,...   
1   [@, 105835, Your, business, means, a, lot, to,...   
2   [@, 76328, I, really, hope, you, all, change, ...   
3   [@, 105836, LiveChat, is, online, at, the, mom...   
4   [@, VirginTrains, see, att

# L'objectif de ce TP consiste à nettoyer et préparer des données textuelles pour l'analyse en effectuant les étapes suivantes :

1. Élimination de la ponctuation.
2. Tokenization (découpage du texte en mots).
3. Suppression des chiffres.
4. Suppression des mots vides.
5. Stemming (réduction des mots à leur forme racine).
6. Lemmatisation (conversion des mots en leur forme canonique).

Le résultat de la tokenization et le texte nettoyé final sont stockés dans le DataFrame (colonnes `tokenization` et `clean_text`) pour une analyse ultérieure, ce qui facilite la comparaison du texte original avec le texte nettoyé.

In [None]:
# Mount Google Drive at /content/drive using the Colab-specific google.colab module.
from google.colab import drive
drive.mount('/content/drive')