In [None]:
#facilite le téléchargement de jeux de données open source directement à partir de sources en ligne
!pip install opendatasets
# Elle fournit des structures de données flexibles et performantes, notamment des DataFrames
!pip install pandas

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
# Create the hidden ".kaggle" directory under "/root/" (if it does not already exist),
# copy the "kaggle.json" file into it, then restrict that file's permissions to
# owner read/write only (mode 600).
# NOTE(review): kaggle.json is presumably the Kaggle API token — keep it out of version control.
!mkdir -p /root/.kaggle
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json


In [None]:
# Download the "thoughtvector/customer-support-on-twitter" dataset with the Kaggle CLI.
!kaggle datasets download -d thoughtvector/customer-support-on-twitter

Downloading customer-support-on-twitter.zip to /content
 94% 159M/169M [00:01<00:00, 159MB/s]
100% 169M/169M [00:01<00:00, 130MB/s]


In [None]:
!unzip /content/customer-support-on-twitter.zip

Archive:  /content/customer-support-on-twitter.zip
  inflating: sample.csv              
  inflating: twcs/twcs.csv           


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


**Import dataset**

In [None]:
# Read the sample CSV file into a DataFrame, then display it as the cell output.
df = pd.read_csv(filepath_or_buffer='/content/sample.csv')
df

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243,119240.0
...,...,...,...,...,...,...,...
88,119330,105859,True,Wed Oct 11 13:50:42 +0000 2017,@105860 I wish Amazon had an option of where I...,119329,119331.0
89,119331,105860,True,Wed Oct 11 13:47:14 +0000 2017,They reschedule my shit for tomorrow https://t...,119330,
90,119332,Tesco,False,Wed Oct 11 13:34:06 +0000 2017,"@105861 Hey Sara, sorry to hear of the issues ...",119333,119334.0
91,119333,105861,True,Wed Oct 11 14:05:18 +0000 2017,@Tesco bit of both - finding the layout cumber...,119335119336,119332.0


In [None]:
# Keep only the tweet bodies: select the 'text' column as a Series.
tweets = df.loc[:, 'text']
# Show the first rows of the selected column.
tweets.head()

0    @AppleSupport causing the reply to be disregar...
1    @105835 Your business means a lot to us. Pleas...
2    @76328 I really hope you all change but I'm su...
3    @105836 LiveChat is online at the moment - htt...
4    @VirginTrains see attached error message. I've...
Name: text, dtype: object

In [None]:
import re
import nltk
import spacy
import string

In [None]:
# Lower-case every tweet.
tweets = tweets.str.lower()

# Remove punctuation: delete every character that is neither a word character (\w)
# nor whitespace (\s).
# The vectorised .str accessor is used instead of apply(re.sub) so that missing
# values (NaN) pass through unchanged rather than raising a TypeError.
tweets = tweets.str.replace(r'[^\w\s]', '', regex=True)


In [None]:
# Preview the first rows of `tweets` in its current state.
tweets.head()

0    applesupport causing the reply to be disregard...
1    105835 your business means a lot to us please ...
2    76328 i really hope you all change but im sure...
3    105836 livechat is online at the moment  https...
4    virgintrains see attached error message ive tr...
Name: text, dtype: object

In [None]:
import nltk
nltk.download('stopwords') # Download NLTK's stop-word list.
nltk.download('punkt') # Download the punkt tokenizer, used to split text into words (tokenization).
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize # Import NLTK's word_tokenize function, used to split text into words.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# stopwords.words('english') returns NLTK's English stop-word list; it is converted
# to a set for fast membership tests.
s_words = set(stopwords.words('english'))
# Punctuation has already been stripped from the tweets, so contractions appear as
# "dont", "wont", ... and would never match the raw NLTK entries ("don't", "won't").
# Add a punctuation-free variant of every stop word so those forms are removed too.
s_words |= {re.sub(r'[^\w\s]', '', word) for word in s_words}
# For each tweet: split on whitespace, keep only the words that are not stop words,
# and join the remaining words back into a single space-separated string.
tweets = tweets.apply(lambda text: " ".join([word for word in str(text).split() if word not in s_words]))

In [None]:
# Preview the first rows of `tweets` in its current state.
tweets.head()

0    applesupport causing reply disregarded tapped ...
1    105835 business means lot us please dm name zi...
2           76328 really hope change im sure wont dont
3    105836 livechat online moment httpstcosy94vtu8...
4    virgintrains see attached error message ive tr...
Name: text, dtype: object

In [None]:
# Tokenization: turn each tweet string into a list of tokens with NLTK's word_tokenize.
tweets = tweets.apply(word_tokenize)

In [None]:
# Preview the first rows of `tweets` in its current state.
tweets.head()

0    [applesupport, causing, reply, disregarded, ta...
1    [105835, business, means, lot, us, please, dm,...
2    [76328, really, hope, change, im, sure, wont, ...
3    [105836, livechat, online, moment, httpstcosy9...
4    [virgintrains, see, attached, error, message, ...
Name: text, dtype: object

In [None]:
# Stemming: reduce every token to its Porter stem.
from nltk.stem import PorterStemmer
word_stemmer = PorterStemmer()


def _stem_and_join(tokens):
    """Stem each item of ``tokens`` and return the stems joined by single spaces."""
    stems = map(word_stemmer.stem, tokens)
    return " ".join(stems)


# Each element of `tweets` is iterated item by item, stemmed, and rebuilt as one string.
tweets = tweets.apply(_stem_and_join)


In [None]:
# Preview the first rows of `tweets` in its current state.
tweets.head()

0    applesupport caus repli disregard tap notif ke...
1    105835 busi mean lot us pleas dm name zip code...
2            76328 realli hope chang im sure wont dont
3    105836 livechat onlin moment httpstcosy94vtu8k...
4    virgintrain see attach error messag ive tri le...
Name: text, dtype: object

In [None]:
from io import TextIOBase  # NOTE(review): not used in the visible cells — confirm before removing
# Lemmatization
import nltk
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# WordNetLemmatizer.lemmatize() takes a single word. Passing the whole tweet string
# (as was done before) looks the entire sentence up as one "word" and returns it
# unchanged, so each tweet is split on whitespace and lemmatized word by word,
# then re-joined with single spaces.
# NOTE(review): the input here has already been Porter-stemmed, so many tokens are
# no longer dictionary words; consider lemmatizing instead of (or before) stemming.
tweets = tweets.apply(lambda text: " ".join(lemmatizer.lemmatize(word) for word in text.split()))

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
tweets.head()

0    applesupport caus repli disregard tap notif ke...
1    105835 busi mean lot us pleas dm name zip code...
2            76328 realli hope chang im sure wont dont
3    105836 livechat onlin moment httpstcosy94vtu8k...
4    virgintrain see attach error messag ive tri le...
Name: text, dtype: object