# Text Pre-processing Level 1 (TPL 1) - Data Cleaning

---

# Import dependencies

In [1]:
# nltk dependencies
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# To pre-process text
import re

import pandas as pd

# To download the tokenizer
# nltk.download('punkt')

# Download the lemmatizer
# nltk.download('wordnet')

# To download the corpus of stopwords
# nltk.download('stopwords')

# Import Tweets

In [2]:
df = pd.read_csv('tweets.csv')
df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0.1,Unnamed: 0,context_annotations,created_at,edit_history_tweet_ids,id,text
0,0,"[{'domain': {'id': '10', 'name': 'Person', 'de...",2022-11-06 17:30:11+00:00,['1589309180527116289'],1589309180527116289,At the heart of every family (if you’re fortun...
1,1,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",2022-11-03 16:00:24+00:00,['1588199421279354881'],1588199421279354881,He was a groundbreaking filmmaker. As well as ...
2,2,"[{'domain': {'id': '29', 'name': 'Events [Enti...",2022-10-31 19:37:31+00:00,['1587166898281623552'],1587166898281623552,RT @MarkRuffalo: Parabéns aos heróis da democr...
3,3,"[{'domain': {'id': '46', 'name': 'Business Tax...",2022-10-28 21:04:25+00:00,['1586101602657648650'],1586101602657648650,"Excelente, @debora_dpio! A melhor estratégia, ..."
4,4,"[{'domain': {'id': '29', 'name': 'Events [Enti...",2022-10-28 20:51:48+00:00,['1586098430363017217'],1586098430363017217,"Eu tô com vocês, @RDJBrazil! Essa semana, esta..."
...,...,...,...,...,...,...
95,95,"[{'domain': {'id': '131', 'name': 'Unified Twi...",2021-08-27 19:02:17+00:00,['1431331249935638529'],1431331249935638529,RT @fp_coalition: Our CO2 emissions are the la...
96,96,"[{'domain': {'id': '131', 'name': 'Unified Twi...",2021-08-25 21:11:46+00:00,['1430639061329268736'],1430639061329268736,Here with our crew over at Bose Downey X=Chang...
97,97,"[{'domain': {'id': '131', 'name': 'Unified Twi...",2021-08-20 18:15:26+00:00,['1428782744134316032'],1428782744134316032,RT @fp_coalition: Here’s @RobertDowneyJr with ...
98,98,"[{'domain': {'id': '131', 'name': 'Unified Twi...",2021-08-18 17:57:25+00:00,['1428053433030905861'],1428053433030905861,"My good friend Valentino Vettori, the mastermi..."


In [3]:
tweets = df['text']
tweets

0     At the heart of every family (if you’re fortun...
1     He was a groundbreaking filmmaker. As well as ...
2     RT @MarkRuffalo: Parabéns aos heróis da democr...
3     Excelente, @debora_dpio! A melhor estratégia, ...
4     Eu tô com vocês, @RDJBrazil! Essa semana, esta...
                            ...                        
95    RT @fp_coalition: Our CO2 emissions are the la...
96    Here with our crew over at Bose Downey X=Chang...
97    RT @fp_coalition: Here’s @RobertDowneyJr with ...
98    My good friend Valentino Vettori, the mastermi...
99    RT @fp_coalition: Explore underwater worlds, f...
Name: text, Length: 100, dtype: object

In [7]:
type(tweets)

pandas.core.series.Series

# Processing Tweets

In [4]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

processedTweets = []

stopWords = set(stopwords.words('english'))

for tweet in tweets:
    # Convert the tweet to lowercase using REGEX
    for f in re.findall("([A-Z]+)", tweet):
        tweet = tweet.replace(f, f.lower())

    # Removing special characters and replacing them with a space
    tweet = re.sub("[^A-Za-z0-9]", " ", tweet, 0, re.IGNORECASE)
    
    # From a single sentence, store all the words 
    wordsInTweet = nltk.word_tokenize(tweet)
    
    # Filter out all the stop words
    wordsInTweet = [word for word in wordsInTweet if word not in stopWords]
    
    # Stem each of the tweets
    wordsInTweet = [stemmer.stem(word) for word in wordsInTweet if word not in stopWords]
    
    # Lemmatize each of the tweets
    wordsInTweet = [lemmatizer.lemmatize(word) for word in wordsInTweet]
    
    # Append them to a list
    processedTweets.append(wordsInTweet)
    
# print(processedTweets)

In [5]:
processedTweetsDataFrame = pd.DataFrame(data=processedTweets)
processedTweetsDataFrame.to_csv('processed-tweets.csv')

processedTweetsDataFrame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,heart,everi,famili,fortun,find,mother,fierc,sophist,protector,nurtur,...,,,,,,,,,,
1,groundbreak,filmmak,well,dad,uncompl,sr,documentari,tell,stori,robert,...,,,,,,,,,,
2,rt,markruffalo,parab,n,ao,da,democracia,brasileira,pela,primeira,...,,,,,,,,,,
3,excelent,debora,dpio,melhor,estrat,gia,assim,como,melhor,tecnologia,...,nemtodoh,iusacapa,http,co,tb5chosuay,,,,,
4,eu,com,voc,rdjbrazil,essa,semana,estamo,todo,unido,pelo,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,rt,fp,coalit,co2,emiss,largest,contributor,climat,chang,compani,...,,,,,,,,,,
96,crew,bose,downey,x,chang,bdx,alway,ask,x,throw,...,,,,,,,,,,
97,rt,fp,coalit,robertdowneyjr,littl,quot,day,frank,lloyd,wright,...,,,,,,,,,,
98,good,friend,valentino,vettori,mastermind,geniu,behind,arcadiaearth,use,immers,...,,,,,,,,,,


---