# Text Pre-processing Level 1 (TPL 1) - Data Cleaning

---

# Import dependencies

In [None]:
# nltk dependencies
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# To pre-process text
import re

import pandas as pd

# Fetch the 'punkt' tokenizer data (tokenization is done with
# nltk.word_tokenize in the processing cell)
nltk.download('punkt')

# Fetch the 'wordnet' data for the lemmatizer (WordNetLemmatizer is imported above)
nltk.download('wordnet')

# Fetch the stop-word corpus read through nltk.corpus.stopwords
nltk.download('stopwords')

# Import Tweets

In [None]:
# Load the tweets from the CSV file into a DataFrame.
df = pd.read_csv('tweet.csv')

# DataFrame.drop returns a new frame instead of modifying `df` in place, so
# the result has to be assigned back for the column to actually be removed.
# NOTE(review): 'Unnamed: 0' is presumably an index column written out by an
# earlier to_csv call -- confirm against how tweet.csv was produced.
df = df.drop(['Unnamed: 0'], axis=1)

# Display the frame as the cell output.
df

In [None]:
# Select the 'text' column; the processing cell iterates over these values.
tweets = df['text']
# Display the selected column as the cell output.
tweets

# Processing Tweets

In [59]:
# Clean every tweet: lower-case -> tokenize -> drop English stop words ->
# stem -> lemmatize. The result, `processedTweets`, is a list holding one
# list of processed tokens per tweet, in the same order as `tweets`.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once. Constructing it inside the comprehension's
# condition rebuilt the whole set for every single token of every tweet.
englishStopWords = set(stopwords.words('english'))

processedTweets = []

for tweet in tweets:
    # str.lower() lower-cases every cased character. Matching "[A-Z]+" only
    # covered ASCII capitals, so accented capitals stayed upper-case and
    # produced tokens that differ from their lower-case spelling.
    tweet = tweet.lower()

    # From a single tweet, store all the tokens (words and punctuation)
    wordsInTweet = nltk.word_tokenize(tweet)

    # Filter out all the stop words
    wordsInTweet = [word for word in wordsInTweet if word not in englishStopWords]

    # Stem each remaining token. Stop words were already removed above, so
    # no second stop-word check is needed here.
    wordsInTweet = [stemmer.stem(word) for word in wordsInTweet]

    # Lemmatize each stemmed token
    wordsInTweet = [lemmatizer.lemmatize(word) for word in wordsInTweet]

    # Append this tweet's tokens to the overall result
    processedTweets.append(wordsInTweet)

print(processedTweets)

[['heart', 'everi', 'famili', '(', '’', 'fortun', ')', ',', '’', 'find', 'mother', '.', 'fierc', 'sophist', 'protector', ',', 'nurtur', ',', 'teacher', 'friend', '...', 'susan', 'downey', ',', '(', 'party-s', 'bag', 'chip', ')', '.', 'http', ':', '//t.co/88uxwbxs9l'], ['groundbreak', 'filmmak', '.', 'well', 'dad', '.', '’', 'uncomplicated…', '”', 'sr.', '”', 'documentari', 'tell', 'stori', 'robert', 'downey', '’', 'maverick', 'rise', 'nyc', ',', 'crash', ',', 'burn', 'redempt', 'hollywood', 'relationship', 'aftermath', '.', 'netflix', ',', 'decemb', '2nd', '.', 'http', ':', '//t.co/uqbwihmffw'], ['rt', '@', 'markruffalo', ':', 'parabén', 'ao', 'herói', 'da', 'democracia', 'brasileira', '!', 'pela', 'primeira', 'vez', ',', 'número', 'de', 'voto', 'aument', 'segundo', 'turno', '.', 'o', 'brasileir…'], ['excelent', ',', '@', 'debora_dpio', '!', 'melhor', 'estratégia', ',', 'assim', 'como', 'melhor', 'tecnologia', ',', 'é', 'simpl', 'e', 'eficient', '.', 'vc', 'consegu', 'mudar', 'mundo', 

---