# NLP preprocessing of twitter

Author: Fadoua Ghourabi (fadouaghourabi@gmail.com)

Date: July 09, 2019

This library provides functions for preprocessing tweets such as tokenization, removing stopwords and some patterns, lemmatization, etc. The main function ``clean_collection`` is a pipeline of preprocessing functions adapted to our purpose. It is exported to make corpora and to convert preprocessed tweet into vector representation.

<font color="red">The functions use libraries for processing text in french. We observe various issues that we wish to address gradually to improve the</font>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import string
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [2]:
# file contains manually labeled tweets
tw_path = "../datasets/twData_clean_labeled.csv"
tw_data = pd.read_csv(tw_path, header=0)

In [3]:
tweets = tw_data.TwContent.values

In [4]:
#data_desc(tw_data)

## 1. Normalization pipeline
### 1.1 Lower case

In [5]:
def lower_case(text):
    '''
    - Description:
    lower_case converts a text to lower case
    - History:
    June 9, 2019 --> implementation
    '''
    text_lower = text.lower()
    
    return text_lower

In [6]:
def lower_case_collection(array):
    '''
    - Description:
    lower_case_collection converts texts in a list to lower case
    June 9, 2019 --> implementation
    '''
    array_lower = [lower_case(text) for text in array]
    
    return array_lower

In [7]:
#lower_case_collection(tw_data.TwContent)

### 1.2  Remove punctuation

In [8]:
def remove_punctuation(text):
    '''
    - Description:
    remove_punctuation removes punctuation from a text/tweet. 
    Note: "://", "/" and "." in urls will be removed. 
    To not loose the url information, must remove punctuation after extracting urls.
    - History:    
    June 9, 2019 --> implementation (credit to Shivangi)
    to fix: 
    string.punctuation: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', apostrophe should be removed?
    '''
    # str.maketrans construct a mapping table where 
    # the 1st parameter is replaced by the 2nd parameter 
    # and third parameter is removed  
    obj = str.maketrans('', '',string.punctuation)
    
    # text.translate applies the mapping table
    text_unpunct = text.translate(obj)
    
    return text_unpunct # text with no punctuation

In [9]:
def remove_punctuation_collection(array):
    array_unpunct = [remove_punctuation(text) for text in array]
    
    return array_unpunct

In [18]:
#remove_punctuation_collection(lower_case_collection(tweets))

### 1.3 Tokenization

In [19]:
def tokenize_word(text):
    '''
    - Description:
    tokenize_word performs the tokenization of a tweet.
    - Output:
    words: a list of words in a tweet
    - History:
    June 9, 2019 --> implementation
    to fix: tokenization of "l'eau"
    '''
    words = word_tokenize(text)
    
    return words

In [20]:
def tokenize_word_collection(array):
    array_wtokens = [tokenize_word(text) for text in array]
    
    return array_wtokens

In [21]:
#tokenize_word_collection(tweets)

In [22]:
def tokenized_sent(text):
    '''
    - Description:
    tokenize_sent performs the sentence tokenization of a tweet. 
    A tweet can be 1 or more sentenses delimited by ".".
    - Output:
    sentences: a list of sentences in a tweet
    - History:
    June 9, 2019 --> implementation, probably not useful.
    '''
    sentences = sent_tokenize(text)
    
    return sentences

In [23]:
def tokenize_sent_collection(array):
    array_stokens = [tokenized_sent(text) for text in array]
    
    return array_stokens

In [24]:
#tokenize_sent_collection(tweets)

### 1.4 appos
No appos in French?

### 1.5 stopwords

In [25]:
#import nltk
#nltk.download(stopwords) # uncomment to download required corpora, e.g. stopwords, punkt
#nltk.download('punkt')

In [26]:
from nltk.corpus import stopwords 

In [27]:
# the list of french stopwords is downloaded from nltk corpus
# the stopwords are in lower case, thus must call lower_case before removing stopwords
# Careful: the list is not complete, e.g. "les" is not included.
stop_words = set(stopwords.words("french"))

In [28]:
def remove_stopwords(text, stop_words):
    '''
    - Description:
    remove_stopwords removes stopwords from a tweet. 
    - History:
    June 9, 2019 --> implementation, to fix: stop_words list is not complete
    '''
    filtered_sentence = [] 
    # the tweet is tokenized before searching for stopwords and removing them
    word_tokens = word_tokenize(text)                                                
    filtered_sentence = [w for w in word_tokens if not w in stop_words]   
    # words (-stopwords) are joined in a sentence 
    text_nostopwords =' '.join(filtered_sentence) 
    
    return text_nostopwords

In [29]:
def remove_stopwords_collection(array, stop_words):
    array_nostopwords = [remove_stopwords(text,stop_words) for text in array]

    return array_nostopwords

In [30]:
#remove_stopwords_collection(tweets, stop_words)

### 1.6 Remove objects (urls, undesired symbols, etc.)

In [31]:
import re # regular expressions
### pattern for \n ?
# patterns for twitter elements "#", "rt" and "@"?
# regular expression of urls
url_pattern = r'(https?:\/\/)(\s)?(www\.)?(\s?)(\w+\.)*([\w\-\s]+\/)*([\w-]+)\/?'
# space symbols (e.g. ISO coding) that should be replaced by " "
space_pattern = u'\xa0'
# alphanumeric symbols should be kept
#nonalphanumeric_pattern = r'[^a-zA-z0-9\s]'
nonalphanumeric_pattern = r'[^a-zA-z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ\s]'

In [32]:
def remove_pattern(text, pattern, new=''):
    '''
    - Description:
    remove_pattern either remove or replace a pattern in a text.
    - Input:
    text
    pattern: pattern to be replaced
    new: '' if not specified, otherwise some text 
    - History:
    June 9, 2019 --> implementation
    July 17, 2019 --> replacing with a string (new) is allowed
    '''
    # re.sub searches for pattern in text and replaced by the string new
    # if new is empty, it simply removes pattern from text
    text_clean = re.sub(pattern, new, text)
    
    return text_clean

In [33]:
def remove_pattern_collection(array, url_pattern, new=''):
    array_clean = [remove_pattern(text, url_pattern, new) for text in array]

    return array_clean

In [34]:
#remove_pattern_collection(tweets, url_pattern)
#remove_pattern_collection(tweets, space_pattern, " ")
#remove_pattern_collection(tweets, nonalphanumeric_pattern)

### 1.7 Lemmatization

In [35]:
# !pip install -U spacy # uncomment to install spacy (library for lemmatization, )

In [36]:
# from nltk.stem import WordNetLemmatizer # not used because not adapted to french text
# spaCy is a free open-source library for Natural Language Processing in Python. 
# It features lemmatization, NER, POS tagging, dependency parsing, word vectors...
import spacy  

In [37]:
# !python -m spacy download fr_core_news_sm

In [38]:
# !python -m spacy download fr

In [39]:
sp = spacy.load('fr') # lemmatization for french text

In [40]:
def lemmatize(model, sent):
    '''
    - Description:
    lemmatize computes the lemmatization of a text.
    - Input:
    model: basically spacy for french lemmatization
    sent: a sentence to be lemmatized
    - Output:
    lemmas: pairs of (word,lemmatized instance)
    - History:
    June 9, 2019 --> implementation
    July 17, 2019 --> model as parameter, removed the mapping word to lemmatized instance
    '''
    sentence = model(sent)
    lemmas = []
    for word in sentence:  
        lemmas.append((word.text, word.lemma_))
    
    #new_sentence = map(lambda x:sentence.replace(x[0],x[1]),sentence)
    return lemmas#, new_sentence

In [41]:
def lemmatize_collection(model, array):
    '''
    - Description:
    lemmatize_collection computes the lemmatization of a list of texts.
    - Input:
    model: basically spacy for french lemmatization
    array: list of texts to be lemmatized
    - Output:
    array_lemmas: pairs of (word,lemmatized instance) for each text
    - History:
    June 9, 2019 --> implementation
    July 17, 2019 --> model as parameter, removed the mapping word to lemmatized instance
    '''
    array_stokens = tokenize_sent_collection(array)
    array_lemmas = []
    for para in array_stokens:
        for sent in para:
            array_lemmas.append(lemmatize(model, sent))
    
    return array_lemmas

In [42]:
lemmatize(sp,'La vie est courte')

[('La', 'le'), ('vie', 'vie'), ('est', 'être'), ('courte', 'court')]

In [43]:
# warning: lemmatization of a collection of tweet is slow ~ 8s
#start = time.time()
#lemmatize_collection(sp,tweets)
#end = time.time()
#end - start

## 2. Semantics

Library for French language? 

### 2.1 POS tagging

### 2.2 Chunking

### 2.3 Dependency parsing

### 2.4 Readability features

In [44]:
# !pip install textstat # uncomment to install textstat for evaluating the readability. French?

In [45]:
import textstat

In [46]:
no_url_tweets = remove_pattern_collection(tweets, url_pattern)

In [47]:
text = ''.join(no_url_tweets)
#text

In [48]:
textstat.flesch_reading_ease(text)

41.16

In [49]:
textstat.smog_index(text)

14.3

In [50]:
textstat.flesch_kincaid_grade(text)

17.0

In [51]:
textstat.coleman_liau_index(text)

13.12

In [52]:
textstat.automated_readability_index(text)

23.1

In [53]:
textstat.dale_chall_readability_score(text)

7.53

In [54]:
textstat.difficult_words(text)

1386

In [55]:
textstat.linsear_write_formula(text)

34.0

In [56]:
textstat.gunning_fog(text)

17.04

In [57]:
textstat.text_standard(text)

'16th and 17th grade'

### 2.5 Topic modelling

In [58]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer

In [59]:
def clean(doc, lem=False):
    '''
    '''
    no_url = remove_pattern(doc,url_pattern)
    doc_with_space = remove_pattern(no_url, space_pattern, " ")
    #print(doc_with_space)
    lower_doc = lower_case(doc_with_space)
    no_punc_doc = remove_punctuation(lower_doc)
    # token_doc = tokenize_word(lower_doc) tokinezation has been already covered in remove_stopwords function
    # print(token_doc)
    
    no_stop_doc = remove_stopwords(no_punc_doc,stop_words)
    #print(no_stop_doc)
    #no_url = remove_pattern(no_stop_doc,url_pattern)
    #print(no_stop_doc)
    clean = remove_pattern(no_stop_doc,nonalphanumeric_pattern) # letters with accent should be added to nonalphanumeric_pattern, otherwsie removed!
    #clean = no_stop_doc
    
    if lem:
        #print(lemmatize(sp,clean))
        normalized = [w[1] for w in lemmatize(sp,clean)]
        #print(doc,clean,normalized)
        return " ".join(normalized)
    else:
        return clean    
    

In [60]:
def clean_collection(docs, lem=False):
    clean_tweets = [clean(tw, lem) for tw in docs]
    return clean_tweets

In [61]:
doc = "#RCA : L'eau potable est devenue très inaccessible à Bangui https://t.co/6bJB5z0lPl"
doc

"#RCA : L'eau potable est devenue très inaccessible à Bangui https://t.co/6bJB5z0lPl"

In [62]:
clean(doc, True)

'rca leau potable devenir très inaccessible bangui'

In [63]:
tweets.shape

(535,)

In [64]:
tweets_clean = clean_collection(tweets,lem=True)
#tweets_clean

In [65]:
# !pip install -U gensim # uncomment to install gensim

In [66]:
import gensim #https://pypi.org/project/gensim/
from gensim import corpora, models

In [67]:
tok_doc = [x.split() for x in tweets_clean]

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(tok_doc)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(doc) for doc in tok_doc]

# generate LDA model #### LDA model?? how it works?
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=2, 
                                           id2word = dictionary, 
                                           passes=20)

In [68]:
#print(ldamodel.print_topics(num_topics=10, num_words=2))

In [69]:
dictionary[14]

'mercredi'

In [70]:
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:20]]

[[('coupure', 1),
  ('eau', 1),
  ('gouvernorat', 1),
  ('jendouber', 1),
  ('kasserin', 1),
  ('le', 1),
  ('potable', 1),
  ('siliana', 1),
  ('souffrir', 1)],
 [('coupure', 1),
  ('eau', 1),
  ('gouvernorat', 1),
  ('kasserin', 1),
  ('le', 1),
  ('potable', 1),
  ('siliana', 1),
  ('approvisionnemer', 1),
  ('jendouba', 1),
  ('perturbation', 1)],
 [('eau', 1),
  ('potable', 1),
  ('lapprovisionnement', 1),
  ('mardi', 1),
  ('mercredi', 1),
  ('nuit', 1),
  ('reprendre', 1)],
 [('coupure', 1),
  ('eau', 1),
  ('potable', 1),
  ('approvisionnemer', 1),
  ('jendouba', 1),
  ('perturbation', 1),
  ('béjer', 1),
  ('quelque', 1),
  ('région', 1)],
 [('eau', 1),
  ('potable', 1),
  ('approvisionnemer', 1),
  ('jendouba', 1),
  ('perturbation', 1),
  ('quelque', 1),
  ('région', 1),
  ('béjà', 1)],
 [('progressif', 1), ('reprise', 1)],
 [('coupure', 1),
  ('eau', 1),
  ('potable', 1),
  ('approvisionnemer', 1),
  ('jendouba', 1),
  ('perturbation', 1),
  ('béjer', 1),
  ('tunisie', 1)],

### 2.6 N-grams

In [71]:
def generate_ngrams(text, n):
    words = text.split()
    output = []  
    for i in range(len(words)-n+1):
        output.append(words[i:i+n])
    return output

[generate_ngrams(tw, 2) for tw in tweets]

[[['Les', 'gouvernorats'],
  ['gouvernorats', 'de'],
  ['de', 'Siliana,'],
  ['Siliana,', 'Kasserine'],
  ['Kasserine', 'et'],
  ['et', 'Jendouba'],
  ['Jendouba', 'souffrent'],
  ['souffrent', 'de'],
  ['de', 'coupures'],
  ['coupures', 'de'],
  ['de', 'l’eau'],
  ['l’eau', 'potable'],
  ['potable', 'https://t.co/j0bbbzzVcp']],
 [['Perturbations', 'et'],
  ['et', 'coupures'],
  ['coupures', 'de'],
  ['de', 'l’approvisionnement'],
  ['l’approvisionnement', 'en'],
  ['en', 'eau'],
  ['eau', 'potable'],
  ['potable', 'dans'],
  ['dans', 'les'],
  ['les', 'gouvernorats'],
  ['gouvernorats', 'de'],
  ['de', 'Siliana,'],
  ['Siliana,', 'Kasserine'],
  ['Kasserine', 'et'],
  ['et', 'Jendouba'],
  ['Jendouba', 'https://t.co/3Sk2V370g0']],
 [["L'approvisionnement", 'en'],
  ['en', 'eau'],
  ['eau', 'potable'],
  ['potable', 'reprendra,'],
  ['reprendra,', 'dans'],
  ['dans', 'la'],
  ['la', 'nuit'],
  ['nuit', 'du'],
  ['du', 'mardi'],
  ['mardi', 'au'],
  ['au', 'mercredi...'],
  ['mercredi..

## 3. Similarity

### 3.1 Edit similarity

In [72]:
def LD(s,t):
    s = ' ' + s      #-------------------------------------STEP:1
    t = ' ' + t      #--------------------------------------STEP:2
    d = {}
    S = len(s)
    T = len(t)
    for i in range(S):
        d[i, 0] = i #---------------------------------------STEP:3
    for j in range (T):
        d[0, j] = j #---------------------------------------STEP:4
    for j in range(1,T):
        for i in range(1,S):
            if s[i] == t[j]:
                d[i, j] = d[i-1, j-1]
            else:
                d[i, j] = min(d[i-1, j], d[i, j-1], d[i-1, j-1]) + 1
    return d[S-1, T-1]

In [73]:
string1="potable"
string2="perturbation"

In [74]:
LD(string1, string2)

9

In [75]:
tweets[1]

'Perturbations et coupures de l’approvisionnement en eau potable dans les gouvernorats de Siliana, Kasserine et Jendouba https://t.co/3Sk2V370g0'

In [76]:
tweets[0].split()

['Les',
 'gouvernorats',
 'de',
 'Siliana,',
 'Kasserine',
 'et',
 'Jendouba',
 'souffrent',
 'de',
 'coupures',
 'de',
 'l’eau',
 'potable',
 'https://t.co/j0bbbzzVcp']

In [77]:
list(map(lambda x:LD("potable",x),tweets[1].split()))

[11, 6, 6, 6, 16, 7, 6, 0, 6, 6, 11, 6, 7, 8, 6, 8, 21]

In [78]:
def compare_tweets(tw1, tw2, cl=False, lem=False):
    
    if clean:
        tw1 = clean(tw1, lem)
        tw2 = clean(tw2, lem)
    
     
    LD_matrix = []
    for w in tw1.split():
        LD_matrix_row = []
        for w2 in tw2.split():
            LD_matrix_row.append(LD(w, w2))

        LD_matrix.append(LD_matrix_row)
    
    data = pd.DataFrame(LD_matrix, columns=tw2.split(), index=tw1.split())
    
    return data

In [79]:
ld = compare_tweets(tweets[0],tweets[1], cl=True)

In [80]:
ld

Unnamed: 0,perturbations,coupures,approvisionnement,eau,potable,les,gouvernorats,siliana,kasserine,jendouba
les,11,6,16,3,6,0,10,6,8,7
gouvernorats,10,8,15,10,11,10,0,10,10,10
siliana,11,8,14,6,7,6,10,0,7,7
kasserine,10,8,14,8,8,8,10,7,0,9
jendouba,10,8,16,6,8,7,10,7,9,0
souffrent,11,5,12,8,7,8,8,7,7,9
coupures,10,0,15,7,6,6,8,8,8,8
eau,11,7,16,0,6,3,10,6,8,6
potable,10,6,14,6,0,6,11,7,8,8


In [81]:
# define a metric for similarity between 2 sentences
# Paper: SHORT TEXT SIMILARITY ALGORITHM BASED ON THE EDIT DISTANCE AND THESAURUS

### 3.2 Cosine similarity

In [82]:
import math
from collections import Counter #A counter is a container that stores elements as dictionary keys, 
                                 # and their counts are stored as dictionary values.

In [83]:
def text_to_vector(text): 
    words = text.split() 
    return Counter(words)

In [84]:
text_to_vector(clean(tweets[0]))

Counter({'les': 1,
         'gouvernorats': 1,
         'siliana': 1,
         'kasserine': 1,
         'jendouba': 1,
         'souffrent': 1,
         'coupures': 1,
         'eau': 1,
         'potable': 1})

In [85]:
text_to_vector(clean(tweets[1]))

Counter({'perturbations': 1,
         'coupures': 1,
         'approvisionnement': 1,
         'eau': 1,
         'potable': 1,
         'les': 1,
         'gouvernorats': 1,
         'siliana': 1,
         'kasserine': 1,
         'jendouba': 1})

In [86]:
def get_cosine(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in common])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()]) 
    sum2 = sum([vec2[x]**2 for x in vec2.keys()]) 
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
   
    if denominator:
        return float(numerator) / denominator
    else:
        return 0.0

In [87]:
tw1_vec = text_to_vector(clean(tweets[0]))
#print("tweet 1: ",clean(tweets[0]))
tw2_vec = text_to_vector(clean(tweets[1]))
#print("tweet 2: ",clean(tweets[1]))
get_cosine(tw1_vec,tw2_vec)

0.8432740427115678

In [88]:
tw1_vec = text_to_vector(clean(tweets[0]))
#print("tweet 1: ",clean(tweets[0]))
tw2_vec = text_to_vector(clean(tweets[10]))
#print("tweet 2: ",clean(tweets[10]))
get_cosine(tw1_vec,tw2_vec)

0.2357022603955158