In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [2]:
tw_path = open("/Users/basho/fadouaproject/SafeWater/files/twData.csv","r")
tw_data = pd.read_csv(tw_path, header=0)

In [3]:
def data_desc(data):
    print("Columns names",data.columns)
    print("Size", data.shape)
    print("Basic statistics", data.describe())

data_desc(tw_data)

Columns names Index(['Timestamp', 'TwDate', 'TwLoc', 'TwUserName', 'TwUserID', 'TwID',
       'TwContent', 'ContentLoc', 'urls', 'Unnamed: 9'],
      dtype='object')
Size (284, 10)
Basic statistics            TwUserID          TwID
count  2.840000e+02  2.840000e+02
mean   2.760923e+17  1.039389e+18
std    4.386059e+17  2.867855e+17
min    8.057970e+05  0.000000e+00
25%    1.454432e+08  1.105132e+18
50%    5.346214e+08  1.122058e+18
75%    7.700654e+17  1.124268e+18
max    1.123009e+18  1.141677e+18


In [4]:
tweets = tw_data.TwContent.values

## 1. Normalization pipeline
### 1.1 Lower case

In [5]:
def lower_case(text):
    text_lower = text.lower()
    
    return text_lower

In [6]:
def lower_case_collection(array):
    array_lower = [lower_case(text) for text in array]
    
    return array_lower

### 1.2  Remove punctuation

In [7]:
def remove_punctuation(text):
    obj = str.maketrans('', '',string.punctuation)
    text_unpunct = text.translate(obj)
    
    return text_unpunct

def remove_punctuation_collection(array):
    array_unpunct = [remove_punctuation(text) for text in array]
    
    return array_unpunct

In [8]:
#remove_punctuation_collection(tweets)

### 1.3 Tokenization

In [9]:
def tokenize_word(text):
    words = word_tokenize(text)
    
    return words

def tokenize_word_collection(array):
    array_wtokens = [tokenize_word(text) for text in array]
    
    return array_wtokens

In [10]:
#tokenize_word_collection(tweets)

In [11]:
def tokenized_sent(text):
    sentences = sent_tokenize(text)
    
    return sentences

def tokenize_sent_collection(array):
    array_stokens = [tokenized_sent(text) for text in array]
    
    return array_stokens

In [12]:
#tokenize_sent_collection(tweets)

### 1.4 appos
No appos in French?

### 1.5 stopwords

In [13]:
from nltk.corpus import stopwords 

In [14]:
stop_words = set(stopwords.words("french"))

In [15]:
def remove_stopwords(text):
    filtered_sentence = [] 
    word_tokens = word_tokenize(text)                                                
    filtered_sentence = [w for w in word_tokens if not w in stop_words]               
    text_nostopwords =' '.join(filtered_sentence) 
    
    return text_nostopwords

def remove_stopwords_collection(array):
    array_nostopwords = [remove_stopwords(text) for text in array]

    return array_nostopwords

In [16]:
#remove_stopwords_collection(tweets)

### 1.6 Remove objects (urls, undesired symbols, etc.)

In [66]:
import re
url_pattern = r'(https?:\/\/)(\s)?(www\.)?(\s?)(\w+\.)*([\w\-\s]+\/)*([\w-]+)\/?'
space_pattern = u'\xa0'
#nonalphanumeric_pattern = r'[^a-zA-z0-9\s]'
nonalphanumeric_pattern = r'[^a-zA-z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ\s]'

In [67]:
def remove_pattern(text, pattern):
    text_clean = re.sub(pattern, '', text)
    
    return text_clean

def remove_pattern_collection(array, url_pattern):
    array_clean = [remove_pattern(text, url_pattern) for text in array]

    return array_clean

In [19]:
#remove_pattern_collection(tweets, url_pattern)
#remove_pattern_collection(tweets, space_pattern)
#remove_pattern_collection(tweets, nonalphanumeric_pattern)

### 1.7 Lemmatization

In [20]:
from nltk.stem import WordNetLemmatizer
import spacy  

In [21]:
#!python -m spacy download fr_core_news_sm

In [22]:
#!python -m spacy download fr

In [23]:
sp = spacy.load('fr') 

In [24]:
def lemmatize(sent):
    sentence = sp(sent)
    lemmas = []
    for word in sentence:  
        lemmas.append((word.text, word.lemma_))
    
    new_sentence = map(lambda x:sentence.replace(x[0],x[1]),sentence)
    return lemmas, new_sentence

def lemmatize_collection(array):
    array_stokens = tokenize_sent_collection(array)
    array_lemmas = []
    for para in array_stokens:
        for sent in para:
            array_lemmas.append(lemmatize(sent))
    
    return array_lemmas

In [25]:
lemmatize('La vie est courte')

([('La', 'le'), ('vie', 'vie'), ('est', 'être'), ('courte', 'court')],
 <map at 0x1a2a0d8320>)

In [26]:
lemmatize_collection(tweets)

[([('Les', 'le'),
   ('gouvernorats', 'gouvernorat'),
   ('de', 'de'),
   ('Siliana', 'Siliana'),
   (',', ','),
   ('Kasserine', 'Kasserine'),
   ('et', 'et'),
   ('Jendouba', 'Jendouba'),
   ('souffrent', 'souffrir'),
   ('de', 'de'),
   ('coupures', 'coupure'),
   ('de', 'de'),
   ('l’', 'l’'),
   ('eau', 'eau'),
   ('potable', 'potable'),
   ('https://t.co/j0bbbzzVcp', 'https://t.co/j0bbbzzvcp')],
  <map at 0x1a221c2390>),
 ([('Perturbations', 'perturbation'),
   ('et', 'et'),
   ('coupures', 'coupure'),
   ('de', 'de'),
   ('l’', 'l’'),
   ('approvisionnement', 'approvisionnement'),
   ('en', 'en'),
   ('eau', 'eau'),
   ('potable', 'potable'),
   ('dans', 'dans'),
   ('les', 'le'),
   ('gouvernorats', 'gouvernorat'),
   ('de', 'de'),
   ('Siliana', 'Siliana'),
   (',', ','),
   ('Kasserine', 'Kasserine'),
   ('et', 'et'),
   ('Jendouba', 'jendouba'),
   ('https://t.co/3Sk2V370g0', 'https://t.co/3Sk2V370g0')],
  <map at 0x1a2d9d5da0>),
 ([("L'", 'le'),
   ('approvisionnement', 'ap

## 2. Semantics

Library for French language? 

### 2.1 POS tagging

### 2.2 Chunking

### 2.3 Dependency parsing

### 2.4 Readability features

In [27]:
import textstat

In [28]:
no_url_tweets = remove_pattern_collection(tweets, url_pattern)

In [29]:
text = ''.join(no_url_tweets)
text

'Les gouvernorats de Siliana, Kasserine et Jendouba souffrent de coupures de l’eau potable Perturbations et coupures de l’approvisionnement en eau potable dans les gouvernorats de Siliana, Kasserine et Jendouba L\'approvisionnement en eau potable reprendra, dans la nuit du mardi au mercredi... Perturbations et coupures dans l’approvisionnement en eau potable dans quelques régions à Jendouba et Béja Perturbations dans l’approvisionnement en eau potable dans quelques régions à Jendouba et Béjà La reprise sera progressive... "#Tunisie : Perturbations et coupures dans l’approvisionnement en eau potable à Jendouba et à Béja Tunisie - Tozeur : La SONEDE rassure sur la qualité de l’eau potable Nos gouvernants ont l\'habitude de prendre de l\'eau minérale pdt que les populations sont en quête de l\'eau potable. Les habitants d\'un village Kenyan ont servi de l\'eau sale aux autorités lors d\'une réunion. Cela doit servir de leçon à tous. #Centrafrique  Jendouba nord : le vol des équipements de

In [30]:
textstat.flesch_reading_ease(text)

44.95

In [31]:
textstat.smog_index(text)

14.6

In [32]:
textstat.flesch_kincaid_grade(text)

17.6

In [33]:
textstat.coleman_liau_index(text)

13.25

In [34]:
textstat.automated_readability_index(text)

25.2

In [35]:
textstat.dale_chall_readability_score(text)

7.69

In [36]:
textstat.difficult_words(text)

765

In [37]:
textstat.linsear_write_formula(text)

34.0

In [38]:
textstat.gunning_fog(text)

18.88

In [39]:
textstat.text_standard(text)

'17th and 18th grade'

### 2.5 Topic modelling

In [40]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer

In [159]:
def clean(doc, lem=False):
    no_url = remove_pattern(doc,url_pattern)
    lower_doc = lower_case(no_url)
    no_punc_doc = remove_punctuation(lower_doc)
    # token_doc = tokenize_word(lower_doc) tokinezation has been already covered in remove_stopwords function
    # print(token_doc)
    no_stop_doc = remove_stopwords(no_punc_doc)
    #print(no_stop_doc)
    #no_url = remove_pattern(no_stop_doc,url_pattern)
    #print(no_stop_doc)
    clean = remove_pattern(no_stop_doc,nonalphanumeric_pattern) # letters with accent should be added to nonalphanumeric_pattern, otherwsie removed!
    #clean = no_stop_doc
    if lem:
        print(lemmatize(clean)[0])
        normalized = [w[1] for w in lemmatize(clean)[0]]
        return " ".join(normalized)
    else:
        return clean    
    
def clean_collection(docs, lem=False):
    clean_tweets = [clean(tw).split() for tw in tweets]
    return clean_tweets

In [112]:
doc = tweets[3]
doc

'Perturbations et coupures dans l’approvisionnement en eau potable dans quelques régions à Jendouba et Béja https://t.co/qnFhE9rRU0'

In [113]:
clean(doc, True)

[('perturbations', 'perturbation'), ('coupures', 'coupure'), (' ', ' '), ('approvisionnement', 'approvisionnemer'), ('eau', 'eau'), ('potable', 'potable'), ('quelques', 'quelque'), ('régions', 'région'), ('jendouba', 'jendouba'), ('béja', 'béjer')]


'perturbation coupure   approvisionnemer eau potable quelque région jendouba béjer'

In [115]:
tweets_clean = clean_collection(tweets)

In [116]:
import gensim
from gensim import corpora, models

In [117]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(tweets_clean)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(doc) for doc in tweets_clean]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

In [118]:
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.039*"eau" + 0.026*"potable" + 0.012*"rt"'), (1, '0.023*"leau" + 0.018*"potable" + 0.018*"les"')]


### 2.6 N-grams

In [120]:
def generate_ngrams(text, n):
    words = text.split()
    output = []  
    for i in range(len(words)-n+1):
        output.append(words[i:i+n])
    return output

[generate_ngrams(tw, 2) for tw in tweets]

[[['Les', 'gouvernorats'],
  ['gouvernorats', 'de'],
  ['de', 'Siliana,'],
  ['Siliana,', 'Kasserine'],
  ['Kasserine', 'et'],
  ['et', 'Jendouba'],
  ['Jendouba', 'souffrent'],
  ['souffrent', 'de'],
  ['de', 'coupures'],
  ['coupures', 'de'],
  ['de', 'l’eau'],
  ['l’eau', 'potable'],
  ['potable', 'https://t.co/j0bbbzzVcp']],
 [['Perturbations', 'et'],
  ['et', 'coupures'],
  ['coupures', 'de'],
  ['de', 'l’approvisionnement'],
  ['l’approvisionnement', 'en'],
  ['en', 'eau'],
  ['eau', 'potable'],
  ['potable', 'dans'],
  ['dans', 'les'],
  ['les', 'gouvernorats'],
  ['gouvernorats', 'de'],
  ['de', 'Siliana,'],
  ['Siliana,', 'Kasserine'],
  ['Kasserine', 'et'],
  ['et', 'Jendouba'],
  ['Jendouba', 'https://t.co/3Sk2V370g0']],
 [["L'approvisionnement", 'en'],
  ['en', 'eau'],
  ['eau', 'potable'],
  ['potable', 'reprendra,'],
  ['reprendra,', 'dans'],
  ['dans', 'la'],
  ['la', 'nuit'],
  ['nuit', 'du'],
  ['du', 'mardi'],
  ['mardi', 'au'],
  ['au', 'mercredi...'],
  ['mercredi..

## 3. Similarity

### 3.1 Edit similarity

In [121]:
def LD(s,t):
    s = ' ' + s      #-------------------------------------STEP:1
    t = ' ' + t      #--------------------------------------STEP:2
    d = {}
    S = len(s)
    T = len(t)
    for i in range(S):
        d[i, 0] = i #---------------------------------------STEP:3
    for j in range (T):
        d[0, j] = j #---------------------------------------STEP:4
    for j in range(1,T):
        for i in range(1,S):
            if s[i] == t[j]:
                d[i, j] = d[i-1, j-1]
            else:
                d[i, j] = min(d[i-1, j], d[i, j-1], d[i-1, j-1]) + 1
    return d[S-1, T-1]

In [122]:
string1="potable"
string2="perturbation"

In [129]:
LD(string1, string2)

9

In [131]:
tweets[1]

'Perturbations et coupures de l’approvisionnement en eau potable dans les gouvernorats de Siliana, Kasserine et Jendouba https://t.co/3Sk2V370g0'

In [147]:
tweets[0].split()

['Les',
 'gouvernorats',
 'de',
 'Siliana,',
 'Kasserine',
 'et',
 'Jendouba',
 'souffrent',
 'de',
 'coupures',
 'de',
 'l’eau',
 'potable',
 'https://t.co/j0bbbzzVcp']

In [139]:
list(map(lambda x:LD("potable",x),tweets[1].split()))

[11, 6, 6, 6, 16, 7, 6, 0, 6, 6, 11, 6, 7, 8, 6, 8, 21]

In [216]:
def compare_tweets(tw1, tw2, cl=False, lem=False):
    
    if clean:
        tw1 = clean(tw1, lem)
        tw2 = clean(tw2, lem)
    
     
    LD_matrix = []
    for w in tw1.split():
        LD_matrix_row = []
        for w2 in tw2.split():
            LD_matrix_row.append(LD(w, w2))

        LD_matrix.append(LD_matrix_row)
    
    data = pd.DataFrame(LD_matrix, columns=tw2.split(), index=tw1.split())
    
    return data

In [218]:
ld = compare_tweets(tweets[0],tweets[1], cl=True)

In [224]:
ld

Unnamed: 0,perturbations,coupures,approvisionnement,eau,potable,les,gouvernorats,siliana,kasserine,jendouba
les,11,6,16,3,6,0,10,6,8,7
gouvernorats,10,8,15,10,11,10,0,10,10,10
siliana,11,8,14,6,7,6,10,0,7,7
kasserine,10,8,14,8,8,8,10,7,0,9
jendouba,10,8,16,6,8,7,10,7,9,0
souffrent,11,5,12,8,7,8,8,7,7,9
coupures,10,0,15,7,6,6,8,8,8,8
eau,11,7,16,0,6,3,10,6,8,6
potable,10,6,14,6,0,6,11,7,8,8


In [None]:
# define a metric for similarity between 2 sentences
# Paper: SHORT TEXT SIMILARITY ALGORITHM BASED ON THE EDIT DISTANCE AND THESAURUS

### 3.2 Cosine similarity

In [225]:
import math
from collections import Counter #A counter is a container that stores elements as dictionary keys, 
                                 # and their counts are stored as dictionary values.

In [226]:
def text_to_vector(text): 
    words = text.split() 
    return Counter(words)

In [228]:
text_to_vector(clean(tweets[0]))

Counter({'les': 1,
         'gouvernorats': 1,
         'siliana': 1,
         'kasserine': 1,
         'jendouba': 1,
         'souffrent': 1,
         'coupures': 1,
         'eau': 1,
         'potable': 1})

In [229]:
text_to_vector(clean(tweets[1]))

Counter({'perturbations': 1,
         'coupures': 1,
         'approvisionnement': 1,
         'eau': 1,
         'potable': 1,
         'les': 1,
         'gouvernorats': 1,
         'siliana': 1,
         'kasserine': 1,
         'jendouba': 1})

In [230]:
def get_cosine(vec1, vec2):
    common = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in common])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()]) 
    sum2 = sum([vec2[x]**2 for x in vec2.keys()]) 
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
   
    if denominator:
        return float(numerator) / denominator
    else:
        return 0.0

In [240]:
tw1_vec = text_to_vector(clean(tweets[0]))
print("tweet 1: ",clean(tweets[0]))
tw2_vec = text_to_vector(clean(tweets[1]))
print("tweet 2: ",clean(tweets[1]))
get_cosine(tw1_vec,tw2_vec)

tweet 1:  les gouvernorats siliana kasserine jendouba souffrent coupures  eau potable
tweet 2:  perturbations coupures  approvisionnement eau potable les gouvernorats siliana kasserine jendouba


0.8432740427115678

In [241]:
tw1_vec = text_to_vector(clean(tweets[0]))
print("tweet 1: ",clean(tweets[0]))
tw2_vec = text_to_vector(clean(tweets[10]))
print("tweet 2: ",clean(tweets[10]))
get_cosine(tw1_vec,tw2_vec)

tweet 1:  les gouvernorats siliana kasserine jendouba souffrent coupures  eau potable
tweet 2:  vol équipements sonede prive plusieurs régions  eau potable


0.2357022603955158