# Data Representation in Natural Language Processing

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import gensim

In [6]:
# Hand the path straight to pandas: read_csv opens and closes the file itself,
# so no file handle stays open for the lifetime of the kernel.
tw_path = "/Users/basho/fadouaproject/SafeWater/files/twData.csv"
tw_data = pd.read_csv(tw_path, header=0)
# Raw tweet texts, taken from the TwContent column.
tweets = tw_data.TwContent.values

## 1. CountVectorizer

In [7]:
def count_vec(text):
    """Build the bag-of-words count matrix for a collection of documents.

    Args:
        text: iterable of raw document strings.

    Returns:
        Dense array with one row per document and one column per vocabulary
        term learned from ``text``.
    """
    # fit_transform learns the vocabulary and produces the counts in one call,
    # replacing the separate fit/transform steps and the unused local that
    # used to hold the fitted vectorizer.
    doc_term_matrix = CountVectorizer().fit_transform(text)
    return doc_term_matrix.toarray()

In [8]:
count_vec([tweets[0]])

array([[1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

## 2. Word embedding

### 2.1 Making the corpus

In [9]:
from ipynb.fs.full.Water_nlp import clean_collection
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

[('perturbations', 'perturbation'), ('coupures', 'coupure'), (' ', ' '), ('approvisionnement', 'approvisionnemer'), ('eau', 'eau'), ('potable', 'potable'), ('quelques', 'quelque'), ('régions', 'région'), ('jendouba', 'jendouba'), ('béja', 'béjer')]
tweet 1:  les gouvernorats siliana kasserine jendouba souffrent coupures  eau potable
tweet 2:  perturbations coupures  approvisionnement eau potable les gouvernorats siliana kasserine jendouba
tweet 1:  les gouvernorats siliana kasserine jendouba souffrent coupures  eau potable
tweet 2:  vol équipements sonede prive plusieurs régions  eau potable


In [10]:
# Directory that receives one plain-text file per document.
corpusdir = 'corpus/'
# exist_ok=True replaces the check-then-create sequence, so there is no window
# in which the directory can appear between the test and the mkdir call.
os.makedirs(corpusdir, exist_ok=True)

In [11]:
def make_text(data):
    """Write every text in ``data`` to its own numbered file inside ``corpusdir``.

    Files are named ``1.txt``, ``2.txt``, ... in iteration order, and each text
    is echoed to stdout before it is written.

    Args:
        data: iterable of strings, one document per item.
    """
    for number, text in enumerate(data, start=1):
        print(text)
        target = os.path.join(corpusdir, str(number) + '.txt')
        # The context manager closes every file, not only the last one, and an
        # empty ``data`` no longer fails on an unbound file handle.
        # UTF-8 is pinned so accented characters are written identically on
        # every platform.
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(text)

In [12]:
# Run every tweet through clean_collection (lem=True presumably switches on
# lemmatisation — confirm in Water_nlp).
# NOTE(review): no visible cell calls make_text(data), so on a fresh kernel the
# corpus directory may still be empty when it is read below — confirm.
# NOTE(review): the name `data` is rebound further down to the result of
# most_similar, so it does not keep the cleaned tweets for the whole notebook.
data = clean_collection(tweets, lem=True)

In [13]:
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

In [14]:
# Access the plain text; returns a single string.
#newcorpus.raw().strip()

# Access paragraphs in the corpus (list of list of list of strings).
# NOTE: the corpus reader tokenizes sentences and words itself (the original
#       note named nltk.tokenize.sent_tokenize / word_tokenize — not verified here).
#
# Each element in the outermost list is a paragraph,
# each paragraph contains sentence(s), and
# each sentence contains token(s).
#newcorpus.paras()
#newcorpus.paras(newcorpus.fileids()[43])

# Access sentences in the corpus (list of list of strings).
# NOTE: the texts are flattened into sentences, each of which is a list of tokens.
#newcorpus.sents()
#newcorpus.sents(newcorpus.fileids()[103])

# Access just the tokens/words in the corpus (list of strings).
#newcorpus.words()
# Tokens of the file at index 56 of fileids().
newcorpus.words(newcorpus.fileids()[56])

['ariana', 'pertrubation', 'distribution', 'deau', ...]

### 2.2 Word2Vec via gensim

In [15]:
model = gensim.models.Word2Vec(newcorpus.sents())

In [16]:
X= list(model.wv.vocab)
X

['les',
 'jendouba',
 'coupures',
 'eau',
 'potable',
 'nord',
 'vol',
 'équipements',
 'sonede',
 'plusieurs',
 'régions',
 'leau',
 'travaux',
 'rt',
 'électricité',
 'a',
 'bangui',
 'odilon236',
 'centrafrique',
 'gouvernement',
 'périphéries',
 'faut',
 'aussi',
 'tunisie',
 'prix',
 'enfants',
 'vie',
 'accès',
 'assainissement',
 'millions',
 'personnes',
 'sans',
 'dun',
 'dune',
 'avant',
 'deau',
 'travers',
 'milliards',
 'cette',
 'droit',
 'reprise',
 'approvisionnement',
 'région',
 'toujours',
 'via',
 'selon',
 'peau',
 'plus',
 'besoin',
 'dhydratation',
 'peaux',
 'mature',
 'délicates',
 'sensibles',
 'fragiles',
 'voici',
 'bonheur',
 'rééquilibrante',
 'calmante',
 'adoucissante',
 'apaisante',
 'disponible',
 'immédiatement',
 'intestin',
 'ni',
 'beaucoup',
 'jamais',
 'cest',
 'sait',
 'reconnu',
 'nont',
 'acaweadvocate',
 'reportage',
 'mort',
 '4',
 'recherche',
 'wolordé',
 'lextrême',
 'cameroun',
 'mesure',
 'con',
 'bouteille',
 'depuis',
 'là',
 'pénurie

In [17]:
# Nearest neighbours of 'eau' in the trained embedding. Queried through
# model.wv because calling most_similar on the model object itself is
# deprecated and emits a warning.
# NOTE: this rebinds `data`, which earlier held the cleaned tweets.
data = model.wv.most_similar('eau')
data

  """Entry point for launching an IPython kernel.


[('potable', 0.6736690998077393),
 ('mature', 0.6441358327865601),
 ('leau', 0.6415693759918213),
 ('rt', 0.597144365310669),
 ('fragiles', 0.5935059189796448),
 ('unités', 0.5852980613708496),
 ('voici', 0.5848841071128845),
 ('épineux', 0.5736327171325684),
 ('devient', 0.5702157616615295),
 ('faut', 0.5639740824699402)]

In [18]:
# Ask the embedding which of the three words fits the others least. Queried
# through model.wv because doesnt_match on the model object itself is
# deprecated and emits a warning.
dissimlar_words = model.wv.doesnt_match('coupure eau potable'.split())
print(dissimlar_words)

coupure


  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [19]:
def similarity_two_words(w1, w2):
    """Print and return the similarity the global ``model`` assigns to two words.

    Args:
        w1: first word.
        w2: second word.

    Returns:
        The value returned by the model's ``similarity(w1, w2)``.
    """
    # A trained Word2Vec exposes its vectors under ``.wv`` (querying the model
    # object directly is deprecated); an object without ``.wv`` is used as-is.
    keyed_vectors = getattr(model, "wv", model)
    sim = keyed_vectors.similarity(w1, w2)
    print("The similarity between <{}> and <{}>: ".format(w1, w2), sim)
    return sim

In [20]:
similarity_two_words('eau', 'potable')

The similarity between <eau> and <potable>:  0.6736691


  


0.6736691

In [21]:
similarity_two_words('eau', 'coupure')

The similarity between <eau> and <coupure>:  0.53463686


  


0.53463686

In [22]:
similarity_two_words('eau', 'jendouba')

The similarity between <eau> and <jendouba>:  -0.00029225182


  


-0.00029225182

In [19]:
similarity_two_words('eau', 'tunis')

The similarity between <eau> and <tunis>:  0.43528774


  


0.43528774

### 2.3 Pre-trained word embedding

We try two pre-trained models:
- Godin word2vec for Twitter: https://fredericgodin.com/software/
--> Not suited to tweets written in French.
- fastText for French documents: https://fasttext.cc/docs/en/support.html
--> Too general: not adapted to Twitter or to our topic (water).

#### 2.3.1 Godin model

In [20]:
from gensim.models import KeyedVectors

In [21]:
path = '/Users/basho/fadouaproject/SafeWater/model/word2vec_twitter_model/word2vec_twitter_model.bin'

In [22]:
model = KeyedVectors.load_word2vec_format(path, binary=True, unicode_errors='ignore')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [42]:
# Access vectors for specific words with a keyed lookup:
vector = model['eau']
#vector

In [43]:
vector.shape

AttributeError: 'list' object has no attribute 'shape'

In [44]:
#vector = model['gouvernorats'] # gouvernorats not included in the vocabulary. Godin word2vec is not for french language
#vector

#### 2.3.2 fastText

In [2]:
import fasttext

In [3]:
def make_data_from_tweets(tweets, path='data.txt', mode='a'):
    """Write the tweets to a text file, one tweet per line.

    Args:
        tweets: iterable of strings.
        path: destination file; defaults to ``data.txt`` in the working
            directory.
        mode: file mode. ``'a'`` (the default) appends to an existing file;
            pass ``'w'`` to replace it so that re-running the cell does not
            duplicate the data.
    """
    with open(path, mode, encoding='utf-8') as out:
        for tw in tweets:
            # Terminate every tweet with a newline: without a separator the
            # last word of one tweet fuses with the first word of the next.
            out.write(tw if tw.endswith('\n') else tw + '\n')

In [4]:
make_data_from_tweets(clean_collection(tweets, lem=True))

NameError: name 'clean_collection' is not defined

In [29]:
# Location of the cc.fr.300 fastText file on the author's machine.
path = '/Users/basho/fadouaproject/SafeWater/model/cc.fr.300.bin'

# CBOW model
# NOTE(review): `path` points at the cc.fr.300 file, yet the vectors this model
# returns further down have length 100. Presumably cbow() trains a new model on
# data.txt and treats its second argument as an output location instead of
# loading the pre-trained vectors — confirm against the fasttext API in use.
# NOTE: this rebinds `model`, replacing the model loaded in the Godin section.
model = fasttext.cbow('/Users/basho/fadouaproject/SafeWater/data.txt', path)

# Access vectors for specific words with a keyed lookup:
#

In [30]:
vector = model['eau']
#vector

In [31]:
len(vector)

100

In [32]:
vector = model['gouvernorats']
#vector

In [33]:
# One model lookup per item returned by clean_collection.
vectors = [model[tweet] for tweet in clean_collection(tweets, lem=True)]

In [34]:
vectors=np.array(vectors)

In [35]:
embedding_matrix=np.vstack(vectors)
embedding_matrix

array([[ 0.01510056, -0.04534726, -0.02222504, ..., -0.03518727,
         0.15241031,  0.12867574],
       [ 0.01911572, -0.06287023, -0.03116055, ..., -0.04759455,
         0.20960853,  0.17671601],
       [ 0.01325244, -0.04773147, -0.02495549, ..., -0.03608626,
         0.16396153,  0.1380997 ],
       ...,
       [ 0.00328554, -0.00538237, -0.00224848, ..., -0.0023036 ,
         0.00906369,  0.00774935],
       [ 0.01953404, -0.07345748, -0.03722223, ..., -0.05470436,
         0.24624388,  0.20672752],
       [ 0.00290398, -0.00419246, -0.00149947, ..., -0.00213428,
         0.00539392,  0.00481484]])

In [36]:
embedding_matrix.shape

(286, 100)

In [37]:
model.cosine_similarity('eau','tunis')

0.9991000063539089

In [38]:
model.cosine_similarity('eau','coupure')

0.9998339438408912

In [39]:
model.cosine_similarity('eau','projet')

0.9994097436713149

In [40]:
model.cosine_similarity('eau','distribution')

0.9996437167357842

In [41]:
model.words #.difference('eau','coupure')

{'108',
 '12',
 '15',
 '19',
 '2',
 '20',
 '2010',
 '2019',
 '3',
 '35',
 '4',
 '5',
 '55',
 '611',
 '63',
 '70',
 '85rt',
 'a',
 'absorbe',
 'acaweadvocate',
 'accès',
 'adoucissante',
 'analysé',
 'annonce',
 'années',
 'ans',
 'antiâge',
 'apaisante',
 'apprend',
 'approvisionnement',
 'ariana',
 'arrêt',
 'assainissement',
 'astes',
 'augmentation',
 'augmente',
 'aujourd',
 'aussi',
 'autant',
 'autorités',
 'avant',
 'baie',
 'balbutiante',
 'bangui',
 'basé',
 'beaucoup',
 'besoin',
 'bien',
 'boire',
 'bonheur',
 'bouteille',
 'calmante',
 'cameroun',
 'canal',
 'candidat',
 'capacité',
 'casa',
 'centrafrique',
 'cest',
 'cette',
 'cettert',
 'chambres',
 'chercheurs',
 'choix',
 'chronique',
 'citoyens',
 'comme',
 'commun',
 'compose',
 'conrt',
 'consacrés',
 'contre',
 'coucher',
 'coupe',
 'coupure',
 'coupures',
 'crise',
 'cuisine',
 'deau',
 'demain',
 'denfants',
 'depuis',
 'dernières',
 'desserte',
 'desservant',
 'deux',
 'devient',
 'dhydratation',
 'dinars',
 'di

In [1]:
classifier = fasttext.supervised('/Users/basho/fadouaproject/SafeWater/data.txt', path)

NameError: name 'fasttext' is not defined

In [None]:
result = classifier.test('Coupure deau prevue a Tunis et Ariana le soir')