<a href="https://colab.research.google.com/github/EAsencios/DEEP-LEARING/blob/master/fi_on_text_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky es very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'
]
labels = ['weather', 'weather', 'animal', 'animal', 'weather', 'animal']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document':corpus,
                          'Category':labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animal
3,The brown fox is quick and the blue dog is lazy!,animal
4,The sky es very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animal


In [52]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

In [53]:
def normalize_document(doc):
  # lower case and remove special characters\whitespaces
  doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I)
  doc = doc.lower()
  doc = doc.strip()
  # tokenize document
  tokens = wpt.tokenize(doc)
  # filter stopwords out of document
  filtered_tokens = [token for token in tokens if token not in stop_words]
  # re-create document from filtered tokens
  doc = ' '.join(filtered_tokens)
  return doc

In [54]:
normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky es blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U31')

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0, max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0]])

In [56]:
vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)



Unnamed: 0,beautiful,blue,brown,dog,es,fox,jumps,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,0,1,0,1,0
2,0,0,1,1,0,1,1,1,0,1,0,0
3,0,1,1,1,0,1,0,1,0,1,0,0
4,1,1,0,0,1,0,0,0,0,0,2,1
5,0,0,1,1,0,1,0,1,0,1,0,0


In [57]:
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)



Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,es blue,fox jumps,fox quick,jumps lazy,lazy brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue,sky es
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,0
3,0,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0
4,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1
5,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0, max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.around(tv_matrix, 2), columns=vocab)



Unnamed: 0,beautiful,blue,brown,dog,es,fox,jumps,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.38,0.38,0.0,0.38,0.54,0.38,0.0,0.38,0.0,0.0
3,0.0,0.36,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.42,0.0,0.0
4,0.32,0.27,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.64,0.46
5,0.0,0.0,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.45,0.0,0.0


In [59]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.753128,0.0,0.185447,0.717486,0.0
1,0.753128,1.0,0.0,0.139665,0.540359,0.0
2,0.0,0.0,1.0,0.784362,0.0,0.839987
3,0.185447,0.139665,0.784362,1.0,0.097425,0.933779
4,0.717486,0.540359,0.0,0.097425,1.0,0.0
5,0.0,0.0,0.839987,0.933779,0.0,1.0


In [60]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animal,1
3,The brown fox is quick and the blue dog is lazy!,animal,1
4,The sky es very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animal,1


In [61]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features

Unnamed: 0,T1,T2
0,0.191131,0.808869
1,0.177271,0.822729
2,0.846406,0.153594
3,0.817973,0.182027
4,0.168328,0.831672
5,0.839354,0.160646


In [62]:
from sklearn.utils.extmath import weighted_mode
tt_matrix = lda.components_
for topic_weights in tt_matrix:
  topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
  topic = sorted(topic, key=lambda x: -x[1])
  topic = [item for item in topic if item[1] > 0.6]
  print(topic)
  print()

[('brown', 1.7279413066420188), ('dog', 1.7279413066420188), ('fox', 1.7279413066420188), ('lazy', 1.7279413066420188), ('quick', 1.7279413066420188), ('jumps', 1.03310482744227), ('blue', 0.7828939229380227)]

[('sky', 2.1846841668045975), ('beautiful', 1.8668438278226838), ('blue', 1.7557208168256953), ('love', 1.1478159291395924), ('es', 0.9500895007344254), ('today', 0.9500895007344254)]



In [63]:
km = KMeans(n_clusters=2)
km.fit_transform(features)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animal,1
3,The brown fox is quick and the blue dog is lazy!,animal,1
4,The sky es very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animal,1


In [64]:
from gensim.models import word2vec

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Sent values for various parameters
feature_size = 10       # Word vector dimensionality
window_context = 10     # Context window size
min_word_count = 1      # Minimum word count
sample = 1e-3           # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, window=window_context, min_count=min_word_count, sample=sample)
w2v_model.wv['sky']

array([ 0.03272785,  0.01029981, -0.04694484, -0.04454439, -0.0368256 ,
        0.01843927,  0.02390699,  0.01083927,  0.0197903 , -0.00282629],
      dtype=float32)

In [65]:
def average_word_vectors(words, model, vocabulary, num_features):
  feature_vector = np.zeros((num_features,), dtype='float64')
  nwords = 0.

  for word in words:
    if word in vocabulary:
      nwords = nwords + 1
      feature_vector = np.add(feature_vector, model[word])
    if nwords:
      feature_vector = np.divide(feature_vector, nwords)
  return feature_vector

def average_word_vectorizer (corpus, model, num_features):
  vocabulary = set(model.wv.index2word)
  feature = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]
  return np.array(feature)

In [66]:
w2v_feature_array = average_word_vectorizer(corpus=tokenized_corpus, model=w2v_model, num_features=feature_size)
pd.DataFrame(w2v_feature_array)

  


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.00352,-0.002166,-0.002963,-0.022464,0.005675,-0.003831,-0.016757,-0.004159,0.013609,-0.007229
1,0.006123,0.003369,-0.012533,-0.014533,-0.00534,0.003287,0.002636,-0.000718,0.007074,-0.003025
2,-0.005798,0.007539,0.001904,-0.005705,0.006478,0.003021,0.003698,-0.000111,-0.003761,-0.006264
3,-0.000494,-1.4e-05,0.007365,-0.003511,-0.003488,-0.005263,8.3e-05,0.005952,-0.008556,0.000322
4,0.000309,-0.003349,-0.000498,0.005711,-0.003899,0.006712,-0.006397,0.003062,0.000819,0.002763
5,-0.005299,-0.006875,0.004151,-0.002624,0.009781,0.006046,0.010465,0.009582,0.003528,0.008465


In [67]:
from sklearn import cluster
from sklearn.cluster import AffinityPropagation

ap = AffinityPropagation()
ap.fit(w2v_feature_array)
cluster_labels = ap.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClasterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClasterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,1
2,The quick brown fox jumps over the lazy dog.,animal,2
3,The brown fox is quick and the blue dog is lazy!,animal,2
4,The sky es very blue and the sky is very beaut...,weather,2
5,The dog is lazy but the brown fox is quick!,animal,3
