In [1]:
import pickle
import re
import string

import gensim
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from gensim.test.utils import get_tmpfile
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from skimage import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from scipy import sparse

# Fetch the NLTK resources BEFORE anything reads them. Previously the stop-word
# list was loaded ahead of nltk.download('stopwords'), which raises LookupError
# on a machine where the corpus has not been downloaded yet.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Shared objects used by the text-cleaning helpers defined further down.
stop_words = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ac\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# !unzip Articles_full.zip
# print ('uzipping done')

In [2]:
import os

# Show what sits next to the notebook; the following cells read the
# 'Articles_full' folder from the current working directory.
os.listdir('.')

['.ipynb_checkpoints',
 'Articles_full',
 'pre_processed.csv',
 'train_model.ipynb']

In [3]:
import os

import glob2
import numpy as np

# Folder (inside the current working directory) holding the article .txt files.
folder_name = 'Articles_full'

# Seed NumPy's global random state once, up front.
np.random.seed(1)


In [4]:
# Collect every .txt article inside `folder_name` and read each one into memory.
data_files = glob2.glob(os.getcwd()+'/'+folder_name+"/*.txt")
content_list = []

for article_path in data_files:
    # The context manager closes each handle as soon as the file is read;
    # the previous version opened one handle per article and never closed it.
    with open(article_path, encoding='utf8') as article_file:
        content_list.append(article_file.read())

In [5]:
# One row per article. The text lands in the default integer column 0,
# which the cleaning cell reads back as df[0].
df = pd.DataFrame(data=content_list)

In [6]:
# Label each row with its source file name (directory part stripped),
# in the same order as data_files.
df['title'] = list(map(os.path.basename, data_files))

# Load Data

In [7]:
# # Load description features
# df = pd.read_pickle("data/perfume_data.pkl")
# print (df.shape)
# df.head(3)

## Clean text

In [8]:
def lemmatize(text):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    The text is split with ``word_tokenize`` and each token is passed through
    ``wordnet_lemmatizer.lemmatize`` with its default arguments.
    """
    return " ".join(
        wordnet_lemmatizer.lemmatize(token) for token in word_tokenize(text)
    )

def make_lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The text is split with ``word_tokenize``; the surviving tokens are joined
    back together with single spaces. Matching is exact, so the caller is
    expected to lower-case the text first if the stop-word list is lower-case.
    """
    # Build the lookup once per call: testing each token against the
    # module-level sequence directly rescans the whole list for every token.
    stop_word_set = set(stop_words)
    kept = [token for token in word_tokenize(text) if token not in stop_word_set]
    return " ".join(kept)

def remove_punctuation(text):
    """Return ``text`` without punctuation-only and digit-only tokens.

    The text is split with ``word_tokenize``. A token is dropped when every
    character in it is in ``string.punctuation`` or when ``str.isdigit()`` is
    true for it; the remaining tokens are joined with single spaces.
    """
    punctuation_chars = set(string.punctuation)
    kept = []
    for token in word_tokenize(text):
        # Check character by character. The earlier `token in string.punctuation`
        # was a substring test against one long string, so multi-character
        # punctuation tokens such as "``", "''", "..." or "--" were kept.
        if all(char in punctuation_chars for char in token):
            continue
        if token.isdigit():
            continue
        kept.append(token)
    return " ".join(kept)

In [9]:
# Run the cleaning steps in order on the raw text column (df[0]) and store
# the result as 'description'.
cleaned = df[0]
for cleaning_step in (make_lower_case, remove_stop_words, remove_punctuation, lemmatize):
    cleaned = cleaned.apply(func=cleaning_step)
df['description'] = cleaned

In [10]:
# Persist the frame (raw text, title, cleaned description) next to the
# notebook; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='pre_processed.csv', index=False)

In [11]:
# df['notes'] = df.notes.apply(func=make_lower_case)
# df['notes'] = df.notes.apply(func=remove_punctuation)
# df['notes'] = df.notes.apply(func=stem_words)

In [12]:
# df['reviews'] = df.reviews.apply(func=make_lower_case)
# df['reviews'] = df.reviews.apply(func=remove_stop_words)
# df['reviews'] = df.reviews.apply(func=remove_punctuation)
# df['reviews'] = df.reviews.apply(func=stem_words)

# TF-IDF Model

In [13]:
#df['full_document'] = df['description'] + ' ' + df['notes'] + ' ' + df['reviews']

In [14]:
# Fit TF-IDF on the cleaned article text.
# Vocabulary: unigrams and bigrams that appear in at least 4 documents, with
# scikit-learn's built-in English stop-word list applied.
tf = TfidfVectorizer(analyzer='word',
                     min_df=4,
                     ngram_range=(1, 2),
                     #max_features=1000,
                     stop_words='english')

# Learn the vocabulary/IDF weights and build the document-term matrix in one
# pass. Calling fit() and then transform() on the same data gives the same
# matrix but tokenises the whole corpus twice.
tfidf_matrix = tf.fit_transform(df['description'])

In [15]:
# Make sure the output folder for the pickled models exists.
# exist_ok=True keeps the cell re-runnable without a separate isdir() check
# (and without the gap between checking and creating).
os.makedirs(os.path.join(os.getcwd(), 'models'), exist_ok=True)

In [16]:

# Persist the fitted vectorizer. The `with` block flushes and closes the file;
# passing a bare open(...) to pickle.dump left the handle open.
with open(os.getcwd()+"/models/tfidf_model.pkl", "wb") as model_file:
    pickle.dump(tf, model_file)

print(tfidf_matrix.shape)

(3361, 70686)


In [17]:
# Compress with SVD
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF matrix onto 500 latent components.
svd = TruncatedSVD(n_components=500)
latent_matrix = svd.fit_transform(tfidf_matrix)

# Persist the fitted reducer. The `with` block flushes and closes the file;
# passing a bare open(...) to pickle.dump left the handle open.
with open(os.getcwd()+"/models/svd_model.pkl", "wb") as model_file:
    pickle.dump(svd, model_file)

print(latent_matrix.shape)

(3361, 500)


In [18]:
# Preview the first four rows: raw text (column 0), title and cleaned description.
df.head(n=4)

Unnamed: 0,0,title,description
0,\n\nORIGINAL ARTICLE Year : 2018 | Volume : 1 ...,Article10830.txt,original article year volume issue page 25-29 ...
1,"Filed on July 29, 2018 | Last updated on July ...",Article10833.txt,filed july last updated july 07.21 health expe...
2,Health State of UAE\n\nThe factors that affect...,Article10835.txt,health state uae factor affect health lifestyl...
3,Dubai: Doctors from Dubai Health Authority (DH...,Article10836.txt,dubai doctor dubai health authority dha discus...


In [19]:
n = 25  # number of leading SVD components kept as the LSA embedding
# Use an elbow / cumulative explained-variance plot to pick the number of
# components; a high amount of variance should be explained.
doc_labels = df.title
svd_feature_matrix = pd.DataFrame(latent_matrix[:, :n], index=doc_labels)
print(svd_feature_matrix.shape)

# Persist the embeddings. The `with` block flushes and closes the file;
# passing a bare open(...) to pickle.dump left the handle open.
# (A mid-cell `svd_feature_matrix.head()` whose result was discarded has been dropped.)
with open(os.getcwd()+"/models/lsa_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(svd_feature_matrix, embeddings_file)

(3361, 25)


# Doc2Vec Model

Doc2Vec preserves word order in the embeddings, so "I hate rose" and "I love rose" will be treated differently.

In [20]:
# Build the Doc2Vec corpus from the cleaned descriptions: one token list per row of df.
descriptions = df.description.values.tolist()

# Replace every non-word character with a space, then split on whitespace.
# The pattern is a raw string so that \w reaches the regex engine untouched;
# the non-raw "[^\w]" relied on an invalid string escape, which newer Python
# versions warn about.
documents = [
    re.sub(r"[^\w]", " ", description).split()
    for description in descriptions
]

In [21]:
# Sanity check: the number of token lists should equal the number of rows,
# so both printed values are expected to match.
for collection in (df, documents):
    print(len(collection))

3361
3361


In [22]:
# Spot-check a single article: raw text, title and cleaned description.
sample_label = 278
df.loc[sample_label]

0              [first paragraph of article]\n\nKawasaki disea...
title                                           Article11352.txt
description    first paragraph article kawasaki disease kd hy...
Name: 278, dtype: object

In [23]:
# Tag every token list with its position in `documents`; that integer is the
# document's tag inside the Doc2Vec model.
TaggedDocument = gensim.models.doc2vec.TaggedDocument
formatted_documents = []
for doc_id, tokens in enumerate(documents):
    formatted_documents.append(TaggedDocument(tokens, [doc_id]))

# Hyper-parameters for the model: 25-dimensional vectors, words seen fewer
# than 5 times ignored, context window of 3, dm=1, fixed seed.
doc2vec_params = dict(vector_size=25, min_count=5, epochs=200, seed=0, window=3, dm=1)
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)
model.build_vocab(formatted_documents)

In [24]:
# Train on the tagged corpus for the number of epochs configured on the model.
# %time reports how long the call took (the saved run shows roughly 9.5 minutes).
%time model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 9min 27s


In [25]:
# Save the trained model under ./models and reload it from the same path to
# confirm that the file on disk is usable. One path variable replaces the two
# differently spelled paths, and the unused temp-file name `fname` is dropped.
doc2vec_path = os.getcwd()+"/models/doc2vec_model"
model.save(doc2vec_path)
model = gensim.models.doc2vec.Doc2Vec.load(doc2vec_path)

In [26]:
# Smoke test: infer an embedding for a short token list with the reloaded model.
test_tokens = "this is a test".split()
vector = model.infer_vector(doc_words=test_tokens, epochs=50)
vector

array([-0.01667839,  0.50377184, -0.60562116,  0.8766946 ,  0.25604793,
       -0.11461622, -0.24817497,  0.25780392,  0.04501468,  0.11840003,
       -0.67256427, -0.07016043, -1.4059721 , -0.20856468,  0.8309469 ,
       -0.80398434,  0.46019238, -0.4275276 , -0.27626002,  0.17008889,
       -0.00263448,  0.14841184, -0.01025613, -0.3984194 ,  0.24076329],
      dtype=float32)

In [27]:
# One row per article, indexed by title, holding the document vectors learned by Doc2Vec.
doctovec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs, index=df.title)
print(doctovec_feature_matrix.shape)

# Persist the embeddings. The `with` block flushes and closes the file;
# passing a bare open(...) to pickle.dump left the handle open.
# (A mid-cell `.head(3)` whose result was discarded has been dropped.)
with open(os.getcwd()+"/models/doctovec_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(doctovec_feature_matrix, embeddings_file)

(3361, 25)


In [0]:
# !zip -r /content/models.zip /content/models

  adding: content/models/ (stored 0%)
  adding: content/models/doc2vec_model (deflated 32%)
  adding: content/models/doctovec_embeddings.pkl (deflated 22%)
  adding: content/models/tfidf_model.pkl (deflated 59%)
  adding: content/models/lsa_embeddings.pkl (deflated 13%)
  adding: content/models/svd_model.pkl (deflated 5%)


In [36]:
import os

import boto3

# SECURITY: credentials must never be stored in the notebook. They are read
# from the standard AWS environment variables instead. When a variable is not
# set, None is passed and boto3 falls back to its default credential lookup
# (shared credentials file, instance role, ...).
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and must be revoked/rotated in AWS IAM.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
bucket = 'trainingerrortracker'

s3 = boto3.client("s3",
                  aws_access_key_id=aws_access_key_id,
                  aws_secret_access_key=aws_secret_access_key)

In [39]:
# List the objects in the bucket. The response has no 'Contents' key when the
# bucket holds no objects, so default to an empty list rather than raising KeyError.
s3.list_objects(Bucket=bucket).get('Contents', [])

EndpointConnectionError: Could not connect to the endpoint URL: "https://trainingerrortracker.s3.ap-south-1.amazonaws.com/?encoding-type=url"

In [46]:
# Use a separate name for the resource-style handle. Rebinding `s3` (the
# client created earlier) to a resource object made the list_objects cell
# fail when re-run after this one.
s3_resource = boto3.resource('s3',
                             aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
model_bucket = 'inputdatatraining'

# Download the LSA embeddings into the current working directory.
s3_resource.Bucket(model_bucket).download_file('lsa_embeddings.pkl', 'lsa_embeddings.pkl')

In [44]:
# List the working directory (explicit line magic instead of relying on automagic).
%ls

 Volume in drive C is KOCWX10651
 Volume Serial Number is 1E93-24E8

 Directory of C:\Users\ac\Downloads\model_training\Deploy_model

04/17/2019  10:27 AM    <DIR>          .
04/17/2019  10:27 AM    <DIR>          ..
04/12/2019  02:53 PM    <DIR>          .ipynb_checkpoints
04/12/2019  04:17 PM    <DIR>          Articles_full
04/16/2019  12:00 PM    <DIR>          models
04/16/2019  11:49 AM        39,301,692 pre_processed.csv
04/17/2019  10:27 AM            72,925 train_model.ipynb
               2 File(s)     39,374,617 bytes
               5 Dir(s)  71,900,450,816 bytes free
