# Task1:
## Process and examine the Google Chrome bookmarks JSON file

## 1.1: Copy the Google Chrome bookmarks file into the `data` subfolder of the working folder

In [17]:
# Copy the google chrome bookmarks file into the current folder data subfolder

import datetime
import sys
from os import curdir, environ, makedirs, path
from platform import system
from shutil import copyfile
        
def convertChrometime(dtms):
    """Convert a Chrome/WebKit timestamp to a naive ``datetime``.

    Chrome timestamps count microseconds elapsed since 1601-01-01 00:00:00.

    Parameters
    ----------
    dtms : int or str
        Microseconds since 1601-01-01. The bookmarks JSON stores these values
        as strings (e.g. ``"13109116088391906"``), so numeric strings are
        accepted and converted with ``int()``.

    Returns
    -------
    datetime.datetime
        The corresponding naive datetime.

    Raises
    ------
    ValueError
        If ``dtms`` is a string that does not represent an integer.
    """
    if isinstance(dtms, str):
        dtms = int(dtms)
    # Split into days / seconds / microseconds so each timedelta field stays small.
    seconds, micros = divmod(dtms, 1000000)
    days, seconds = divmod(seconds, 86400)
    return datetime.datetime(1601, 1, 1) + datetime.timedelta(days, seconds, micros)
#print( convertChrometime(13024882639633631).strftime( '%a, %d %B %Y %H:%M:%S %Z' ) )

def get_chrome_bookmarks_path(syst):
    """Return the path of the default-profile Chrome ``Bookmarks`` file.

    Parameters
    ----------
    syst : str
        Platform name, as returned by ``platform.system()``.

    Returns
    -------
    str
        Path to the Bookmarks file. For platforms other than "Darwin",
        "Linux" and "Windows" the path is requested interactively.

    Raises
    ------
    SystemExit
        If no path could be determined (blank answer at the prompt).
    KeyError
        On Windows, if the ``LOCALAPPDATA`` environment variable is not set.
    """
    if syst == "Darwin":
        chrome_bookmarks = path.expanduser("~/Library/Application Support/Google/Chrome/Default/Bookmarks")
    elif syst == "Linux":
        chrome_bookmarks = path.expanduser("~/.config/google-chrome/Default/Bookmarks")
    elif syst == "Windows":
        chrome_bookmarks = environ["LOCALAPPDATA"] + r"\Google\Chrome\User Data\Default\Bookmarks"
    else:
        print('Your system ("{}") is not handled. File path requested.'.format(syst))
        # Strip so that a whitespace-only answer counts as "no path given".
        chrome_bookmarks = input("Please provide the full path to Chrome Bookmarks file").strip()

    if not chrome_bookmarks:
        # sys.exit is always available; the bare `exit` helper is injected by
        # the `site` module and is not meant for use in programs.
        sys.exit(1)
    return chrome_bookmarks

# Locate the live Chrome bookmarks file for the current platform.
BKM_master = get_chrome_bookmarks_path(system())

# Working copy lives in the `data` subfolder of the current folder.
BKM_file = "Bookmarks.json"
BKM_copy = path.join(curdir, "data", BKM_file)

# copyfile() does not create missing directories, so make sure the target
# folder exists before copying.
makedirs(path.dirname(BKM_copy), exist_ok=True)
copyfile(BKM_master, BKM_copy)

'.\\data\\Bookmarks.json'

## 1.2 Process the file

### The bookmarks file has a deeply nested structure even before addition of subfolders:
This turns out to be a problem on its own: to my knowledge, there is no straightforward,
generic way to load a JSON file into a pandas DataFrame; one has to know the structure and the key labels.

The following command lines use the line indentation (3 characters per level) to get some sense of the structure:

Level 1:
```
> grep -E '^ {3}"' Bookmarks.json
   "checksum": "fbefde58fad3cc708c0beffbbfdfb0a7",
   "roots": {
   "version": 1
```

Level 2:
```
> grep -E '^ {6}"' Bookmarks.json
      "bookmark_bar": {
      "other": {                                     **# "Other bookmarks" is in there**
      "sync_transaction_version": "1320",
      "synced": {
```

Level 3:
```      
> grep -E '^ {9}"' Bookmarks.json
         "children": [ {
         "date_added": "13109116088391906",
         "date_modified": "13169923254937857",
         "id": "1",
         "name": "Bookmarks bar",
         "type": "folder"
         "children": [ {
         "date_added": "13109116088391910",
         "date_modified": "13163607256316120",
         "id": "2",
         "name": "Other bookmarks",                  **# Found "Other bookmarks": children needed**
         "type": "folder"
         "children": [ {
         "date_added": "13109116088391912",
         "date_modified": "13145377539948572",
         "id": "3",
         "name": "Mobile bookmarks",
         "type": "folder"
```

What are the values under 'Type'?
```             
> grep -e 'type\"' Bookmarks.json| tr -s \ | sort | uniq
 "type": "folder"
 "type": "url",
```
Nice to know!


In [23]:
# Key layout of the bookmarks JSON:
#   Level 1: data.keys()                   -> ['version', 'roots', 'checksum']
#   Level 2: data['roots'].keys()          -> ['sync_transaction_version', 'bookmark_bar', 'other', 'synced']
#   Level 3: data['roots']['other'].keys() -> ['date_added', 'date_modified', 'id', 'type', 'children', 'name']
#   Level 4: ['date_added', 'date_modified', 'id', 'type', 'children', 'name']
#
# NOTE(review): `data` is assigned by the json-loading cell that appears later
# in this file; that cell has to be run before this one.

other_children = data['roots']['other']['children']
subf = len(other_children)
#print(subf)

# Path prefix for everything below 'Other bookmarks'. The previous
# ''.join('Other bookmarks') followed by location.join('|') evaluated to '|'
# only, because str.join treats the receiver as the separator.
location = 'Other bookmarks' + '|'

# Difference in the keys between a folder and a bookmark:
#   FLD: ['sync_transaction_version', 'date_modified', 'type', 'date_added', 'id', 'name', 'children']
#   BKM: ['sync_transaction_version', 'meta_info', 'url', 'type', 'date_added', 'id', 'name']
for other_level in other_children:
    label = other_level['name']
    # Only folders carry a 'children' key, so its presence tells them apart.
    if 'children' in other_level:
        template = "FLD {} {}\n{}"  # 'Other bookmarks' subfolder name
    else:
        template = "BKM {} {}\n{}"  # 'Other bookmarks' bookmark item name
    print(template.format(label, other_level['date_added'], other_level.keys()))

FLD Perso 13109174777723357
dict_keys(['sync_transaction_version', 'children', 'id', 'date_added', 'date_modified', 'name', 'type'])
FLD News 13109174777705078
dict_keys(['sync_transaction_version', 'children', 'id', 'date_added', 'date_modified', 'name', 'type'])
FLD Accounts 13151775038046138
dict_keys(['sync_transaction_version', 'children', 'id', 'date_added', 'date_modified', 'name', 'type'])
FLD CUNY 13109116258767161
dict_keys(['sync_transaction_version', 'children', 'id', 'date_added', 'date_modified', 'name', 'type'])
FLD Resources & Tools 13109116258767360
dict_keys(['sync_transaction_version', 'children', 'id', 'date_added', 'date_modified', 'name', 'type'])
FLD Courses & Textbooks 13109116258767557
dict_keys(['sync_transaction_version', 'children', 'id', 'date_added', 'date_modified', 'name', 'type'])
FLD Science sites & tools 13109116258768141
dict_keys(['sync_transaction_version', 'children', 'id', 'date_added', 'date_modified', 'name', 'type'])
FLD Misc 13109116258768527

### JSON 'whole upload' testing:

- Need to keep track of folder/subfolder names as this will be one of the classification features to be tested
- Ideally, I want the entire contents of the 'Other bookmarks' folder into a df & only then proceed with the data cleaning and selection.

**From my search in various programming spheres, finding a solution to this particular problem
would help a lot of people!**

Getting closer!

In [18]:
import json

# Parse the working copy of the bookmarks file into nested Python objects.
with open(BKM_copy, encoding='utf-8') as bookmarks_file:
    data = json.load(bookmarks_file)

In [4]:
import pandas as pd
from pandas.io.json import json_normalize

In [30]:
# One row per direct child of the 'Other bookmarks' folder.
df1 = json_normalize(
    data['roots']['other']['children'],
    meta=[
        ['children', 'type'],
        ['name', 'url', 'id', 'meta_info', 'date_added', 'type'],
    ],
)

In [31]:
# Display at most the first 20 rows of df1.
df1.head(20)

Unnamed: 0,children,date_added,date_modified,id,meta_info,name,sync_transaction_version,type,url
0,"[{'sync_transaction_version': '1', 'children':...",13109174777723357,1.3117039265083786e+16,1203,,Perso,1,folder,
1,"[{'sync_transaction_version': '1', 'name': '10...",13109174777705078,1.317009776574367e+16,1137,,News,1,folder,
2,"[{'sync_transaction_version': '1', 'name': 'Vi...",13151775038046138,1.3163106408774756e+16,3688,,Accounts,1,folder,
3,"[{'sync_transaction_version': '1', 'children':...",13109116258767161,1.3167924695837332e+16,8,,CUNY,1,folder,
4,"[{'sync_transaction_version': '1', 'name': '10...",13109116258767360,1.316957177775262e+16,9,,Resources & Tools,1,folder,
5,"[{'sync_transaction_version': '1', 'name': 'Bi...",13109116258767557,1.316475019069891e+16,10,,Courses & Textbooks,1,folder,
6,"[{'sync_transaction_version': '1', 'children':...",13109116258768141,1.3169487040373304e+16,13,,Science sites & tools,1,folder,
7,"[{'sync_transaction_version': '1', 'children':...",13109116258768527,1.3146764550541136e+16,15,,Misc,1,folder,
8,"[{'sync_transaction_version': '1', 'children':...",13109174778060091,1.3167438541781746e+16,1617,,Progg,1,folder,
9,"[{'sync_transaction_version': '1', 'children':...",13109174777774261,1.316862477252592e+16,1399,,Jobs,1,folder,


### other tests

In [None]:
class DictQuery(dict):
    """A ``dict`` whose ``get`` accepts a "/"-separated path into nested data.

    Example: ``DictQuery({"animal": {"type": "cat"}}).get("animal/type")``
    returns ``"cat"``.
    """

    @staticmethod
    def _lookup(node, key, default):
        """Return ``node.get(key, default)``, or ``default`` if ``node`` has no ``get``."""
        getter = getattr(node, "get", None)
        return getter(key, default) if callable(getter) else default

    def get(self, path, default=None):
        """Resolve ``path`` step by step and return the value found.

        Parameters
        ----------
        path : str
            Keys separated by "/". The first key is looked up in this dict,
            each following key in the value found so far.
        default : object, optional
            Returned for keys that are missing, and for steps that would have
            to descend into a value that is not a mapping.

        Returns
        -------
        object
            The resolved value. When a step reaches a list, the next key is
            looked up in every element and a list of results is returned
            (falsy elements map to ``None``). Resolution stops early as soon
            as an intermediate value is falsy, and that value is returned.
        """
        keys = path.split("/")
        val = dict.get(self, keys[0], default)

        for key in keys[1:]:
            if not val:
                # Nothing to descend into (missing key, None, empty container).
                break
            if isinstance(val, list):
                val = [self._lookup(v, key, default) if v else None for v in val]
            else:
                # A scalar leaf with path components left over yields `default`
                # instead of raising AttributeError.
                val = self._lookup(val, key, default)

        return val
    
            
#Now you can do this:
# NOTE(review): `animals` is not defined anywhere in the visible notebook;
# it is presumably a list of nested dicts - confirm before running.
for item in animals:
    # print() call: the Python 2 print statement is a SyntaxError in Python 3.
    print(DictQuery(item).get("animal/type"))

In [None]:
def walk_json(tree, path=None):
    """Yield one flat ``[key, ..., leaf]`` list for every leaf of a nested JSON value.

    Parameters
    ----------
    tree : object
        Parsed JSON: mappings, lists/tuples and scalar leaves, arbitrarily nested.
    path : list, optional
        Keys already traversed; used by the recursion. Defaults to an empty path.

    Yields
    ------
    list
        The keys leading to a leaf followed by the leaf value itself. Mapping
        keys appear as-is; list/tuple elements contribute their integer index.
        Empty mappings and empty lists produce no rows.
    """
    path = [] if path is None else path

    if hasattr(tree, "items"):
        children = tree.items()
    elif isinstance(tree, (list, tuple)):
        # Descend into sequences too; otherwise a list such as a folder's
        # 'children' would come out as one single, still-nested leaf.
        children = enumerate(tree)
    else:
        yield path + [tree]
        return

    for key, child in children:
        yield from walk_json(child, path + [key])
        
# Materialise everything walk_json yields for the children of 'Other bookmarks'.
list(walk_json(data['roots']['other']['children']))

In [34]:
#https://stackoverflow.com/questions/45168524/deeply-nested-json-response-to-pandas-dataframe

def dict_generator(indict, pre=None):
    """Flatten a nested dict into ``[key, ..., value]`` rows.

    Parameters
    ----------
    indict : object
        Usually a dict; nested dicts, lists and tuples are traversed.
    pre : list, optional
        Keys of the enclosing levels; used by the recursion.

    Yields
    ------
    list or object
        For every leaf, the list of keys leading to it followed by the value.
        List/tuple elements share the key of the list that holds them. A
        non-dict passed in without any prefix is yielded unchanged.
    """
    pre = pre[:] if pre else []
    if isinstance(indict, dict):
        for key, value in indict.items():
            if isinstance(value, dict):
                yield from dict_generator(value, pre + [key])
            elif isinstance(value, (list, tuple)):
                for v in value:
                    # Keep the ancestors: passing only [key] here would drop
                    # every key above the list from the generated rows.
                    yield from dict_generator(v, pre + [key])
            else:
                yield pre + [key, value]
    elif pre:
        # Scalar element of a list: emit it with its key path like any other leaf.
        yield pre + [indict]
    else:
        yield indict

In [None]:
# Data-loading snippet kept for reference:
#   new_string = []
#   for line in session.execute('select json ....'):
#       new_string.append(json.loads(line.json))

# NOTE(review): the loop below expects `data` to be an iterable of records
# with 'ID' and 'profile' keys; the bookmarks `data` loaded earlier is indexed
# like a dict (data['roots']...) - confirm which input is intended.

cols = ['ID', 'criteria', 'type', 'name', 'value']

rows = []
for line in data:
    data_id = line['ID']
    criteria = line['profile']['criteria']
    # enumerate() gives the true 1-based position; list.index() would return
    # the first match for duplicate entries and rescans the list every time.
    for position, d in enumerate(criteria, start=1):
        rows.append([data_id, position, *list(d.values())[:-1]])

df = pd.DataFrame(rows, columns=cols)

In [35]:
# Hand-made nested sample used to experiment with json_normalize:
# a region header followed by a 'Members' list alternating between a country
# entry and that country's 'components' (states with governor and counties).
raw = [
    {'countries': 'North Hemisphere'},
    {
        'Members': [
            {'country': 'USA'},
            {
                'components': [
                    {
                        'state': 'Florida',
                        'shortname': 'FL',
                        'info': {'governor': 'Rick Scott'},
                        'counties': [
                            {'name': 'Dade', 'population': 12345},
                            {'name': 'Broward', 'population': 40000},
                            {'name': 'Palm Beach', 'population': 60000},
                        ],
                    },
                    {
                        'state': 'Ohio',
                        'shortname': 'OH',
                        'info': {'governor': 'John Kasich'},
                        'counties': [
                            {'name': 'Summit', 'population': 1234},
                            {'name': 'Cuyahoga', 'population': 1337},
                        ],
                    },
                ],
            },
            {'country': 'France'},
            {
                'components': [
                    {
                        'state': 'Bretagne',
                        'shortname': 'bre',
                        'info': {'governor': 'Rick Marcel'},
                        'counties': [
                            {'name': 'villeun', 'population': 12345},
                            {'name': 'deux', 'population': 40000},
                            {'name': 'trois', 'population': 60000},
                        ],
                    },
                    {
                        'state': 'Bourgogne',
                        'shortname': 'bou',
                        'info': {'governor': 'Martin Kasich'},
                        'counties': [
                            {'name': 'bou1', 'population': 1234},
                            {'name': 'bou2', 'population': 1337},
                        ],
                    },
                ],
            },
        ],
    },
]

#result = json_normalize(raw, record_path=[['components']], meta=['state', 'shortname', ['info', 'governor']])
#result = json_normalize(raw, record_path=['Members']) #, record_path=['Members'], meta=['components', 'state', 'shortname', ['info', 'governor']])
#, [['_source', 'authors']], ['_id', ['_source', 'journal'], ['_source', 'title']])

In [23]:
# Build a frame from the sample records and show the table schema pandas derives for it.
df = pd.DataFrame(raw)
schem = pd.io.json.build_table_schema(df)
schem

{'fields': [{'name': 'index', 'type': 'integer'},
  {'name': 'Members', 'type': 'string'},
  {'name': 'countries', 'type': 'string'}],
 'pandas_version': '0.20.0',
 'primaryKey': ['index']}

In [77]:
from collections import deque

def generate_children(tree):
    """Walk a bookmark tree and describe every node's place in it.

    Parameters
    ----------
    tree : dict
        Root node. Every node needs an ``'id'`` key; folders additionally
        hold their sub-nodes in a ``'children'`` list.

    Yields
    ------
    tuple
        ``(node_id, parent_id, child_ids)`` for every node, with
        ``parent_id`` being ``None`` for the root. Nodes are taken from the
        end of the pending list, so the most recently added child is visited
        first.
    """
    pending = deque()
    pending.append((tree, None))

    while pending:
        node, parent = pending.pop()
        child_ids = []
        # Leaf entries (plain bookmarks) have no 'children' key at all.
        for child in node.get('children') or []:
            pending.append((child, node['id']))
            child_ids.append(child['id'])
        yield node['id'], parent, child_ids

In [None]:
# A __future__ import has to be the first statement of the cell; anywhere
# else it is a SyntaxError.
from __future__ import print_function

import matplotlib.pyplot as plt  # was misspelled "matlplotlib"
import numpy as np
import scipy as sp
import tensorflow as tf

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Draw an edge around histogram bars and other patches.
plt.rcParams["patch.force_edgecolor"] = True

In [None]:
# Blog post: Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK.
#            https://github.com/javedsha/text-classification
#
# coding: utf-8

# In[1]:
#Loading the data set - training data.

# --- Step 1: training split of the 20 newsgroups corpus ---------------------
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

# --- Step 2: category names --------------------------------------------------
twenty_train.target_names

# --- Step 3: first three lines of the first training document ----------------
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

# --- Step 4: bag-of-words counts ---------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

# --- Step 5: re-weight the counts with TF-IDF --------------------------------
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

# --- Step 6: Naive Bayes (NB) classifier on the TF-IDF features --------------
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# --- Step 7: the same three stages as a single pipeline ----------------------
# The step names 'vect', 'tfidf' and 'clf' are arbitrary, but the grid-search
# parameter names further down refer to them.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)

# --- Step 8: accuracy of the NB pipeline on the test split -------------------
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)


# In[9]:
# Training Support Vector Machines - SVM and calculating its performance
from sklearn.linear_model import SGDClassifier

# `n_iter` is no longer accepted by SGDClassifier; `max_iter=5` together with
# `tol=None` (no early stopping) keeps the fixed five passes over the data.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              max_iter=5,
                              tol=None,
                              random_state=42)),
])

text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)


# --- Step 10: parameter grid for the NB pipeline ------------------------------
# Each key is '<pipeline step name>__<parameter>'; e.g. vect__ngram_range lets
# the search choose between unigrams only and unigrams + bigrams.
from sklearn.model_selection import GridSearchCV

parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

# --- Step 11: run the search; n_jobs=-1 uses every available core -------------
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1).fit(
    twenty_train.data, twenty_train.target
)

# Best mean cross-validated score and the parameters that produced it.
gs_clf.best_score_
gs_clf.best_params_

# Expected outcome for the above:
#  accuracy rises to ~90.6% for the NB classifier, with parameters
#  {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}.

# --- Step 12: same search for the SVM pipeline ---------------------------------
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1).fit(
    twenty_train.data, twenty_train.target
)

gs_clf_svm.best_score_
gs_clf_svm.best_params_


# --- Step 13: NLTK --------------------------------------------------------------

# --- Step 14: NB pipeline with English stop words removed -----------------------
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# --- Step 15: stemming -----------------------------------------------------------
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token goes through ``stemmer``."""
        analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(w) for w in analyzer(doc)]

        return stemmed_analyzer


stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
]).fit(twenty_train.data, twenty_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

np.mean(predicted_mnb_stemmed == twenty_test.target)



In [None]:
from __future__ import print_function

# Create simple data: Try to differentiate the two first classes of the iris data
import numpy as np

from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt
plt.rcParams["patch.force_edgecolor"] = True


# [ A ] In binary classification settings ********************************
# Fit a linear SVM on the first two iris classes (plus noise features) and
# plot its precision-recall curve on the current pyplot figure.

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Add noisy features
# The same RandomState instance is reused for the noise, the split and the
# classifier below, so the order of these statements affects the numbers drawn.
random_state = np.random.RandomState(0)

n_samples, n_features = X.shape
# Append 200 * n_features columns of standard-normal noise to the features.
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Limit to the two first classes, and split into training and test
X_train, X_test, y_train, y_test = train_test_split(X[y < 2], y[y < 2],
                                                    test_size=0.5,
                                                    random_state=random_state)

# Create a simple classifier
classifier = svm.LinearSVC(random_state=random_state)
classifier.fit(X_train, y_train)

# Decision-function values are used as ranking scores for the PR computations.
y_score = classifier.decision_function(X_test)

# Compute the average precision score
average_precision = average_precision_score(y_test, y_score)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))


# Plot the Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_score)

# Step line plus a shaded area under it, both drawn on the current figure.
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class PR curve: mean P={0:0.2f}'.format(average_precision))


# [ B ] In multi-label classification settings ********************************
# Binarize the three iris classes, fit a one-vs-rest linear SVM, and plot the
# micro-averaged and per-class precision-recall curves.
# Relies on X, y, random_state, svm, np, plt, train_test_split and the two
# metric functions already being defined earlier in this cell.

from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
#from sklearn.metrics import precision_recall_curve
#from sklearn.metrics import average_precision_score

# Create multi-label data, fit, and predict

# We create a multi-label dataset, to illustrate PR in multi-label settings:
# Use label_binarize to be multi-label like settings

Y = label_binarize(y, classes=[0, 1, 2])
n_classes = Y.shape[1]

# Split into training and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5,
                                                    random_state=random_state)

# We use OneVsRestClassifier for multi-label prediction
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=random_state))
classifier.fit(X_train, Y_train)

y_score = classifier.decision_function(X_test)


# The average precision score in multi-label settings

# From here on precision / recall / average_precision are dicts keyed by the
# class index (and later by "micro").
precision = dict()
recall = dict()
average_precision = dict()

for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y_test[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])


# A "micro-average": quantifying score on all classes jointly:
precision["micro"], recall["micro"], _ = precision_recall_curve(Y_test.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(Y_test, y_score, average="micro")

print('Average precision score, micro-averaged over all classes: {0:0.2f}'
      .format(average_precision["micro"]))

# Plot the micro-averaged Precision-Recall curve:
# plt.figure() opens a new figure for the calls that follow.
plt.figure()
plt.step(recall['micro'], precision['micro'], color='b', alpha=0.2, where='post')
plt.fill_between(recall["micro"], precision["micro"], step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Aver. precision score, micro-averaged over all classes: mean P={0:0.2f}'
          .format(average_precision["micro"]))


# Plot Precision-Recall curve for each class and iso-f1 curves

from itertools import cycle

# setup plot details
colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])

plt.figure(figsize=(7, 8))
f_scores = np.linspace(0.2, 0.8, num=4)
lines = []
labels = []
for f_score in f_scores:
    x = np.linspace(0.01, 1)
    # NOTE: this rebinds `y`, which held the iris labels up to this point.
    y = f_score * x / (2 * x - f_score)
    # Only the part of the curve with non-negative y is drawn.
    l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
    plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))

# Outside the loop: only the last iso-f1 line handle is registered, giving a
# single legend entry for all iso-f1 curves.
lines.append(l)
labels.append('iso-f1 curves')
l, = plt.plot(recall["micro"], precision["micro"], color='gold', lw=2)
lines.append(l)
labels.append('micro-average Precision-recall (area = {0:0.2f})'
              ''.format(average_precision["micro"]))

# zip() stops after n_classes items, so the endless color cycle is safe here.
for i, color in zip(range(n_classes), colors):
    l, = plt.plot(recall[i], precision[i], color=color, lw=2)
    lines.append(l)
    labels.append('Precision-recall for class {0} (area = {1:0.2f})'
                  ''.format(i, average_precision[i]))

fig = plt.gcf()
# Extra bottom margin for the legend, which is placed below the axes.
fig.subplots_adjust(bottom=0.25)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Extension of Precision-Recall curve to multi-class')
plt.legend(lines, labels, loc=(0, -.38), prop=dict(size=14))


plt.show()
		  