Sometimes your input data is nested, with a more complex structure than a simple table or matrix.

In such cases it is sometimes useful to shift your mental orientation and to analyze and extract information from rows rather than from undefined columns.

In [1]:
from utils import css_from_file
# Project helper from the local `utils` module; presumably applies the
# stylesheet at style/style.css to this notebook -- confirm in utils.
css_from_file('style/style.css')

In [2]:
!pip install nltk



In [3]:
import json
import numpy as np
import pprint
from nltk import download, word_tokenize

# Fetch the 'punkt' tokenizer package into the local nltk_data directory
# (presumably the data word_tokenize relies on -- confirm in the NLTK docs).
download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Antoine\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [10]:
# Load the company records. The file is read line by line and every
# non-blank line is parsed as one standalone JSON document, so `data` ends up
# as a list with one parsed record per line.
with open("data/companies/companies.json", encoding='utf-8') as dataf:
    # Blank lines (e.g. a trailing newline at the end of the file) are
    # skipped: json.loads raises JSONDecodeError on empty input.
    data = [json.loads(line) for line in dataf if line.strip()]

An example of deeply nested data with various data types:

Exercise:

1. Name variable types
2. What do you do with lists, geo location?
3. What do you do with counts?

In [51]:
# Pretty-print the record at index 3 to inspect its nested structure.
pprint.pprint(data[3])

{'address': {'city': 'Düsseldorf',
             'country': 'Germany',
             'postalCode': '40221',
             'raw': 'Speditionstr. 15a\n\nDüsseldorf,\nNRW\n40221\nGermany',
             'region': 'NRW',
             'street': 'Speditionstr. 15a'},
 'description': '1stMOVER is a startup incubator & enterprise consultancy for '
                'new digital business. We support startups and enterprises '
                'with specialised incubation & consulting services. For '
                'Startups: - Seed funding (100k€ + co-investors) - Support of '
                'business model & product design - Operational incubation '
                'support - Initiation of enterprise co-operations - Finding '
                'strategic investors For Enterprises: - Development & '
                'implementation of models for successful startup co-operations '
                '- Building up bootcamp, acceleration & incubation programmes '
                'with startups - Development

Exercise
--------------

Write a pipeline to transform company records.
1. Select 3 types of features you want to transform (like description, list of skills, technologies, address, etc.)
2. Create a pipeline in this format:
```python
make_union(
    make_pipeline(TechnologyFeatures(), DictVectorizer()),
    make_pipeline(AddressFeatures(), DictVectorizer()),
    make_pipeline(ExtractDescription(), CountVectorizer())
)
```
3. Classify industry (like in the previous exercise)

In [46]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

In [56]:
class TechnologyFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns company records into indicator dicts.

    Every record (a dict) is mapped to ``{technology: 1, ...}`` built from the
    entries of its ``'technologies'`` value; a record without that key yields
    an empty dict.
    """

    def __init__(self):
        """Create the transformer; it takes no parameters."""

    def fit(self, x, y=None):
        """Learn nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, x):
        """Return one ``{technology: 1}`` dict per record in ``x``.

        ``x`` is an iterable of dict-like company records. The result is a
        list in the same order as ``x``.
        """
        return [
            dict.fromkeys(company.get('technologies', []), 1)
            for company in x
        ]
    
    
class AddressFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer extracting country and city from company records.

    Each of the two values is looked up in ``address`` first, then in
    ``extension.geo_location`` and finally in
    ``extension.geo_location_triple``. The first non-empty value wins; when
    none of the three places provides one, the empty string is used.
    """

    # Key paths inside a record that may hold location fields, in priority
    # order.
    _SOURCES = (
        ('address',),
        ('extension', 'geo_location'),
        ('extension', 'geo_location_triple'),
    )

    def __init__(self):
        """Create the transformer; it takes no parameters."""

    def fit(self, x, y=None):
        """Learn nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, x):
        """Return one ``{'country': ..., 'city': ...}`` dict per record.

        ``x`` is an iterable of dict-like company records. The result is a
        list in the same order as ``x``.
        """
        return [
            {
                'country': self._first_available(company, 'country'),
                'city': self._first_available(company, 'city'),
            }
            for company in x
        ]

    @classmethod
    def _first_available(cls, company, field):
        """Return the first truthy ``field`` found along ``_SOURCES``, else ''."""
        for path in cls._SOURCES:
            node = company
            for key in path:
                # `or {}` also covers keys that are present but hold null,
                # which a plain .get(key, {}) default would not.
                node = node.get(key) or {}
            value = node.get(field)
            if value:
                return value
        # Always fall back to a string so downstream consumers never
        # receive None.
        return ''
    

class ExtractDescription():
    """Stateless transformer pulling the description text out of each record.

    NOTE(review): unlike TechnologyFeatures and AddressFeatures this class
    does not derive from BaseEstimator / TransformerMixin -- confirm whether
    that is intentional.
    """

    def __init__(self):
        """Create the extractor; it takes no parameters."""

    def fit(self, x, y=None):
        """Learn nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, x):
        """Return the ``'description'`` value of every record in ``x``.

        Records without that key contribute an empty string. The result is a
        list in the same order as ``x``.
        """
        return [company.get('description', '') for company in x]
    
class ExtractIndustry():
    """Stateless transformer that picks one industry label per record."""

    def __init__(self):
        """Create the extractor; it takes no parameters."""

    def fit(self, x, y=None):
        """Learn nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, x):
        """Return the first ``'industries'`` entry of every record in ``x``.

        Records whose ``'industries'`` value is missing or empty are labelled
        ``'unknown'``. The result is a list in the same order as ``x``.
        """
        return [
            # A missing/empty value is swapped for a one-element placeholder,
            # so indexing [0] yields 'unknown' in that case.
            (company.get('industries', []) or ['unknown'])[0]
            for company in x
        ]
    
# Ad-hoc checks of the individual transformers on the loaded records; the
# disabled ones are kept for reference.
# tech_feat = TechnologyFeatures()
# tech_feat.transform(data)
# address_feat = AddressFeatures()
# address_feat.transform(data)
# descriptions_feat = ExtractDescription()
# descriptions_feat.transform(data)
# Only the industry extractor is exercised here; being the last expression
# of the cell, its result is what the notebook displays.
industries = ExtractIndustry()
industries.transform(data)

10000

In [49]:
# Target labels: one industry label per company record.
extract_industries = ExtractIndustry()
Y = extract_industries.transform(data)

# Feature extraction is the union of three sub-pipelines (technologies,
# address, description text), followed by a scaler step and a classifier.
pipe = make_pipeline(
    make_union(
        make_pipeline(TechnologyFeatures(), DictVectorizer()),
        make_pipeline(AddressFeatures(), DictVectorizer()),
        make_pipeline(ExtractDescription(), CountVectorizer())
    ),
    # NOTE(review): with both with_mean and with_std switched off this step
    # looks like a pass-through -- confirm against the StandardScaler docs.
    StandardScaler(with_mean=False, with_std=False),
    SGDClassifier()
)

# Train on the first 8000 records.
pipe.fit(data[:8000], Y[:8000])

# Predict for the remaining records.
# NOTE(review): y_pred is not compared against Y[8000:] in this cell, although
# classification_report is imported above.
y_pred = pipe.predict(data[8000:])




ValueError: bad input shape ()

Exercise
===============

1. Write a transformation class called SparsityFilter that accepts a minimum frequency. Watch out for the fit function - this class has some state that you must save.

```
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        ???
        return self

    def transform(self, X):
        return ???
```

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer

In [None]:
class Sparsity(BaseEstimator, TransformerMixin):
    """Drop feature columns that have too few stored (non-zero) entries.

    Parameters
    ----------
    min_nnz : int or None
        Minimum number of stored entries a column must have, as counted on
        the data passed to ``fit``, in order to be kept. ``None`` (the
        default) disables filtering, so ``transform`` returns its input
        unchanged.
    """

    def __init__(self, min_nnz=None):
        # Must be stored on the instance: a bare `min_nnz = min_nnz` only
        # rebinds the local name and the setting is lost.
        self.min_nnz = min_nnz

    def fit(self, x, y=None):
        """Record the per-column count of stored entries of ``x``.

        When filtering is enabled ``x`` has to provide ``getnnz(axis=0)``
        (as scipy sparse matrices do). ``y`` is ignored. Returns ``self``.
        """
        if self.min_nnz is not None:
            self.sparsity_ = x.getnnz(axis=0)
        return self

    def transform(self, x):
        """Return ``x`` restricted to the columns kept during ``fit``.

        With ``min_nnz=None`` the input is returned as is. Otherwise ``fit``
        must have been called first, on data with the same number of columns.
        """
        if self.min_nnz is None:
            return x
        return x[:, self.sparsity_ >= self.min_nnz]

Double click to see the solution 

<div class="spoiler">

class SparsityFilter(BaseEstimator, TransformerMixin):
    """Keep only the columns with at least ``min_nnz`` stored entries.

    ``min_nnz=None`` (the default) disables filtering.
    """

    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        """Store the number of stored entries per column of ``X``.

        ``X`` has to provide ``getnnz`` (as scipy sparse matrices do); ``y``
        is ignored. Returns ``self``.
        """
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns that met the threshold."""
        # Comparing the counts against None is not a meaningful threshold
        # test, so the default setting passes the data through untouched.
        if self.min_nnz is None:
            return X
        return X[:, self.sparsity >= self.min_nnz]
</div>