### IMPORTS

In [1]:
!pip install --user simplemma

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
import pandas as pd
import numpy as np

import sys
import re
import pickle

# data modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, top_k_accuracy_score, f1_score

import nltk
from nltk.corpus import stopwords 
import string
import simplemma
from sklearn.feature_extraction.text import TfidfTransformer
from helpers import plot_confusion_matrix, get_top_features, fix_sdg_name

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook;
# later cells assign columns on filtered frames.
pd.options.mode.chained_assignment = None
# Fetch the NLTK resources used below: 'punkt' backs nltk.word_tokenize and
# 'stopwords' backs nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### LOAD AND CONCATENATE THE DATABASES

In [None]:
# Stack all per-category source frames row-wise in a single concat call
# (one copy of the data instead of re-copying the growing frame twelve times).
# The original row labels are kept, so the resulting index may contain duplicates.
# NOTE(review): none of the df_* frames is created in a visible cell -- they must be
# loaded before this cell runs; confirm where they come from.
df = pd.concat(
    [
        df_comm,
        df_parl_ue,
        df_pol_it,
        df_giorn,
        df_crit,
        df_fin,
        df_ist,
        df_ong,
        df_uni,
        df_orgassoc,
        df_prof,
        df_manager,
        df_fornitori,
    ],
    axis = 0,
)

### CLEAN AND PREPROCESS THE BIOGRAPHIES

In [6]:
def get_cleaned_text(text):
    """
    Normalise a free-text field (name or biography) before tokenisation.

    Steps, in order: drop dotted tokens, drop literal backslash-n escape
    sequences, replace every character that is neither a word character nor
    whitespace with a space, drop a leading retweet marker "RT", lowercase,
    and collapse whitespace runs into a single space.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None, or the NaN pandas uses for
        a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned, lowercased text. It may start or end with a single space.

    Notes
    -----
    Open points:
    - consider removing non-ASCII characters
    - consider whether stemming and lemmatization are needed
    """
    # Missing values reach this function through Series.apply; re.sub would raise on them.
    if not isinstance(text, str):
        return ''
    # Remove text of the form something.something (links, but also 900.000)
    text = re.sub(r'\b[^\s]*\.[^\s]*\b', ' ', text)
    # Remove literal backslash-n sequences (real newlines are handled by the
    # whitespace collapse at the end)
    text = re.sub(r'\\n+', ' ', text)
    # Replace everything that is not a word character or whitespace
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove a leading "RT" marker; \b keeps words that merely start with
    # "RT" (e.g. "RTL") intact
    text = re.sub(r'^RT\b', ' ', text)
    # Lowercase
    text = text.lower()
    # Collapse whitespace runs (raw string: '\s' is an invalid escape in a plain literal)
    text = re.sub(r'\s+', ' ', text)
    return text

# Clean the biography and the name, then build the combined "name + biography" text.
df['docs'] = df['description'].apply(get_cleaned_text)
df['name'] = df['name'].apply(get_cleaned_text)
df['docs_name'] = df['name'] + ' ' + df['docs']

# Drop rows whose cleaned name is the 'non_trovato' value.
name_is_known = df['name'] != 'non_trovato'
df = df[name_is_known]
df.head()

Unnamed: 0,name,description,categ,docs,docs_name
1,ursula von der leyen,President of the @EU_Commission. Mother of sev...,politici,president of the eu_commission mother of seven...,ursula von der leyen president of the eu_commi...
3,frans timmermans,Executive Vice-President for the European Gree...,politici,executive vice president for the european gree...,frans timmermans executive vice president for ...
5,margrethe vestager,Executive Vice-President of the European Commi...,politici,executive vice president of the european commi...,margrethe vestager executive vice president of...
7,valdis dombrovskis,@EU_Commission Executive Vice-President for an...,politici,eu_commission executive vice president for an...,valdis dombrovskis eu_commission executive vi...
11,josep borrell fontelles,High Representative of the EU for Foreign Affa...,politici,high representative of the eu for foreign affa...,josep borrell fontelles high representative of...


In [7]:
def tokenizer(text):
    """
    Tokenise every passage of an iterable of strings.

    Each passage is lowercased and split with nltk.word_tokenize.

    Parameters
    ----------
    text : iterable of str
        The passages to tokenise.

    Returns
    -------
    list of list of str
        One token list per passage, in input order.
    """
    tokenised = []
    for passage in text:
        tokenised.append(nltk.word_tokenize(passage.lower()))
    return tokenised

def del_stopwords(stop_words,tokens):
    """
    Remove stop words from every tokenised passage.

    Parameters
    ----------
    stop_words : iterable of str
        Words to drop. Matching is exact (case-sensitive).
    tokens : iterable of list of str
        Tokenised passages.

    Returns
    -------
    list of list of str
        One filtered token list per passage; order of passages and of the
        surviving tokens is preserved.
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    blocked = set(stop_words)
    return [[word for word in passage if word not in blocked] for passage in tokens]

# Tokens to discard as punctuation: every ASCII punctuation character, followed by
# typographic quotes, the empty string, an ellipsis and the two quote tokens `` and ''.
punctuations = [*string.punctuation, '’', '“', '”', '', '...', '``', "''"]

def del_punct(punctuations,tokens):
    """
    Remove punctuation tokens from every tokenised passage.

    Parameters
    ----------
    punctuations : iterable of str
        Tokens to drop. Only whole tokens are compared; punctuation embedded
        in a longer token is left alone.
    tokens : iterable of list of str
        Tokenised passages.

    Returns
    -------
    list of list of str
        One filtered token list per passage; order of passages and of the
        surviving tokens is preserved.
    """
    # A set gives constant-time membership tests instead of scanning the
    # punctuation list once per token.
    blocked = set(punctuations)
    return [[word for word in passage if word not in blocked] for passage in tokens]

def lemmatize(tokens):
    """
    Lemmatise every token of every passage with simplemma.

    The language data is NOT a parameter: it is read from the module-level
    name `langdata`, which the caller must bind before calling this function.

    Parameters
    ----------
    tokens : iterable of list of str
        Tokenised passages.

    Returns
    -------
    list of list
        One list per passage holding the result of
        simplemma.lemmatize(word, langdata) for each token, in input order.
    """
    return [
        [simplemma.lemmatize(word, langdata) for word in passage]
        for passage in tokens
    ]

# Tokenise the combined "name + biography" text; `tokens` holds one token list per row of df.
tokens=tokenizer(df['docs_name'])

- APPLY LEMMATIZATION AND STOPWORDS ELIMINATION IN BOTH ITALIAN AND ENGLISH

In [8]:
#LEMMAS BOTH ITA AND ENG
# Drop Italian stop words, then English ones, then punctuation tokens.
stop_words=stopwords.words('italian')
a_filter=del_stopwords(stop_words,tokens)
stop_words=stopwords.words('english')
a_filter=del_stopwords(stop_words,a_filter)
b_filter=del_punct(punctuations,a_filter)

try:
    # lemmatize() reads the module-level `langdata`, so it is rebound before each pass:
    # first the Italian pass, then an English pass over the Italian output.
    langdata = simplemma.load_data('it')
    lemmas_a=lemmatize(b_filter)
    langdata = simplemma.load_data('en')

    lemmas_italiano_inglese=lemmatize(lemmas_a)
except Exception as exc:
    # Best-effort fallback to the unlemmatized tokens, but say so: a silent fallback
    # (e.g. because the installed simplemma lacks load_data) changes the features the
    # models are trained on. `except Exception` also lets KeyboardInterrupt through.
    print(
        f"WARNING: lemmatization skipped ({type(exc).__name__}: {exc}); "
        "using unlemmatized tokens.",
        file=sys.stderr,
    )
    lemmas_italiano_inglese=b_filter

In [9]:
#aggregate tokens to pass them in the pipeline
full_text=[]
for sentence in lemmas_italiano_inglese:
    sent=''
    for word in sentence:
        sent+=word+' '
    sent=sent.strip()
    full_text.append(sent)
df['docs_name_lemmatized']=full_text

#### SGD CLASSIFIER: ONE BINARY MODEL PER CATEGORY

In [12]:
from sklearn.linear_model import SGDClassifier
cols=["investors","critics","media","politicians","ONG","ORG","Researchers","Universities","Top Managers","Providers"]

NEG_PER_POS = 5  # keep at most this many negative rows per positive row
SEED = 42        # one seed for both the negative sampling and the train/test split

# Train one binary "this category vs. the rest" pipeline per category.
# NOTE(review): `params` (keyword arguments for SGDClassifier) is not defined in any
# visible cell -- it has to exist in the kernel before this cell runs; confirm its origin.
# NOTE(review): the df preview earlier in the notebook shows categ values such as
# 'politici', which do not appear in `cols`; confirm the labels are mapped before this cell.
for col in cols:
    # .assign returns a new frame, so the label column of df itself is never modified.
    labelpos = df.loc[df['categ'] == col].assign(categ=1)
    if labelpos.empty:
        # An empty positive class cannot be split or fitted; skip it instead of
        # aborting the remaining categories.
        print(f"WARNING: no rows with categ == {col!r}; skipping this category.", file=sys.stderr)
        continue

    #resample: cap the negatives at NEG_PER_POS per positive; seeded so that
    # re-running the cell draws the same rows
    labelneg = df.loc[df['categ'] != col]
    labelneg = labelneg.sample(
        n=min(labelneg.shape[0], NEG_PER_POS*labelpos.shape[0]),
        random_state=SEED,
    ).assign(categ=0)
    balanced_dataset = pd.concat([labelneg, labelpos], axis = 0)

    X_train, X_test, y_train, y_test = train_test_split(
        balanced_dataset['docs_name_lemmatized'].values,
        balanced_dataset['categ'].values,
        test_size = .1,
        random_state = SEED
    )

    # Unigram+bigram counts -> 200 best features by chi2 -> linear SGD classifier.
    pipe = Pipeline([
        ('vectoriser', CountVectorizer(ngram_range = (1, 2))),
        ('selector', SelectKBest(chi2, k = 200)),
        ('clf',SGDClassifier(**params)),
    ])

    pipe.fit(X_train, y_train)
    y_hat = pipe.predict(X_test)
    accuracy_sc = accuracy_score(y_test, y_hat)

    # Label the output so each table/report can be attributed to its category.
    print(f"=== {col} ===")
    df_lambda = get_top_features(pipe['vectoriser'], pipe['clf'], pipe['selector'], top_n = 20)
    display(df_lambda.head(10))

    filename_model=col+'_pipe_model.sav'
    #pickle.dump(pipe, open('models/to_use/'+filename_model, 'wb'))
    print(classification_report(y_test, y_hat))



Unnamed: 0,sdg,feature,coef
0,1,procurement,4.213957
1,1,enelgroup,3.933026
2,1,fabio,3.652096
3,1,sofia,3.652096
4,1,oecd,3.371165
5,1,confindustria,3.090235
6,1,sales manager,2.809304
7,1,enel,2.528374
8,1,personal opinions,2.528374
9,1,idee,2.528374


              precision    recall  f1-score   support

           0       0.95      0.95      0.95       111
           1       0.76      0.73      0.74        22

    accuracy                           0.92       133
   macro avg       0.85      0.84      0.85       133
weighted avg       0.92      0.92      0.92       133

