In [14]:
import spacy
import es_core_news_lg
import re
import pandas as pd
import unidecode
import unicodedata as uc
from string import punctuation
from spacy.lang.es import STOP_WORDS
from spacy import displacy
import os
import string
from unidecode import unidecode
from heapq import nlargest

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# Connection libraries.
import urllib.request, urllib.parse, urllib.error
import requests
from bs4 import BeautifulSoup
import pickle

In [15]:
# NLP
def find_organizations(text,nlp)->list:
    """
    This function uses of entity labels from spacy to find organizations
    """
    doc = nlp(text)
    list=[]
    for ent in doc.ents:
        if ent.label_=="ORG":
            #print(ent.text, ent.label_)
            list.append(ent)

    return list


def find_dates(text,nlp)-> list:
    """
    This function uses of entity labels from spacy to find dates. It also use the re library to find patterns in the text
    that could lead in to a date.
    input: 
    output: 
    """
    months=["enero","ene", "january","jan","febrero","february","feb","marzo","march","mar","abril","april","mayo","may",'junio','june', "jun","julio","july", "jul",
           "agosto","ago","august","aug","septiembre",'september',"sep",'octubre','october',"oct","noviembre",'november',"nov","diciembre","december", "dec"]
    doc = nlp(text)
    lista=[]
    for ent in doc.ents:
        if ent.label_=="DATE":
            #print(ent.text, ent.label_)
            lista.append(ent)
    for m in months:
        if(len(re.findall("([0-9]{2}\s"+m+"\s[0-9]{4})", text))>0):
            lista.append(re.findall("([0-9]{2}\s"+m+"\s[0-9]{4})", text))
        if(len(re.findall("([0-9]{2}\s"+m+"\s)", text))>0):
            lista.append(re.findall("([0-9]{2}\s"+m+"\s)", text))  
        if(len(re.findall("(\s"+m+"\s[0-9]{4})", text))>0):
            lista.append(re.findall("(\s"+m+"\s[0-9]{4})", text))
        if(len(re.findall("([0-9]{4}\s"+m+"\s)", text))>0):
            lista.append(re.findall("([0-9]{4}\s"+m+"\s)", text))  
        if(len(re.findall("(\s"+m+"\s[0-9]{2})", text))>0):
            lista.append(re.findall("(\s"+m+"\s[0-9]{2})", text))
        if(len(re.findall("([0-9]{1}\s"+m+"\s[0-9]{4})", text))>0):
            lista.append(re.findall("([0-9]{1}\s"+m+"\s[0-9]{4})", text))
        if(len(re.findall("([0-9]{1}\s"+m+"\s)", text))>0):
            lista.append(re.findall("([0-9]{1}\s"+m+"\s)", text))   
        if(len(re.findall("(\s"+m+"\s[0-9]{1})", text))>0):
            lista.append(re.findall("(\s"+m+"\s[0-9]{1})", text))

    return lista

"""
This function uses the entity labels from spacy to find locations. It also use the re library to find patterns in the text
that could lead in to a location or address
"""
def find_locations(text,nlp)->list:
    municipios=["victoria", "miriti-parana", "puerto santander", "pedrera", "tarapaca", "leticia", "puerto nariño", "puerto arica", "encanto", "chorrera", "puerto alegria"]
    cardinales=["Norte","Sur","Este","Oeste","Occidente","Oriente"]
    direccion=["Calle","Avenida", "Carrera","Diagonal"]
    doc = nlp(text)
    lista=[]
    for ent in doc.ents:
        if ent.label_=="LOC":
            lista.append(ent)
    for l in municipios:
        if(len(re.findall("("+l+")", text))>0):
            lista.append(re.findall("("+l+")", text))
    for c in cardinales:
        if(len(re.findall("("+c+")", text))>0):
            lista.append(re.findall("("+c+")", text))
    for d in direccion:
        if(len(re.findall("("+d+"[0-9]{2}\s)", text))>0):
            lista.append(re.findall("("+d+"[0-9]{2}\s)", text))
    return lista

"""
This function makes an attemp of finding person names.
"""

def find_names(text, nlp)->list:
    doc = nlp(text)
    person = []
    for ent in doc.ents:
        if ent.label_ == "PER":
            person.append(ent)
    return person


In [16]:
# Feature adicional de resumen
def summarize(text, per):
    # nlp = spacy.load('en_core_web_sm')
    nlp = spacy.load('es_core_news_lg')
    doc= nlp(text)
    tokens=[token.text for token in doc]
    word_frequencies={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    max_frequency=max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency
    sentence_tokens= [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]
    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary 

In [17]:
# def ner_from_str(text, output_path)
# def ner_from_file(tex_pstht, output_path)
# def ner_from_url(url, output_path)
# def ner_from_df(text, output_path)

In [18]:
# URL
r = requests.get("https://www.wwf.org.co/_donde_trabajamos_/amazonas/las_seis_grandes_amenazas_de_la_amazonia/#:~:text=Desde%20el%20a%C3%B1o%202000%20hasta,la%20deforestaci%C3%B3n%20en%20la%20regi%C3%B3n")

data = r.text
soup = BeautifulSoup(data, "html.parser")

In [19]:
def extractScripts(soup):
    for s in soup.select('style'):
        s.extract()
    
    return soup

In [20]:
soup = extractScripts(soup)
text = soup.get_text()
# text

In [21]:
text=text.replace("\n","")
text=text.replace("\r","")

# text

# Extrae el texto de la url

In [22]:
nlp = spacy.load("es_core_news_lg")

In [23]:
def preprocess_text(text):
    s = ""
    text = unidecode(text)
    # print(text)
    for char in text:
        s += char.lower()
    # print(s)
    words = ""
    for word in s.split():
        if word not in STOP_WORDS:
            words=words+" "+ word
    return words

In [24]:
def replace_tildes(text):
    text = text.replace("á", "a").replace("é", "e").replace("í", "i").replace("ó", "o").replace("ú", "u").replace("▪", "").replace("ü", "u")
    return text

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    s = regrex_pattern.sub(r'',text)
    val_aux = re.sub(r'[\u200b|\u2010|\u1967]+', '', s)  
    s = uc.normalize('NFC',val_aux)
    return s

def remove_special_characters(text):
    s = ""
    for char in text:
        if char.isalnum() or char.isspace() or char in punctuation:
            s+=char
    s_ = " ".join(s.split())
    val_aux = re.sub(r'[\u200b|\u2010]+', '', s_)  
    s = uc.normalize('NFC',val_aux)
    return s

# Metodo que reune todos los metodos de preprocesamiento
def process_text_pipeline(text:str)->str:
    text = preprocess_text(text)
    text = replace_tildes(text)
    text = deEmojify(text)
    text = remove_special_characters(text)
    text = clean_string(text)
    text = delete_punctuation(text)
    return text

def clean_string(string):
    # string = string.replace("  ","")
    string = string.replace("\n","")
    return string

In [25]:
print(os.listdir())
%pwd
%cd

['.ipynb_checkpoints', 'spacy_test.ipynb']
/root


In [26]:
path:str = "codefest-data/Textos/noticias.xlsx"
datos:pd.DataFrame = pd.read_excel(path)

In [27]:
datos.head()

Unnamed: 0,FECHA,FUENTE,TEXTO,LINK,ETIQUETA
0,2016-2023,WWF LIVING AMAZON REPORT 2016.,El aumento de las represas en la Amazonia amen...,https://www.wwf.org.co/_donde_trabajamos_/amaz...,AUMENTO DE REPRESAS HIDROELECTRICAS.
1,2016-2023,WWF LIVING AMAZON REPORT 2016.,"Las carreteras, las vías férreas y las nuevas ...",https://www.wwf.org.co/_donde_trabajamos_/amaz...,DEFORESTACION.
2,06 DE JUNIO DE 2015,EL TIEMPO,La afectación ambiental ocasionada por el derr...,https://www.eltiempo.com/archivo/documento/CMS...,CONTAMINACION.
3,ENERO DE 2014,CORPOAMAZONIA,Afectación por contaminacion con hidrocarburos...,https://www.corpoamazonia.gov.co/index.php/not...,CONTAMINACION.
4,05 DE OCTUBRE DE 2019,NOTICIAS RCN,En 2019 se han registrado 19 atentados contra ...,https://www.noticiasrcn.com/nacional/derrame-d...,CONTAMINACION.


In [28]:
def delete_punctuation(text):
    new_string = text.translate(str.maketrans('', '', string.punctuation))
    return new_string

In [29]:
# mineria, contaminacion, deforestacion, narcotrafico, infraestructura, otra

def main_preprocess(df, tags:list):
    final_list = []
    target_list = []
    for index, row in df.iterrows():
        text = str(row[df.columns.get_loc("TEXTO")])
        text2 = str(row[df.columns.get_loc("ETIQUETA")])
        final_list.append(process_text_pipeline(text))
        
        tag_parts:list = process_text_pipeline(text2).split(" ") 
        
        tag:str = "" 
        for t in tag_parts:
            if t in tags:    
                tag = t
                break
        if tag == "":
            tag = "other"
        target_list.append(tag)
        
    df["text_preprocess"] = final_list
    df["etiqueta_preprocess"] = target_list
    return df

In [30]:
tags:list = ["mineria", "contaminacion", "deforestacion", "narcotrafico", "infraestructura"]
datos_preprocess:pd.DataFrame = main_preprocess(datos, tags)
datos_preprocess

Unnamed: 0,FECHA,FUENTE,TEXTO,LINK,ETIQUETA,text_preprocess,etiqueta_preprocess
0,2016-2023,WWF LIVING AMAZON REPORT 2016.,El aumento de las represas en la Amazonia amen...,https://www.wwf.org.co/_donde_trabajamos_/amaz...,AUMENTO DE REPRESAS HIDROELECTRICAS.,aumento represas amazonia amenaza flujo natura...,other
1,2016-2023,WWF LIVING AMAZON REPORT 2016.,"Las carreteras, las vías férreas y las nuevas ...",https://www.wwf.org.co/_donde_trabajamos_/amaz...,DEFORESTACION.,carreteras vias ferreas rutas transporte fluvi...,deforestacion
2,06 DE JUNIO DE 2015,EL TIEMPO,La afectación ambiental ocasionada por el derr...,https://www.eltiempo.com/archivo/documento/CMS...,CONTAMINACION.,afectacion ambiental ocasionada derrame cerca ...,contaminacion
3,ENERO DE 2014,CORPOAMAZONIA,Afectación por contaminacion con hidrocarburos...,https://www.corpoamazonia.gov.co/index.php/not...,CONTAMINACION.,afectacion contaminacion hidrocarburos afluent...,contaminacion
4,05 DE OCTUBRE DE 2019,NOTICIAS RCN,En 2019 se han registrado 19 atentados contra ...,https://www.noticiasrcn.com/nacional/derrame-d...,CONTAMINACION.,2019 registrado 19 atentados oleoducto transpo...,contaminacion
...,...,...,...,...,...,...,...
181,Mayo 26 de 2023,EL PAIS,La crisis con las disidencias de Iván Mordisco...,https://elpais.com/america-colombia/2023-05-26...,DEFORESTACION,crisis disidencias ivan mordisco impacta lucha...,deforestacion
182,Julio 15 de 2021,BBC,"El ""impacto inmenso"" de las regiones del Amazo...",https://www.bbc.com/mundo/noticias-57820472,DEFORESTACION,impacto inmenso regiones amazonas emiten carbo...,deforestacion
183,Marzo 7 de 2022,National Geographic,El Amazonas está cerca del punto de inflexión ...,Así podría afectar la pérdida de la selva amaz...,DEFORESTACION,amazonas cerca punto inflexion convertirse sab...,deforestacion
184,13 febrero 2020,BBC,"La gran mentira verde"": cómo la pérdida del Am...",https://www.bbc.com/mundo/noticias-america-lat...,DEFORESTACION,mentira verde perdida amazonas alla deforestacion,deforestacion


In [31]:
def getCorpus(df:pd.DataFrame)->None:
    path = "training_stuff/corpus.txt"
    file = open(path, "w")
    file.write("")
    file.close
    file = open(path,"a")
    for index, row in df.iterrows():
        text = str(row[df.columns.get_loc("text_preprocess")])
        file.write(text + "    ")
    file.close
        
getCorpus(datos_preprocess)


In [32]:
# mineria, contaminacion, deforestacion, narcotrafico, infraestructura, otra
datos['etiqueta_preprocess'].value_counts()

deforestacion    67
other            62
mineria          26
contaminacion    18
narcotrafico     13
Name: etiqueta_preprocess, dtype: int64

In [33]:
nlp = spacy.load("es_core_news_lg")

def main_ner(df:pd.DataFrame)-> pd.DataFrame:
    names_list:list = []
    locs_list:list = []
    orgs_list:list = []
    dates_list:list = []
    
    for index, row in df.iterrows():
        text_pre_process = str(row[df.columns.get_loc("text_preprocess")])
        names:str = find_names(text_pre_process,nlp)
        locs:str = find_locations(text_pre_process, nlp)
        orgs:str = find_organizations(text_pre_process, nlp)
        dates:str = find_dates(text_pre_process, nlp)
        
        names_list.append(names)
        locs_list.append(locs)
        orgs_list.append(orgs)
        dates_list.append(dates)
        
        
    df["NAMES"] = names_list
    df["LOCS"] = locs_list
    df["ORGS"] = orgs_list
    df["DATES"] = dates_list
    return df

In [34]:
datos_new_columns = main_ner(datos_preprocess)
datos_new_columns

Unnamed: 0,FECHA,FUENTE,TEXTO,LINK,ETIQUETA,text_preprocess,etiqueta_preprocess,NAMES,LOCS,ORGS,DATES
0,2016-2023,WWF LIVING AMAZON REPORT 2016.,El aumento de las represas en la Amazonia amen...,https://www.wwf.org.co/_donde_trabajamos_/amaz...,AUMENTO DE REPRESAS HIDROELECTRICAS.,aumento represas amazonia amenaza flujo natura...,other,[],[(amazonia)],[],[]
1,2016-2023,WWF LIVING AMAZON REPORT 2016.,"Las carreteras, las vías férreas y las nuevas ...",https://www.wwf.org.co/_donde_trabajamos_/amaz...,DEFORESTACION.,carreteras vias ferreas rutas transporte fluvi...,deforestacion,[],[(amazonia)],[],[]
2,06 DE JUNIO DE 2015,EL TIEMPO,La afectación ambiental ocasionada por el derr...,https://www.eltiempo.com/archivo/documento/CMS...,CONTAMINACION.,afectacion ambiental ocasionada derrame cerca ...,contaminacion,"[(farc, putumayo)]",[],[],[]
3,ENERO DE 2014,CORPOAMAZONIA,Afectación por contaminacion con hidrocarburos...,https://www.corpoamazonia.gov.co/index.php/not...,CONTAMINACION.,afectacion contaminacion hidrocarburos afluent...,contaminacion,[],[],[],[]
4,05 DE OCTUBRE DE 2019,NOTICIAS RCN,En 2019 se han registrado 19 atentados contra ...,https://www.noticiasrcn.com/nacional/derrame-d...,CONTAMINACION.,2019 registrado 19 atentados oleoducto transpo...,contaminacion,"[(rio, guamuez, orito)]",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
181,Mayo 26 de 2023,EL PAIS,La crisis con las disidencias de Iván Mordisco...,https://elpais.com/america-colombia/2023-05-26...,DEFORESTACION,crisis disidencias ivan mordisco impacta lucha...,deforestacion,"[(ivan, mordisco)]",[],[],[]
182,Julio 15 de 2021,BBC,"El ""impacto inmenso"" de las regiones del Amazo...",https://www.bbc.com/mundo/noticias-57820472,DEFORESTACION,impacto inmenso regiones amazonas emiten carbo...,deforestacion,[],[],[],[]
183,Marzo 7 de 2022,National Geographic,El Amazonas está cerca del punto de inflexión ...,Así podría afectar la pérdida de la selva amaz...,DEFORESTACION,amazonas cerca punto inflexion convertirse sab...,deforestacion,[],"[(amazonas), (amazonas)]",[],[]
184,13 febrero 2020,BBC,"La gran mentira verde"": cómo la pérdida del Am...",https://www.bbc.com/mundo/noticias-america-lat...,DEFORESTACION,mentira verde perdida amazonas alla deforestacion,deforestacion,[],[(amazonas)],[],[]


In [35]:
doc = nlp("amazonia colombiana perdio 52 mil hectareas bosque semestre 2022 mes medio carlos eduardo correa exministro ambiente revelo cifras deforestacion colombia 2021 ambientalistas colombianos llevaran meses preguntandose tardanza mostrar panorama bosques pais 7 agosto fecha presidente ivan duque dejaba casa narino darle paso sucesor gustavo petro")
displacy.render(doc,style="ent", jupyter=True)

In [36]:
data_new2 = datos_new_columns.copy()                          # Create duplicate of data
# data_new2.dropna(inplace = True)  
data_new2.dropna(subset = ['ETIQUETA'], inplace = True) 
data_new2

Unnamed: 0,FECHA,FUENTE,TEXTO,LINK,ETIQUETA,text_preprocess,etiqueta_preprocess,NAMES,LOCS,ORGS,DATES
0,2016-2023,WWF LIVING AMAZON REPORT 2016.,El aumento de las represas en la Amazonia amen...,https://www.wwf.org.co/_donde_trabajamos_/amaz...,AUMENTO DE REPRESAS HIDROELECTRICAS.,aumento represas amazonia amenaza flujo natura...,other,[],[(amazonia)],[],[]
1,2016-2023,WWF LIVING AMAZON REPORT 2016.,"Las carreteras, las vías férreas y las nuevas ...",https://www.wwf.org.co/_donde_trabajamos_/amaz...,DEFORESTACION.,carreteras vias ferreas rutas transporte fluvi...,deforestacion,[],[(amazonia)],[],[]
2,06 DE JUNIO DE 2015,EL TIEMPO,La afectación ambiental ocasionada por el derr...,https://www.eltiempo.com/archivo/documento/CMS...,CONTAMINACION.,afectacion ambiental ocasionada derrame cerca ...,contaminacion,"[(farc, putumayo)]",[],[],[]
3,ENERO DE 2014,CORPOAMAZONIA,Afectación por contaminacion con hidrocarburos...,https://www.corpoamazonia.gov.co/index.php/not...,CONTAMINACION.,afectacion contaminacion hidrocarburos afluent...,contaminacion,[],[],[],[]
4,05 DE OCTUBRE DE 2019,NOTICIAS RCN,En 2019 se han registrado 19 atentados contra ...,https://www.noticiasrcn.com/nacional/derrame-d...,CONTAMINACION.,2019 registrado 19 atentados oleoducto transpo...,contaminacion,"[(rio, guamuez, orito)]",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
181,Mayo 26 de 2023,EL PAIS,La crisis con las disidencias de Iván Mordisco...,https://elpais.com/america-colombia/2023-05-26...,DEFORESTACION,crisis disidencias ivan mordisco impacta lucha...,deforestacion,"[(ivan, mordisco)]",[],[],[]
182,Julio 15 de 2021,BBC,"El ""impacto inmenso"" de las regiones del Amazo...",https://www.bbc.com/mundo/noticias-57820472,DEFORESTACION,impacto inmenso regiones amazonas emiten carbo...,deforestacion,[],[],[],[]
183,Marzo 7 de 2022,National Geographic,El Amazonas está cerca del punto de inflexión ...,Así podría afectar la pérdida de la selva amaz...,DEFORESTACION,amazonas cerca punto inflexion convertirse sab...,deforestacion,[],"[(amazonas), (amazonas)]",[],[]
184,13 febrero 2020,BBC,"La gran mentira verde"": cómo la pérdida del Am...",https://www.bbc.com/mundo/noticias-america-lat...,DEFORESTACION,mentira verde perdida amazonas alla deforestacion,deforestacion,[],[(amazonas)],[],[]


In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
import pickle

In [38]:
# datos_new_columns
X = data_new2.text_preprocess
y = data_new2.etiqueta_preprocess

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [39]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)
my_tags = ["mineria", "contaminacion", "deforestacion", "narcotrafico", "infraestructura"]
# %%time

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.4791666666666667
                 precision    recall  f1-score   support

        mineria       0.00      0.00      0.00         7
  contaminacion       0.44      1.00      0.61        19
  deforestacion       0.80      0.80      0.80         5
   narcotrafico       0.00      0.00      0.00         3
infraestructura       0.00      0.00      0.00        14

       accuracy                           0.48        48
      macro avg       0.25      0.36      0.28        48
   weighted avg       0.26      0.48      0.33        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

# %%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

# save the model to disk
filename = 'modelSGD.sav'
pickle.dump(sgd, open(filename, 'wb'))

accuracy 0.5833333333333334
                 precision    recall  f1-score   support

        mineria       0.67      0.57      0.62         7
  contaminacion       0.56      0.95      0.71        19
  deforestacion       0.57      0.80      0.67         5
   narcotrafico       0.00      0.00      0.00         3
infraestructura       0.67      0.14      0.24        14

       accuracy                           0.58        48
      macro avg       0.49      0.49      0.44        48
   weighted avg       0.57      0.58      0.51        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
%pwd

'/root'

In [42]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

# %%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.5625
                 precision    recall  f1-score   support

        mineria       0.50      0.29      0.36         7
  contaminacion       0.54      1.00      0.70        19
  deforestacion       0.57      0.80      0.67         5
   narcotrafico       0.00      0.00      0.00         3
infraestructura       1.00      0.14      0.25        14

       accuracy                           0.56        48
      macro avg       0.52      0.45      0.40        48
   weighted avg       0.64      0.56      0.47        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# LogisticRegression
# SGDClassifier
# MultinomialNB
# RandomForestClassifier
# LinearSVC
# KNeighborsClassifier

svc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
               ])
svc.fit(X_train, y_train)

# %%time

y_pred = svc.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.5416666666666666
                 precision    recall  f1-score   support

        mineria       0.33      0.14      0.20         7
  contaminacion       0.53      1.00      0.69        19
  deforestacion       0.57      0.80      0.67         5
   narcotrafico       0.00      0.00      0.00         3
infraestructura       1.00      0.14      0.25        14

       accuracy                           0.54        48
      macro avg       0.49      0.42      0.36        48
   weighted avg       0.61      0.54      0.45        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
rf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)),
               ])
rf.fit(X_train, y_train)

# %%time

y_pred = rf.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.4375
                 precision    recall  f1-score   support

        mineria       0.00      0.00      0.00         7
  contaminacion       0.41      1.00      0.58        19
  deforestacion       1.00      0.40      0.57         5
   narcotrafico       0.00      0.00      0.00         3
infraestructura       0.00      0.00      0.00        14

       accuracy                           0.44        48
      macro avg       0.28      0.28      0.23        48
   weighted avg       0.27      0.44      0.29        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
kn = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier(n_neighbors=5)),
               ])
kn.fit(X_train, y_train)

# %%time

y_pred = kn.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

accuracy 0.5208333333333334
                 precision    recall  f1-score   support

        mineria       0.62      0.71      0.67         7
  contaminacion       0.59      0.84      0.70        19
  deforestacion       0.20      0.40      0.27         5
   narcotrafico       0.00      0.00      0.00         3
infraestructura       0.67      0.14      0.24        14

       accuracy                           0.52        48
      macro avg       0.42      0.42      0.37        48
   weighted avg       0.54      0.52      0.47        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
class named_entities_recognition_astroamazonicos ():
# ----------------------------------------------------- CLASE --------------------------------------------------------------
    def __init__(self):
        self.nlp = spacy.load("es_core_news_lg")
        
    def find_organizations(text,nlp)->list:
    """
    This function uses of entity labels from spacy to find organizations
    """
    doc = nlp(text)
    list=[]
    for ent in doc.ents:
        if ent.label_=="ORG":
            #print(ent.text, ent.label_)
            list.append(ent)

    return list
        
        
        
        