In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### CLEAN DATA

In [2]:
reports = pd.read_csv('../data/complete_smell.csv')
reports.head(10)

Unnamed: 0,EpochTime,skewed_latitude,skewed_longitude,smell_value,smell_description,feelings_symptoms,additional_comments,zipcode
0,1477935134,40.4293,-79.872,1,,,,15221
1,1477935767,40.3624,-79.9749,1,,,,15227
2,1477955141,40.3275,-79.9606,1,,,,15236
3,1477956180,40.4252,-79.8956,2,Woodsmoke,,,15218
4,1477956293,40.4237,-79.8964,3,Wood smoke,,,15218
5,1477970157,40.3627,-79.9729,3,Industrial,,Penrated the house. Made me go outside to sm...,15227
6,1477973293,40.4195,-79.9384,5,Industrial,"Eye irritation, nose burns, headache, woke me up",This smell happens often. It's always at night...,15207
7,1478001707,40.3974,-80.0416,4,"Industrial, sulfur","Congestion, coughing",,15216
8,1478001989,40.4308,-79.9527,2,Industrial smoke,,,15213
9,1478003840,40.4238,-79.8958,3,Industrial,,,15218


In [3]:
additional_comments = reports['additional_comments'].dropna()

In [4]:
additional_comments

5        Penrated  the house.  Made me go outside to sm...
6        This smell happens often. It's always at night...
11       Noticeable odor of hydrocarbons and tar. Cause...
47                                          The air stinks
48                                   This smell woke me up
                               ...                        
93320    Smells like chemicals, almost like paint thinn...
93328                                     Usual smell here
93336                          Smells like burning plastic
93347                                         Strong winds
93351    My house is closed up yet here we are. scr*w m...
Name: additional_comments, Length: 16184, dtype: object

In [5]:
reports_symptoms_string = reports['feelings_symptoms'].tolist()
reports_symptoms_string = [x for x in reports_symptoms_string if pd.notna(x)]

In [6]:
reports_smells_string = reports['smell_description'].tolist()
reports_smells_string = [x for x in reports_smells_string if pd.notna(x)]

In [7]:
reports_smells_string

['Woodsmoke',
 'Wood smoke',
 'Industrial ',
 'Industrial ',
 'Industrial, sulfur',
 'Industrial smoke',
 'Industrial ',
 'Coke oven',
 'Tar and hydrocarbon smell ',
 'Smell fine',
 'Industrial',
 'Industrial',
 'Industrial ',
 'Industrial ',
 'Chemical smell. Like hair dye and rotten eggs.',
 'Chemical smell. Like hair dye and rotten eggs.',
 'Chemical smell. Like hair dye and rotten eggs.',
 'Industrial',
 'Chemical smell. Like hair dye and rotten eggs.',
 'Chemical smell. Like hair dye and rotten eggs.',
 'Industrial',
 'Industrial',
 'Industrial irritant',
 'Chemical smell. Like hair dye and rotten eggs.',
 'Acrid odor',
 'Acrid odor',
 'Acrid odor',
 'Chemical',
 'Chemical',
 'Chemical',
 'Chemical',
 'Chemical',
 'metallic',
 'Acrid Air Continues To Persist.',
 'Horrible sulfur smell. VERY strong.',
 'sulfur',
 'sulfur',
 'sulfur',
 'sulfur',
 'sulfur',
 'sulfur',
 'Coal / Coke',
 'Industrial, acrid',
 'Acrid and foul smell ',
 'Industrial like burning tire',
 'Not fresh, rotten 

### PRIMERA FORMA: Encontrar palabras mas repetidas

In [8]:
import spacy
from collections import Counter

# Cargar el modelo de lenguaje de spaCy (requiere haber instalado el modelo 'en_core_web_sm')
nlp = spacy.load("en_core_web_sm")

def find_keywords_nlp(smell_list):
    keywords = Counter()

    for smell in smell_list:
        # Tokenizar la cadena de olor usando spaCy
        tokens = nlp(smell.lower())  # Convertir a minúsculas para una coincidencia de casos

        # Lematizar y filtrar los tokens para obtener palabras clave
        for token in tokens:
            # Ignorar los tokens que son signos de puntuación o palabras cortas
            if not token.is_punct and not token.is_stop and len(token.text) > 2:
                # Lematizar el token
                lemma = token.lemma_
                # Agregar el lema a las palabras clave
                keywords[lemma] += 1

    # Obtener las 10 palabras clave más comunes
    top_keywords = keywords.most_common(20)

    return [keyword[0] for keyword in top_keywords]

In [9]:
keywords = find_keywords_nlp(reports_smells_string)
print(keywords)

['industrial', 'sulfur', 'egg', 'rotten', 'smell', 'burn', 'chemical', 'like', 'smoke', 'acrid', 'sulphur', 'coke', 'clairton', 'work', 'woodsmoke', 'exhaust', 'steel', 'wood', 'plastic', 'stench', 'air', 'gas', 'rubber', 'burning', 'sewage', 'coal', 'odor', 'stink', 'mill', 'sulfuric', 'pollution', 'toxic', 'sulphuric', 'strong', 'diesel', 'hydrogen', 'dioxide', 'metallic', 'bad', 'fume', 'heavy', 'plant', 'fire', 'metal', 'sulfide', 'tar', 'sulfurous', 'helicopter', 'smoky', 'smog', 'wind', 'irvin', 'industry', 'asphalt', 'thick', 'sulfer', 'smokey', 'tire', 'house', 'fuel', 'oil', 'rot', 'bitter', 'day', 'sewer', 'garbage', 'awful', 'dirty', 'outside', 'emission', 'come', 'usual', 'skunk', 'car', 'high', 'night', 'morning', 'fart', 'horrible', 'sulfate', 'sour', 'allegheny', 'sweet', 'foul', 'clariton', 'stinky', 'n*sty', 'hospital', 'disgusting', 'gasoline', 'trash', 'general', 'poison', 'putrid', 'mix', 'inside', 'sooty', 'natural', 'window', 'agh']


In [10]:
keywords = find_keywords_nlp(reports_symptoms_string)
print(keywords)

['headache', 'throat', 'eye', 'irritation', 'sore', 'nose', 'burn', 'cough', 'breathe', 'nausea', 'sinus', 'irritated', 'congestion', 'sleep', 'asthma', 'breath', 'difficulty', 'hard', 'nasal', 'chest', 'breathing', 'wake', 'smell', 'lung', 'scratchy', 'sneeze', 'itchy', 'dry', 'shortness', 'trouble', 'bad', 'runny', 'stuffy', 'air', 'outside', 'window', 'pain', 'night', 'insomnia', 'pressure', 'head', 'watery', 'respiratory', 'anxiety', 'anger', 'feel', 'ache', 'day', 'burning', 'house', 'like', 'taste', 'eyes', 'open', 'issue', 'choke', 'hurt', 'symptom', 'disgust', 'sick', 'sting', 'stomach', 'dizziness', 'inside', 'tightness', 'nostril', 'mouth', 'migraine', 'close', 'fatigue', 'irritate', 'mild', 'sleeplessness', 'nauseous', 'passage', 'problem', 'steel', 'time', 'bother', 'depression', 'nauseate', 'slight', 'health', 'go', 'tight', 'difficult', 'water', 'skin', 'odor', 'gag', 'stink', 'frustration', 'short', 'get', 'drip', 'allergy', 'wheezing', 'pollution', 'swollen', 'baby']


#### CAP

In [11]:
def find_keywords_nlp_strings(text_list):
    keywords = Counter()

    for text in text_list:
        # Tokenizar el texto usando spaCy
        doc = nlp(text.lower())  # Convertir a minúsculas para una coincidencia de casos

        # Filtrar sustantivos y sustantivos propios
        for token in doc:
            if token.pos_ in ["NOUN", "PROPN"]:
                # Ignorar stopwords y puntuación
                if not token.is_stop and not token.is_punct:
                    # Contar sustantivos y sustantivos propios que no sean stopwords ni puntuación
                    keywords[token.text] += 1

    # Obtener las 10 palabras clave más comunes
    top_keywords = keywords.most_common(20)
    return [keyword[0] for keyword in top_keywords]

In [12]:
# Encontrar palabras clave en las cadenas de texto
keywords = find_keywords_nlp_strings(reports_smells_string)
print(keywords)

['sulfur', 'eggs', 'smell', 'chemical', 'coke', 'clairton', 'smoke', 'sulphur', 'steel', 'acrid', 'wood', 'exhaust', 'egg', 'stench', 'air', 'plastic', 'rubber', 'burning', 'sewage', 'coal']


### TERCERA FORMA:

In [23]:
smell_keywords = {'industrial','sulphur','egg','burning plastic','smoke','woodsmoke','acrid','coke','tar','smog','chemical','sewage','gas','trash','coal'}

In [24]:
def find_keywords_nlp_strings(text_list):
    keywords = Counter()

    for text in text_list:
        # Tokenizar el texto usando spaCy
        doc = nlp(text.lower())  # Convertir a minúsculas para una coincidencia de casos

        # Filtrar palabras clave relacionadas con olores
        for token in doc:
            # Lematizar el token y filtrar por sustantivos y sustantivos propios
            lemma = token.lemma_
            if lemma in smell_keywords:
                # Contar palabras clave relacionadas con olores
                keywords[lemma] += 1

    # Obtener las 15 palabras clave más comunes
    top_keywords = keywords.most_common(15)
    return [keyword[0] for keyword in top_keywords]

In [25]:
top_keywords = find_keywords_nlp_strings(reports_smells_string)
print(top_keywords)

['industrial', 'egg', 'chemical', 'smoke', 'acrid', 'sulphur', 'coke', 'woodsmoke', 'gas', 'sewage', 'coal', 'tar', 'smog', 'trash']
