In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join

In [2]:
def resampleData(df, method="mean", rule="60Min"):
    df = df.copy(deep=True)
    # Because we want data from the past, so label need to be "right"
    df = epochtimeIdxToDatetime(df).resample(rule, label="right")
    if method == "sum":
        return df.sum()
    elif method == "count":
        return df.count()
    elif method == "mean":
        return df.mean()
    else:
        return df.mean()
    
def epochtimeIdxToDatetime(df):
    """Convert the epochtime index in a pandas dataframe to datetime index"""
    df = df.copy(deep=True)
    df.sort_index(inplace=True)
    df.index = pd.to_datetime(df.index, unit="s", utc=True)
    df.index.name = "DateTime"
    return df

def getAllFileNamesInFolder(path):
    """Return a list of all files in a folder"""
    return  [f for f in listdir(path) if isfile(join(path, f))]

def aggregateSmellData(df):
    if df is None: return None

    # Select only the reports within the range of 3 and 5
    df = df[(df["smell_value"]>=3)&(df["smell_value"]<=5)]

    # If empty, return None
    if df.empty:
        return None

    # Group by zipcode and output a vector with zipcodes
    # TODO: need to merge the reports submitted by the same user in an hour with different weights
    # TODO: for example, starting from the n_th reports, give them discounted weights, like 0.25
    data = []
    for z, df_z in df.groupby("zipcode"):
        # Select only smell values

        df_z = df_z["smell_value"]

        df_z = resampleData(df_z, method="sum")
        print(df_z)
    #     df_z.name = z
    #     data.append(df_z)

    # # Merge all
    # df = data.pop(0).reset_index()
    # while len(data) != 0:
    #     df = pd.merge_ordered(df, data.pop(0).reset_index(), on="DateTime", how="outer", fill_method=None)

    # # Fill NaN with 0
    # df = df.fillna(0)

    # return df

def correlation_analysis_smell(path):
    df_smell_analysis = pd.read_csv(path, index_col="EpochTime")
    df_smell_analysis = aggregateSmellData(df_smell_analysis)
    return df_smell_analysis



In [2]:
import pandas as pd

def resampleData(df, rule="60Min"):
    df = df.copy(deep=True)
    # Convert epochtime index to datetime
    df.index = pd.to_datetime(df.index, unit="s", utc=True)
    # Round datetime index to nearest 60 minutes
    df.index = df.index.round("60min")
    return df

def epochtimeIdxToDatetime(df):
    """Convert the epochtime index in a pandas dataframe to datetime index"""
    df = df.copy(deep=True)
    df.sort_index(inplace=True)
    df.index = pd.to_datetime(df.index, unit="s", utc=True)
    df.index.name = "DateTime"
    return df

def aggregateSmellData(df):
    if df is None: return None

    # If empty, return None
    if df.empty:
        return None

    # Convert epochtime index to datetime
    df = epochtimeIdxToDatetime(df)

    # Select only the reports within the range of 3 and 5
    df = df[(df["smell_value"] >= 3) & (df["smell_value"] <= 5)]

    # Group by datetime and output a vector with smell descriptions
    data = []
    for dt, df_dt in df.groupby(pd.Grouper(freq="60Min")):
        # df_dt = df_dt[["smell_description",'feelings_symptoms', "zipcode"]]  # Include zipcode column
        df_dt = df_dt[["smell_description","zipcode"]]  
        df_dt = resampleData(df_dt)  # Llamada a la función resampleData
        data.append(df_dt)

    # Concatenate all dataframes in the list
    result = pd.concat(data)
    return result

def correlation_analysis_smell(path):
    df_smell_analysis = pd.read_csv(path, index_col="EpochTime")
    df_smell_analysis = aggregateSmellData(df_smell_analysis)
    return df_smell_analysis

In [3]:
path = '../data/complete_smell.csv'
df = correlation_analysis_smell(path)

In [4]:
df


Unnamed: 0_level_0,smell_description,zipcode
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-10-31 23:00:00+00:00,Wood smoke,15218
2016-11-01 03:00:00+00:00,Industrial,15227
2016-11-01 04:00:00+00:00,Industrial,15207
2016-11-01 12:00:00+00:00,"Industrial, sulfur",15216
2016-11-01 13:00:00+00:00,Industrial,15218
...,...,...
2024-02-28 22:00:00+00:00,Woodsmoke,15232
2024-02-29 01:00:00+00:00,"Skunk, burning wood",15210
2024-02-29 02:00:00+00:00,Sewer Gas,15205
2024-02-29 02:00:00+00:00,Woodsmoke,15216


### Se encontraran las KEYWORDS encontradas anteriormente

In [5]:
df.dropna(inplace=True)

In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [7]:
smell_keywords = {'industrial','sulfur','egg','burning','smoke','woodsmoke','acrid','coke','tar','smog','chemical','sewage','gas','trash','coal','diesel'}

In [8]:
def lemmatize(text):
    doc = nlp(text.lower())
    lemmas = []
    for token in doc:
        lemma = token.lemma_
        # Normalizar el término si es 'sulphur'
        if lemma == 'sulphur':
            lemma = 'sulfur'
        lemmas.append(lemma)
        # Si hay una palabra compuesta que contiene una palabra clave, agregarla
        if " " in lemma:
            for word in lemma.split():
                if word in smell_keywords:
                    lemmas.append(word)
    return lemmas

def extract_smells(df):
    df['lemmatized'] = df['smell_description'].apply(lemmatize)
    smells = []
    for lemmas in df['lemmatized']:
        smell_list = []
        for lemma in lemmas:
            if lemma in smell_keywords:
                smell_list.append(lemma)
                if len(smell_list) >= 3:
                    break
        # Eliminar duplicados antes de asignarlos a las columnas smell_1, smell_2 y smell_3
        unique_smells = list(set(smell_list))
        smells.append(unique_smells)
    df['smell_1'] = [smells[i][0] if len(smells[i]) > 0 else None for i in range(len(smells))]
    df['smell_2'] = [smells[i][1] if len(smells[i]) > 1 else None for i in range(len(smells))]
    df['smell_3'] = [smells[i][2] if len(smells[i]) > 2 else None for i in range(len(smells))]
    df = df.drop(columns=['lemmatized'])
    return df

In [9]:
df = extract_smells(df)


In [10]:
df

Unnamed: 0_level_0,smell_description,zipcode,smell_1,smell_2,smell_3
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-10-31 23:00:00+00:00,Wood smoke,15218,smoke,,
2016-11-01 03:00:00+00:00,Industrial,15227,industrial,,
2016-11-01 04:00:00+00:00,Industrial,15207,industrial,,
2016-11-01 12:00:00+00:00,"Industrial, sulfur",15216,sulfur,industrial,
2016-11-01 13:00:00+00:00,Industrial,15218,industrial,,
...,...,...,...,...,...
2024-02-28 22:00:00+00:00,Woodsmoke,15232,woodsmoke,,
2024-02-29 01:00:00+00:00,"Skunk, burning wood",15210,,,
2024-02-29 02:00:00+00:00,Sewer Gas,15205,gas,,
2024-02-29 02:00:00+00:00,Woodsmoke,15216,woodsmoke,,


In [11]:
df_without_duplicates = df.drop_duplicates()

#df_without_duplicates = df.drop_duplicates(subset=['smell_1','smell_2','smell_3'], keep='first', inplace=False)


In [12]:
df_without_duplicates.to_csv( "../data/smell_preprocessed" + ".csv")