# Creación y Limpieza de datos

In [1]:
import pandas as pd
import os
import shutil
import zipfile
import re

# Descargar el archivo zip usando curl
!curl -L -o diseases-symptoms.zip https://www.kaggle.com/api/v1/datasets/download/anshulgupta1502/diseases-symptoms

# Definir la ruta de destino
destination_path = "../datasets"

# Crear la carpeta de destino si no existe
os.makedirs(destination_path, exist_ok=True)

# Extraer el archivo zip en la carpeta de destino
with zipfile.ZipFile("diseases-symptoms.zip", 'r') as zip_ref:
    zip_ref.extractall(destination_path)

# Eliminar dataset no usado
os.remove("../datasets/training_data.csv")

# Mostrar los nombres de los archivos movidos
for file_name in os.listdir(destination_path):
    print(f"Archivo movido: {file_name}")

# Eliminar el archivo zip descargado
os.remove("diseases-symptoms.zip")

print("Direcotio del dataset:", destination_path)

Archivo movido: Diseases_Symptoms.csv
Direcotio del dataset: ../datasets


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 60504  100 60504    0     0  49174      0  0:00:01  0:00:01 --:--:--  107k


## Cargar el dataset

In [2]:
df = pd.read_csv(os.path.join(destination_path, "Diseases_Symptoms.csv"))
df.head()

Unnamed: 0,Code,Name,Symptoms,Treatments
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal"
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ..."


## Creación de dataset de sintomas y de tratamientos

In [3]:
df_symptoms = df.drop(columns=["Treatments"])
df_treatments = df.drop(columns=["Symptoms"])

# Información de los datasets

In [4]:
print("symptoms\n", df_symptoms.head())
print("treatments\n", df_treatments.head())

symptoms
    Code                         Name  \
0     1               Panic disorder   
1     2             Vocal cord polyp   
2     3              Turner syndrome   
3     4               Cryptorchidism   
4     5  Ethylene glycol poisoning-1   

                                            Symptoms  
0  Palpitations, Sweating, Trembling, Shortness o...  
1           Hoarseness, Vocal Changes, Vocal Fatigue  
2  Short stature, Gonadal dysgenesis, Webbed neck...  
3  Absence or undescended testicle(s), empty scro...  
4  Nausea, vomiting, abdominal pain, General mala...  
treatments
    Code                         Name  \
0     1               Panic disorder   
1     2             Vocal cord polyp   
2     3              Turner syndrome   
3     4               Cryptorchidism   
4     5  Ethylene glycol poisoning-1   

                                          Treatments  
0  Antidepressant medications, Cognitive Behavior...  
1       Voice Rest, Speech Therapy, Surgical Removal  
2

In [5]:
df_symptoms.count()

Code        400
Name        400
Symptoms    400
dtype: int64

In [6]:
df_treatments.count()

Code          400
Name          400
Treatments    399
dtype: int64

In [7]:
for column in df_symptoms.columns:
    print(f"{column}: {df_symptoms[column].unique()}")

Code: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 

In [8]:
for column in df_treatments.columns:
    print(f"{column}: {df_treatments[column].unique()}")

Code: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 

## Tranformación

In [9]:
# Pasar los nombres de las columnas a minúsculas
df_symptoms.columns = df_symptoms.columns.str.lower()

# Pasar registros de las columnas a minúsculas
for column in df_symptoms.columns:
    if df_symptoms[column].dtype == "object":
        df_symptoms[column] = df_symptoms[column].str.lower()

In [10]:
# Pasar los nombres de las columnas a minúsculas
df_treatments.columns = df_treatments.columns.str.lower()

# Pasar registros de las columnas a minúsculas
for column in df_treatments.columns:
    if df_treatments[column].dtype == "object":
        df_treatments[column] = df_treatments[column].str.lower()

In [11]:
# Contabilizar registros con nexos distintos de ','
# Contar registros que contienen la palabra "or"
count_or = df_symptoms['symptoms'].str.contains(r'\bor\b', regex=True).sum()

# Contar registros que contienen la palabra "and"
count_and = df_symptoms['symptoms'].str.contains(r'\band\b', regex=True).sum()

# Mostrar los resultados
print(f"Registros que contienen la palabra 'or': {count_or}")
print(f"Registros que contienen la palabra 'and': {count_and}")

Registros que contienen la palabra 'or': 164
Registros que contienen la palabra 'and': 50


In [12]:
# Contabilizar registros con (), que puden ser tanto para aclaraciones como para indicar sintomas, tambein cuantos de ellos tinenen ','
# Contar registros con algo entre paréntesis
count_parentheses = df_symptoms['symptoms'].str.contains(r'\([^)]*\)').sum()

# Contar registros con algo entre paréntesis que contenga una coma
count_commas_in_parentheses = df_symptoms['symptoms'].str.contains(r'\([^)]*,[^)]*\)').sum()

# Mostrar los resultados
print(f"Registros con algo entre paréntesis: {count_parentheses}")
print(f"Registros con algo entre paréntesis que contiene una coma: {count_commas_in_parentheses}")

Registros con algo entre paréntesis: 57
Registros con algo entre paréntesis que contiene una coma: 18


In [13]:
# Contar las ocurrencias de cada valor en la columna 'name'
name_counts = df['Name'].value_counts()

# Filtrar los nombres que aparecen más de una vez
duplicate_names = name_counts[name_counts > 1]

# Mostrar los nombres duplicados y sus conteos
print(duplicate_names)

Name
Sciatica                                 3
Fibromyalgia                             2
Urinary Tract Infection (UTI)            2
Complex Regional Pain Syndrome (CRPS)    2
Endometriosis                            2
Dermatitis due to Sun Exposure           2
Mucocele                                 2
Name: count, dtype: int64


In [14]:
# Contar las ocurrencias de cada valor en la columna 'name'
name_counts = df['Name'].value_counts()

# Filtrar los nombres que aparecen más de una vez
duplicate_names = name_counts[name_counts > 1].index

# Filtrar los registros que tienen nombres duplicados
duplicate_records = df[df['Name'].isin(duplicate_names)]

# Mostrar los registros duplicados
duplicate_records.sort_values('Name')

# Existen registros duplicados, pero tanto los sintomas como los tratamientos son diferentes, por lo que no se eliminaran

Unnamed: 0,Code,Name,Symptoms,Treatments
52,53,Complex Regional Pain Syndrome (CRPS),"Severe and persistent pain, changes in skin co...","Medications (such as pain relievers, corticost..."
378,379,Complex Regional Pain Syndrome (CRPS),"Intense burning pain, swelling, changes in ski...","Physical therapy, medications, nerve blocks"
216,217,Dermatitis due to Sun Exposure,"Redness, itching, rash, blistering","Topical corticosteroids, moisturizers, avoidin..."
389,390,Dermatitis due to Sun Exposure,"Sunburn-like rash, redness, itching","Moisturizers, cool compresses, topical cortico..."
162,163,Endometriosis,"Pelvic pain (during menstruation, intercourse,...",Pain management (such as nonsteroidal anti-inf...
380,381,Endometriosis,"Pelvic pain, painful periods, infertility","Pain medications, hormonal therapies, surgery"
50,51,Fibromyalgia,"Widespread musculoskeletal pain, fatigue, slee...","Medications (such as pain relievers, antidepre..."
80,81,Fibromyalgia,"Widespread musculoskeletal pain, fatigue, slee...",Multidisciplinary approach: Medications for pa...
18,19,Mucocele,Painless fluid-filled swelling in the oral cavity,"Observation, Surgical removal if necessary"
27,28,Mucocele,"Painless swelling, usually on the lower lip or...","Observation, surgical removal if necessary"


In [15]:
df_symptoms.head()

Unnamed: 0,code,name,symptoms
0,1,panic disorder,"palpitations, sweating, trembling, shortness o..."
1,2,vocal cord polyp,"hoarseness, vocal changes, vocal fatigue"
2,3,turner syndrome,"short stature, gonadal dysgenesis, webbed neck..."
3,4,cryptorchidism,"absence or undescended testicle(s), empty scro..."
4,5,ethylene glycol poisoning-1,"nausea, vomiting, abdominal pain, general mala..."


In [16]:
df_treatments.head()

Unnamed: 0,code,name,treatments
0,1,panic disorder,"antidepressant medications, cognitive behavior..."
1,2,vocal cord polyp,"voice rest, speech therapy, surgical removal"
2,3,turner syndrome,"growth hormone therapy, estrogen replacement t..."
3,4,cryptorchidism,observation and monitoring (in cases of mild o...
4,5,ethylene glycol poisoning-1,"supportive measures, gastric decontamination, ..."


In [17]:
for column in df_symptoms.columns:
    print(f"{column}: {df_symptoms[column].unique()}")

code: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 

In [18]:
for column in df_treatments.columns:
    print(f"{column}: {df_treatments[column].unique()}")

code: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 

In [19]:
# La coma sera usado como separador de valores, para ello tambie cambiaremos ' or ' y ' and ' por ',' 
def process_multivalued_column(df, column_name):
    unique_values = set()
    df[column_name] = df[column_name].fillna('')

    # Reemplazar ";" por ","
    df[column_name] = df[column_name].str.replace(";", ",")

    # Reemplazar " and " por ","
    df[column_name] = df[column_name].str.replace(r"\band\b", ",", regex=True)

    # Reemplazar " or " por ","
    df[column_name] = df[column_name].str.replace(r"\bor\b", ",", regex=True)

    df[column_name] = df[column_name].str.replace(r"\s*,\s*", ",", regex=True)
    df[column_name] = df[column_name].str.replace(r"\s+", " ", regex=True)

    # Reemplazar comas dentro de paréntesis por puntos y comas
    df[column_name] = df[column_name].apply(lambda x: re.sub(r'\(([^)]+)\)', lambda m: m.group(0).replace(',', ';'), x))

    df[column_name] = df[column_name].fillna('')
    for values in df[column_name]:
        unique_values.update(value.strip() for value in values.split(','))

    unique_values = sorted(unique_values)
    for value in unique_values:
        df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)

    # Eliminar la columna vacía si existe
    if '' in df.columns:
        df = df.drop(columns=[''])

    return df.drop(columns=[column_name])


In [20]:
# Procesar los síntomas
df_symptoms = process_multivalued_column(df_symptoms, "symptoms")

# Guardar los datasets en archivos CSV separados
df_symptoms.to_csv("../datasets/df_Diseases_Symptoms_Processed.csv", index=False)

# Mensaje final indicando que el proceso ha finalizado
print("Archivos procesados y guardados correctamente.") 

  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[col

Archivos procesados y guardados correctamente.


In [21]:

# Procesar los tratamientos
df_treatments = process_multivalued_column(df_treatments, "treatments")

# Guardar los datasets en archivos CSV separados
df_treatments.to_csv("../datasets/df_Diseases_Treatments_Processed.csv", index=False)

# Mensaje final indicando que el proceso ha finalizado
print("Archivos procesados y guardados correctamente.") 

  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[column_name].apply(lambda x: 1 if value in x else 0)
  df[value] = df[col

Archivos procesados y guardados correctamente.


In [22]:
df_symptoms.head()

Unnamed: 0,code,name,abdominal,abdominal cramps,abdominal distension,abdominal pain,abdominal pain (often in the upper right quadrant),abdominal swelling,abdominal swelling (ascites),abdominal tenderness,...,widespread musculoskeletal pain,with associated swelling,withdrawal symptoms (such as irritability;insomnia) when attempting to stop,withdrawal symptoms (such as nausea;muscle aches;anxiety) when attempting to stop,withdrawal symptoms (such as rebound anxiety;insomnia;tremors) when attempting to stop,withdrawal symptoms (such as tremors;anxiety;sweating) when attempting to stop,worsens with deep breaths,worthlessness,yellowing of the skin,yellowish
0,1,panic disorder,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,vocal cord polyp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,turner syndrome,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,cryptorchidism,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,ethylene glycol poisoning-1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
df_treatments.head()

Unnamed: 0,code,name,abscess,accommodations,acidic foods,activated charcoal (if indicated),activated charcoal administration,acupuncture,address underlying causes. it may include: lifestyle modifications (e.g.;avoiding alcohol;maintaining a healthy diet),addressing any medical complications,...,weight management,which can be done in the early stages of pregnancy. surgical abortion: a healthcare professional performs a procedure to remove the pregnancy,which can involve suction aspiration,wigs,wound care,wound care (cleaning;dressing changes),wound debridement (if necessary),wound dressings,wrist splints,x-ray
0,1,panic disorder,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,vocal cord polyp,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,turner syndrome,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,cryptorchidism,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,ethylene glycol poisoning-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
df_symptoms.columns

Index(['code', 'name', 'abdominal', 'abdominal cramps', 'abdominal distension',
       'abdominal pain', 'abdominal pain (often in the upper right quadrant)',
       'abdominal swelling', 'abdominal swelling (ascites)',
       'abdominal tenderness',
       ...
       'widespread musculoskeletal pain', 'with associated swelling',
       'withdrawal symptoms (such as irritability;insomnia) when attempting to stop',
       'withdrawal symptoms (such as nausea;muscle aches;anxiety) when attempting to stop',
       'withdrawal symptoms (such as rebound anxiety;insomnia;tremors) when attempting to stop',
       'withdrawal symptoms (such as tremors;anxiety;sweating) when attempting to stop',
       'worsens with deep breaths', 'worthlessness', 'yellowing of the skin',
       'yellowish'],
      dtype='object', length=997)

In [25]:
df_treatments.columns

Index(['code', 'name', 'abscess', 'accommodations', 'acidic foods',
       'activated charcoal (if indicated)',
       'activated charcoal administration', 'acupuncture',
       'address underlying causes. it may include: lifestyle modifications (e.g.;avoiding alcohol;maintaining a healthy diet)',
       'addressing any medical complications',
       ...
       'weight management',
       'which can be done in the early stages of pregnancy. surgical abortion: a healthcare professional performs a procedure to remove the pregnancy',
       'which can involve suction aspiration', 'wigs', 'wound care',
       'wound care (cleaning;dressing changes)',
       'wound debridement (if necessary)', 'wound dressings', 'wrist splints',
       'x-ray'],
      dtype='object', length=1218)

In [26]:
df_symptoms.shape

(400, 997)

In [27]:
df_treatments.shape

(400, 1218)

In [28]:
# Elinación del dataset original
os.remove("../datasets/Diseases_Symptoms.csv")