## Dependencias

In [1]:
import numpy as np
import pandas as pd
import json #JSON = Java Script Object Notation
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,FLOAT,INTEGER,DATE,CHAR,DATETIME
import os

pd.set_option('display.max_columns',None)

In [2]:
# Funcion que nos da  los missings en un df
def missing_zero_values_table(df):

        zero_val = (df == 0.00).astype(int).sum(axis=0)

        mis_val = df.isnull().sum()

        mis_val_percent = 100 * df.isnull().sum() / len(df)

        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)

        mz_table = mz_table.rename(

        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})

        mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']

        mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)

        mz_table['Data Type'] = df.dtypes

        mz_table = mz_table[

            mz_table.iloc[:,1] != 0].sort_values(

        '% of Total Values', ascending=False).round(1)

        print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      

            "There are " + str(mz_table.shape[0]) +

              " columns that have missing values.")

        return mz_table

## Credenciales

In [3]:
creds = json.load(open('credenciales1.json','rb'))

## Crear conexión a base de datos

In [6]:
url = f"mysql+pymysql://{creds['user']}:{creds['password']}@{creds['servidor']}/roller_coaster"
cnx = create_engine(url,encoding='utf8')
cnx = cnx.connect()

In [7]:
cnx.closed

False

## Lectura y limpieza de datos

In [8]:
os.getcwd()

'/home/aura/ClaseBDD'

In [9]:
ruta = os.getcwd()

In [10]:
datos = pd.read_csv(ruta + '/safer-parks-accident-dataset.csv')

In [11]:
datos.shape

(14884, 24)

In [12]:
datos.head(3)

Unnamed: 0,acc_id,acc_date,acc_state,acc_city,fix_port,source,bus_type,industry_sector,device_category,device_type,tradename_or_generic,manufacturer,num_injured,age_youngest,gender,acc_desc,injury_desc,report,category,mechanical,op_error,employee,notes,year
0,1007272,2009-04-05,CA,Anaheim,F,California Division of Occupational Safety and...,Amusement park,amusement ride,cars & track rides,Track ride,dark ride,In-house,1,44.0,F,Patron's attorney stated that she had a right ...,Right vertegral artery tear,0,Illness or neurological symptoms,False,False,False,0,2009
1,918972,2009-02-05,FL,Naples,P,Florida Dept. of Agriculture,Carnival or rental,amusement ride,pendulum,Flying carpet ride,Avalanche,"Wisdom Industries, Ltd.",1,17.0,F,17-yo female was ejected from the ride mid-cyl...,Multiple broken ribs and bruised lung,https://saferparksdata.org/sites/default/files...,Fall: ejection/fall from ride,True,False,False,"This was the third ejection on this ride, unde...",2009
2,919123,2008-11-08,CA,Buena Park,F,California Division of Occupational Safety and...,Amusement park,amusement ride,coaster,Coaster - steel,Boomerang,Vekoma,1,19.0,M,Guest injured right hand while riding.,Injured hand. Treated at local hospital.,0,Unknown (not enough info),False,False,False,0,2008


In [13]:
missing_zero_values_table(datos)

Your selected dataframe has 24 columns and 14884 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


ENTIDADES
*Accidente
*Juego
*lugar
*Accidentado
*Fuente

In [14]:
datos.columns.values

array(['acc_id', 'acc_date', 'acc_state', 'acc_city', 'fix_port',
       'source', 'bus_type', 'industry_sector', 'device_category',
       'device_type', 'tradename_or_generic', 'manufacturer',
       'num_injured', 'age_youngest', 'gender', 'acc_desc', 'injury_desc',
       'report', 'category', 'mechanical', 'op_error', 'employee',
       'notes', 'year'], dtype=object)

## Separación de Entidades

### Entidad juego

In [45]:
entjuego = datos[['bus_type', 'industry_sector', 'device_category','device_type', 
                  'tradename_or_generic', 'manufacturer']].drop_duplicates().reset_index(drop=True)
entjuego.insert(0,'roller_coasterId',entjuego.index+1)
entjuego

Unnamed: 0,roller_coasterId,bus_type,industry_sector,device_category,device_type,tradename_or_generic,manufacturer
0,1,Amusement park,amusement ride,cars & track rides,Track ride,dark ride,In-house
1,2,Carnival or rental,amusement ride,pendulum,Flying carpet ride,Avalanche,"Wisdom Industries, Ltd."
2,3,Amusement park,amusement ride,coaster,Coaster - steel,Boomerang,Vekoma
3,4,Amusement park,amusement ride,spinning,Spinning cups/tubs,tea cups,In-house
4,5,Amusement park,amusement ride,spinning,Carousel,carousel,Morgan
...,...,...,...,...,...,...,...
1333,1334,Amusement park,unknown,unknown,Unknown,unidentified,Puentas Design
1334,1335,"Mall, store or restaurant",amusement ride,spinning,Ferris/gondola wheel,Vertical Lift Wheel,Mollini
1335,1336,Carnival or rental,amusement ride,cars & track rides,Train/tram,train,"Chance Manufacturing, Inc."
1336,1337,Carnival or rental,unknown,unknown,Unknown,unidentified,Thomas Brothers


In [46]:
missing_zero_values_table(entjuego)

Your selected dataframe has 7 columns and 1338 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


In [47]:
entjuego['manufacturer'].map(len).max()

41

In [48]:
entjuego.to_sql(con=cnx,
                   name='roller_coaster',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'roller_coasterId':INTEGER,
                          'bus_type':VARCHAR(29),
                          'industry_sector':VARCHAR(14),
                          'device_category':VARCHAR(18),
                          'device_type':VARCHAR(26),
                          'tradename_or_generic':VARCHAR(29),
                          'manufacturer':VARCHAR(41)})

1338

In [50]:
entjuego_c = entjuego.copy()
entjuego['llavejuego']=entjuego['bus_type']+entjuego['industry_sector']+entjuego['device_category']+entjuego['device_type']+entjuego['tradename_or_generic']+entjuego['manufacturer']
entjuego_c

Unnamed: 0,roller_coasterId,bus_type,industry_sector,device_category,device_type,tradename_or_generic,manufacturer,llavejuego
0,1,Amusement park,amusement ride,cars & track rides,Track ride,dark ride,In-house,Amusement parkamusement ridecars & track rides...
1,2,Carnival or rental,amusement ride,pendulum,Flying carpet ride,Avalanche,"Wisdom Industries, Ltd.",Carnival or rentalamusement ridependulumFlying...
2,3,Amusement park,amusement ride,coaster,Coaster - steel,Boomerang,Vekoma,Amusement parkamusement ridecoasterCoaster - s...
3,4,Amusement park,amusement ride,spinning,Spinning cups/tubs,tea cups,In-house,Amusement parkamusement ridespinningSpinning c...
4,5,Amusement park,amusement ride,spinning,Carousel,carousel,Morgan,Amusement parkamusement ridespinningCarouselca...
...,...,...,...,...,...,...,...,...
1333,1334,Amusement park,unknown,unknown,Unknown,unidentified,Puentas Design,Amusement parkunknownunknownUnknownunidentifie...
1334,1335,"Mall, store or restaurant",amusement ride,spinning,Ferris/gondola wheel,Vertical Lift Wheel,Mollini,"Mall, store or restaurantamusement ridespinnin..."
1335,1336,Carnival or rental,amusement ride,cars & track rides,Train/tram,train,"Chance Manufacturing, Inc.",Carnival or rentalamusement ridecars & track r...
1336,1337,Carnival or rental,unknown,unknown,Unknown,unidentified,Thomas Brothers,Carnival or rentalunknownunknownUnknownunident...


In [20]:
datos.columns.values

array(['acc_id', 'acc_date', 'acc_state', 'acc_city', 'fix_port',
       'source', 'bus_type', 'industry_sector', 'device_category',
       'device_type', 'tradename_or_generic', 'manufacturer',
       'num_injured', 'age_youngest', 'gender', 'acc_desc', 'injury_desc',
       'report', 'category', 'mechanical', 'op_error', 'employee',
       'notes', 'year'], dtype=object)

### Entidad lugar

In [21]:
entlugar = datos[['acc_state', 'acc_city']].drop_duplicates().reset_index(drop=True)
entlugar.columns = ['state','city']
entlugar.insert(0,'placeId',entlugar.index+1)
entlugar

Unnamed: 0,placeId,state,city
0,1,CA,Anaheim
1,2,FL,Naples
2,3,CA,Buena Park
3,4,CA,Los Angeles
4,5,CA,Riverside
...,...,...,...
534,535,FL,Everglades City
535,536,FL,Davie
536,537,FL,Weston
537,538,FL,Okeechobee


In [22]:
entlugar['city'].map(len).max()

20

In [23]:
entlugar.to_sql(con=cnx,
                   name='place',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'placeId':INTEGER,
                          'state':VARCHAR(2),
                          'city':VARCHAR(20)})

539

In [24]:
datos.columns.values

array(['acc_id', 'acc_date', 'acc_state', 'acc_city', 'fix_port',
       'source', 'bus_type', 'industry_sector', 'device_category',
       'device_type', 'tradename_or_generic', 'manufacturer',
       'num_injured', 'age_youngest', 'gender', 'acc_desc', 'injury_desc',
       'report', 'category', 'mechanical', 'op_error', 'employee',
       'notes', 'year'], dtype=object)

In [84]:
entlugar_c = entlugar.copy()
entlugar_c['llavelugar'] = entlugar_c['state']+entlugar_c['city']
entlugar_c

Unnamed: 0,placeId,state,city,llavelugar
0,1,CA,Anaheim,CAAnaheim
1,2,FL,Naples,FLNaples
2,3,CA,Buena Park,CABuena Park
3,4,CA,Los Angeles,CALos Angeles
4,5,CA,Riverside,CARiverside
...,...,...,...,...
534,535,FL,Everglades City,FLEverglades City
535,536,FL,Davie,FLDavie
536,537,FL,Weston,FLWeston
537,538,FL,Okeechobee,FLOkeechobee


### Entidad accidentado

In [25]:
entaccidentado = datos[['num_injured', 'age_youngest', 'gender', 'injury_desc', 'category']].drop_duplicates().reset_index(drop=True)
entaccidentado .insert(0,'injuredId',entaccidentado .index+1)
entaccidentado 

Unnamed: 0,injuredId,num_injured,age_youngest,gender,injury_desc,category
0,1,1,44.0,F,Right vertegral artery tear,Illness or neurological symptoms
1,2,1,17.0,F,Multiple broken ribs and bruised lung,Fall: ejection/fall from ride
2,3,1,19.0,M,Injured hand. Treated at local hospital.,Unknown (not enough info)
3,4,1,51.0,F,Nosebleed. Treated at local hospital.,Illness or neurological symptoms
4,5,1,38.0,F,"Dizziness, high blood pressure. Treated at lo...",Illness or neurological symptoms
...,...,...,...,...,...,...
12056,12057,5,0.0,M,"Five patrons suffered injuries, including thre...",Derailment
12057,12058,1,0.0,F,Swelling,Unknown (not enough info)
12058,12059,1,0.0,F,Strain,Load/Unload: scrape or stumble
12059,12060,1,15.0,M,Fatal,Fall: ejection/fall from ride


In [26]:
entaccidentado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12061 entries, 0 to 12060
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   injuredId     12061 non-null  int64  
 1   num_injured   12061 non-null  int64  
 2   age_youngest  12061 non-null  float64
 3   gender        12061 non-null  object 
 4   injury_desc   12061 non-null  object 
 5   category      12061 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 565.5+ KB


In [27]:
entaccidentado['category'].map(len).max()

54

In [28]:
entaccidentado.to_sql(con=cnx,
                   name='injured',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'injuredId':INTEGER,
                          'num_injured':INTEGER,
                          'age_youngest':FLOAT,
                          'gender':VARCHAR(1),
                          'injury_desc':VARCHAR(683),
                          'category':VARCHAR(54)})

12061

In [29]:
entaccidentado.columns.values

array(['injuredId', 'num_injured', 'age_youngest', 'gender',
       'injury_desc', 'category'], dtype=object)

In [52]:
entaccidentado_c = entaccidentado.copy()
entaccidentado_c['llaveaccidentado']=entaccidentado_c['num_injured'].map(str)+entaccidentado_c['age_youngest'].map(str)+entaccidentado_c['gender']+entaccidentado_c['injury_desc']+entaccidentado_c['category']
entaccidentado_c

Unnamed: 0,injuredId,num_injured,age_youngest,gender,injury_desc,category,llaveaccidentado
0,1,1,44.0,F,Right vertegral artery tear,Illness or neurological symptoms,144.0FRight vertegral artery tearIllness or ne...
1,2,1,17.0,F,Multiple broken ribs and bruised lung,Fall: ejection/fall from ride,117.0FMultiple broken ribs and bruised lungFal...
2,3,1,19.0,M,Injured hand. Treated at local hospital.,Unknown (not enough info),119.0MInjured hand. Treated at local hospital...
3,4,1,51.0,F,Nosebleed. Treated at local hospital.,Illness or neurological symptoms,151.0FNosebleed. Treated at local hospital.Il...
4,5,1,38.0,F,"Dizziness, high blood pressure. Treated at lo...",Illness or neurological symptoms,"138.0FDizziness, high blood pressure. Treated..."
...,...,...,...,...,...,...,...
12056,12057,5,0.0,M,"Five patrons suffered injuries, including thre...",Derailment,"50.0MFive patrons suffered injuries, including..."
12057,12058,1,0.0,F,Swelling,Unknown (not enough info),10.0FSwellingUnknown (not enough info)
12058,12059,1,0.0,F,Strain,Load/Unload: scrape or stumble,10.0FStrainLoad/Unload: scrape or stumble
12059,12060,1,15.0,M,Fatal,Fall: ejection/fall from ride,115.0MFatalFall: ejection/fall from ride


In [31]:
entaccidentado_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12061 entries, 0 to 12060
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   injuredId         12061 non-null  int64  
 1   num_injured       12061 non-null  int64  
 2   age_youngest      12061 non-null  float64
 3   gender            12061 non-null  object 
 4   injury_desc       12061 non-null  object 
 5   category          12061 non-null  object 
 6   llaveaccidentado  12061 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 659.7+ KB


### Entidad fuente

In [32]:
entfuente = datos[['source']].drop_duplicates().reset_index(drop=True)
entfuente.insert(0,'sourceId',entfuente.index+1)
entfuente.head()

Unnamed: 0,sourceId,source
0,1,California Division of Occupational Safety and...
1,2,Florida Dept. of Agriculture
2,3,"Wisconsin Dept. of Commerce, Safety and Buildi..."
3,4,Media Report
4,5,Colorado Division of Oil and Public Safety


In [33]:
entfuente['source'].map(len).max()

57

In [34]:
entfuente.to_sql(con=cnx,
                   name='source',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'sourceId':INTEGER,
                          'source':VARCHAR(57)})

31

In [35]:
datos.head()

Unnamed: 0,acc_id,acc_date,acc_state,acc_city,fix_port,source,bus_type,industry_sector,device_category,device_type,tradename_or_generic,manufacturer,num_injured,age_youngest,gender,acc_desc,injury_desc,report,category,mechanical,op_error,employee,notes,year
0,1007272,2009-04-05,CA,Anaheim,F,California Division of Occupational Safety and...,Amusement park,amusement ride,cars & track rides,Track ride,dark ride,In-house,1,44.0,F,Patron's attorney stated that she had a right ...,Right vertegral artery tear,0,Illness or neurological symptoms,False,False,False,0,2009
1,918972,2009-02-05,FL,Naples,P,Florida Dept. of Agriculture,Carnival or rental,amusement ride,pendulum,Flying carpet ride,Avalanche,"Wisdom Industries, Ltd.",1,17.0,F,17-yo female was ejected from the ride mid-cyl...,Multiple broken ribs and bruised lung,https://saferparksdata.org/sites/default/files...,Fall: ejection/fall from ride,True,False,False,"This was the third ejection on this ride, unde...",2009
2,919123,2008-11-08,CA,Buena Park,F,California Division of Occupational Safety and...,Amusement park,amusement ride,coaster,Coaster - steel,Boomerang,Vekoma,1,19.0,M,Guest injured right hand while riding.,Injured hand. Treated at local hospital.,0,Unknown (not enough info),False,False,False,0,2008
3,919095,2008-11-01,CA,Anaheim,F,California Division of Occupational Safety and...,Amusement park,amusement ride,spinning,Spinning cups/tubs,tea cups,In-house,1,51.0,F,"Guest stated when she sat down in the vehicle,...",Nosebleed. Treated at local hospital.,0,Illness or neurological symptoms,False,False,False,0,2008
4,919094,2008-10-29,CA,Anaheim,F,California Division of Occupational Safety and...,Amusement park,amusement ride,spinning,Spinning cups/tubs,tea cups,In-house,1,38.0,F,Guest stated she felt lightheaded and experien...,"Dizziness, high blood pressure. Treated at lo...",0,Illness or neurological symptoms,False,False,False,0,2008


In [36]:
datos['category'].unique()

array(['Illness or neurological symptoms',
       'Fall: ejection/fall from ride', 'Unknown (not enough info)',
       'Entrapment or pinch-point', 'Body pain (normal motion)',
       'Injured in queue or exit', 'Load/Unload: scrape or stumble',
       'Load/Unload: injured when vehicle moved',
       'Impact: hit something within ride vehicle',
       'Illness: Seizure or LOC',
       'Fall: patron fell from seat, but not carrier',
       'Fall: patron fell off inner tube, mat or board',
       'Hyperextension or dislocation',
       'Load/Unload: hit or pinched by restraint',
       'Fall: in climb or play area',
       'Collision: patrons collided (participatory)',
       'Impact: hit something in participatory attraction',
       'Impact: person hit by ride', 'Derailment',
       'Collision: operator-controlled vehicles',
       'Injured by foreign object', 'Restraint too tight',
       'Collision: patrons collided within vehicle',
       'Abrupt stop/drop/lurch',
       'Impact: e

In [37]:
datos.head(2)

Unnamed: 0,acc_id,acc_date,acc_state,acc_city,fix_port,source,bus_type,industry_sector,device_category,device_type,tradename_or_generic,manufacturer,num_injured,age_youngest,gender,acc_desc,injury_desc,report,category,mechanical,op_error,employee,notes,year
0,1007272,2009-04-05,CA,Anaheim,F,California Division of Occupational Safety and...,Amusement park,amusement ride,cars & track rides,Track ride,dark ride,In-house,1,44.0,F,Patron's attorney stated that she had a right ...,Right vertegral artery tear,0,Illness or neurological symptoms,False,False,False,0,2009
1,918972,2009-02-05,FL,Naples,P,Florida Dept. of Agriculture,Carnival or rental,amusement ride,pendulum,Flying carpet ride,Avalanche,"Wisdom Industries, Ltd.",1,17.0,F,17-yo female was ejected from the ride mid-cyl...,Multiple broken ribs and bruised lung,https://saferparksdata.org/sites/default/files...,Fall: ejection/fall from ride,True,False,False,"This was the third ejection on this ride, unde...",2009


In [38]:
datos.columns.values

array(['acc_id', 'acc_date', 'acc_state', 'acc_city', 'fix_port',
       'source', 'bus_type', 'industry_sector', 'device_category',
       'device_type', 'tradename_or_generic', 'manufacturer',
       'num_injured', 'age_youngest', 'gender', 'acc_desc', 'injury_desc',
       'report', 'category', 'mechanical', 'op_error', 'employee',
       'notes', 'year'], dtype=object)

### Entidad error

In [39]:
enterror = datos[['mechanical', 'op_error', 'employee']].drop_duplicates().reset_index(drop=True)
enterror.insert(0,'errorId',enterror.index+1)
enterror

Unnamed: 0,errorId,mechanical,op_error,employee
0,1,False,False,False
1,2,True,False,False
2,3,False,True,False
3,4,True,True,False
4,5,False,True,True
5,6,True,False,True
6,7,False,False,True


In [40]:
enterror['mechanical'].unique

<bound method Series.unique of 0    False
1     True
2    False
3     True
4    False
5     True
6    False
Name: mechanical, dtype: bool>

In [41]:
enterror.to_sql(con=cnx,
                   name='error',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'errorId':INTEGER,
                          'mechanical':VARCHAR(6),
                          'op_error':VARCHAR(6),
                          'employee':VARCHAR(6)})

7

In [62]:
enterror_c = enterror.copy()
enterror_c['llaveerror'] = enterror_c['mechanical'].map(str)+enterror_c['op_error'].map(str)+enterror_c['employee'].map(str)
enterror_c

Unnamed: 0,errorId,mechanical,op_error,employee,llaveerror
0,1,False,False,False,FalseFalseFalse
1,2,True,False,False,TrueFalseFalse
2,3,False,True,False,FalseTrueFalse
3,4,True,True,False,TrueTrueFalse
4,5,False,True,True,FalseTrueTrue
5,6,True,False,True,TrueFalseTrue
6,7,False,False,True,FalseFalseTrue


In [60]:
enterror_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   errorId     7 non-null      int64
 1   mechanical  7 non-null      bool 
 2   op_error    7 non-null      bool 
 3   employee    7 non-null      bool 
dtypes: bool(3), int64(1)
memory usage: 205.0 bytes


### Entidad accidente

In [42]:
datos.head(4)

Unnamed: 0,acc_id,acc_date,acc_state,acc_city,fix_port,source,bus_type,industry_sector,device_category,device_type,tradename_or_generic,manufacturer,num_injured,age_youngest,gender,acc_desc,injury_desc,report,category,mechanical,op_error,employee,notes,year
0,1007272,2009-04-05,CA,Anaheim,F,California Division of Occupational Safety and...,Amusement park,amusement ride,cars & track rides,Track ride,dark ride,In-house,1,44.0,F,Patron's attorney stated that she had a right ...,Right vertegral artery tear,0,Illness or neurological symptoms,False,False,False,0,2009
1,918972,2009-02-05,FL,Naples,P,Florida Dept. of Agriculture,Carnival or rental,amusement ride,pendulum,Flying carpet ride,Avalanche,"Wisdom Industries, Ltd.",1,17.0,F,17-yo female was ejected from the ride mid-cyl...,Multiple broken ribs and bruised lung,https://saferparksdata.org/sites/default/files...,Fall: ejection/fall from ride,True,False,False,"This was the third ejection on this ride, unde...",2009
2,919123,2008-11-08,CA,Buena Park,F,California Division of Occupational Safety and...,Amusement park,amusement ride,coaster,Coaster - steel,Boomerang,Vekoma,1,19.0,M,Guest injured right hand while riding.,Injured hand. Treated at local hospital.,0,Unknown (not enough info),False,False,False,0,2008
3,919095,2008-11-01,CA,Anaheim,F,California Division of Occupational Safety and...,Amusement park,amusement ride,spinning,Spinning cups/tubs,tea cups,In-house,1,51.0,F,"Guest stated when she sat down in the vehicle,...",Nosebleed. Treated at local hospital.,0,Illness or neurological symptoms,False,False,False,0,2008


In [43]:
datos.columns.values

array(['acc_id', 'acc_date', 'acc_state', 'acc_city', 'fix_port',
       'source', 'bus_type', 'industry_sector', 'device_category',
       'device_type', 'tradename_or_generic', 'manufacturer',
       'num_injured', 'age_youngest', 'gender', 'acc_desc', 'injury_desc',
       'report', 'category', 'mechanical', 'op_error', 'employee',
       'notes', 'year'], dtype=object)

In [88]:
entaccidente = datos[['acc_id', 'acc_date', 'acc_state', 'acc_city', 'fix_port','source', 'bus_type', 
                      'industry_sector', 'device_category', 'device_type', 'tradename_or_generic', 
                      'manufacturer', 'num_injured', 'age_youngest', 'gender', 'acc_desc', 'injury_desc',
                      'report', 'category', 'mechanical', 'op_error', 'employee', 'notes', 'year']] #Cree una copia de la tabla
#
entaccidente['llavelugar'] = entaccidente['acc_state']+entaccidente['acc_city']
#Uni la tabla accidente con la tabla lugar
entaccidente = pd.merge(entaccidente, entlugar_c, how='left', right_on='llavelugar', left_on='llavelugar')
entaccidente = entaccidente.drop([ 'state', 'city', 'acc_state', 'acc_city', 'fix_port', 'llavelugar'], axis=1)
#Uni la tabla accidente con la tabla fuente
entaccidente = pd.merge(entaccidente, entfuente, how='left', right_on='source', left_on='source')
entaccidente = entaccidente.drop([ 'source'], axis=1)
#Cree la llave juego para unir
entaccidente['llavejuego']=entaccidente['bus_type']+entaccidente['industry_sector']+entaccidente['device_category']+entaccidente['device_type']+entaccidente['tradename_or_generic']+entaccidente['manufacturer']
#uni la tabla accidente con la tabla juego
entaccidente = pd.merge(entaccidente, entjuego_c, how='left', right_on='llavejuego', left_on='llavejuego')
entaccidente = entaccidente.drop(['bus_type_y','industry_sector_y','device_category_y','device_type_y',
                                  'tradename_or_generic_y','manufacturer_y','llavejuego','bus_type_x',
                                  'industry_sector_x','device_category_x','device_type_x','tradename_or_generic_x',
                                  'manufacturer_x'], axis=1)
#Cree la llave accidentado
entaccidente['llaveaccidentado']=entaccidente['num_injured'].map(str)+entaccidente['age_youngest'].map(str)+entaccidente['gender']+entaccidente['injury_desc']+entaccidente['category']
#Uni la tabla accidentado con la tabla accidente
entaccidente = pd.merge(entaccidente, entaccidentado_c, how='left', right_on='llaveaccidentado', left_on='llaveaccidentado')
entaccidente = entaccidente.drop(['num_injured_y','age_youngest_y','gender_y','injury_desc_y','category_y', 'num_injured_x','age_youngest_x','gender_x','acc_desc','injury_desc_x', 'llaveaccidentado', 'category_x'], axis=1)
#Cree la llave error
entaccidente['llaveerror'] = entaccidente['mechanical'].map(str)+entaccidente['op_error'].map(str)+entaccidente['employee'].map(str)
#Uni la tabla accidente con la tabla error 
entaccidente = pd.merge(entaccidente, enterror_c, how='left', right_on='llaveerror', left_on='llaveerror')
entaccidente = entaccidente.drop(['mechanical_x','op_error_x','employee_x','llaveerror','mechanical_y','op_error_y','employee_y'], axis=1)

entaccidente.head()

Unnamed: 0,acc_id,acc_date,report,notes,year,placeId,sourceId,roller_coasterId,injuredId,errorId
0,1007272,2009-04-05,0,0,2009,1,1,1,1,1
1,918972,2009-02-05,https://saferparksdata.org/sites/default/files...,"This was the third ejection on this ride, unde...",2009,2,2,2,2,2
2,919123,2008-11-08,0,0,2008,3,1,3,3,1
3,919095,2008-11-01,0,0,2008,1,1,4,4,1
4,919094,2008-10-29,0,0,2008,1,1,4,5,1


In [80]:
datos[datos['acc_id']==898647]

Unnamed: 0,acc_id,acc_date,acc_state,acc_city,fix_port,source,bus_type,industry_sector,device_category,device_type,tradename_or_generic,manufacturer,num_injured,age_youngest,gender,acc_desc,injury_desc,report,category,mechanical,op_error,employee,notes,year
14883,898647,1986-01-03,TX,0,F,Texas Dept. of Insurance,Family entertainment center,recreation,go-kart,Go-kart,go kart,Soli,1,30.0,F,Go kart hit from behind while stopped.,Pain,0,Collision: patron-controlled vehicles,False,False,False,0,1986


In [70]:
entaccidente['notes'].map(len).max()

2327

In [89]:
entaccidente.to_sql(con=cnx,
                   name='accident',
                   if_exists='replace',
                   index=False,
                   chunksize=10000,
                   dtype={'acc_id':INTEGER,
                          'acc_date':DATETIME,
                          'report':VARCHAR(86),
                          'notes':VARCHAR(2327),
                          'year':INTEGER,
                          'placeId':INTEGER,
                          'sourceId':INTEGER,
                          'roller_coasterId':INTEGER,
                          'injuredId':INTEGER,
                          'errorId':INTEGER})

14884

In [68]:
entaccidente.columns.values

array(['acc_id', 'acc_date', 'report', 'notes', 'year', 'placeId',
       'sourceId', 'roller_coasterId', 'injuredId', 'errorId'],
      dtype=object)