# Capstone Proposal - NEISS Dataset Exploration

## Import Libraries

In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/MyDrive/Capstone/neiss2022.xlsx"
neiss = pd.read_excel(path)

Mounted at /content/drive


In [2]:
import plotly.figure_factory as ff

## Preliminary Data Exploration

In [3]:
# shape of the raw dataset
neiss.shape

(323343, 25)

In [4]:
# preview the data
neiss.head()

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative_1,Stratum,PSU,Weight
0,220100687,2022-01-01,56,1,1,,2,93,57,,...,0,1893,0,0,0,0,56YOM TRIED TO BLOCK THE CAT FROM GETTING OUT ...,S,46,77.9827
1,220100691,2022-01-01,85,1,0,,0,80,57,,...,0,670,1807,0,0,0,"85YOM WENT TO SIT ON HIS RECLINER, HIT HIS ARM...",S,46,77.9827
2,220100695,2022-01-01,78,1,1,,2,32,59,,...,0,1842,676,0,0,0,78YOM SLIPPED AND FELL DOWN 6-7 CARPETED STEPS...,S,46,77.9827
3,220100696,2022-01-01,4,2,1,,2,88,53,,...,0,1394,0,0,0,0,4YOF FELL ON HER *** TOY RESULTING IN A CONTUS...,S,46,77.9827
4,220101170,2022-01-01,2,1,1,,2,76,53,,...,0,1878,0,0,0,0,2YOM HAS CONTUSION TO FOREHEAD. TRIPPED AND FE...,S,97,77.9827


In [5]:
# data types
print(neiss.dtypes)

CPSC_Case_Number              int64
Treatment_Date       datetime64[ns]
Age                           int64
Sex                           int64
Race                          int64
Other_Race                   object
Hispanic                      int64
Body_Part                     int64
Diagnosis                     int64
Other_Diagnosis              object
Body_Part_2                 float64
Diagnosis_2                 float64
Other_Diagnosis_2            object
Disposition                   int64
Location                      int64
Fire_Involvement              int64
Product_1                     int64
Product_2                     int64
Product_3                     int64
Alcohol                       int64
Drug                          int64
Narrative_1                  object
Stratum                      object
PSU                           int64
Weight                      float64
dtype: object


In [6]:
# get summary stats
neiss.describe(include='all')

  neiss.describe(include='all')


Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative_1,Stratum,PSU,Weight
count,323343.0,323343,323343.0,323343.0,323343.0,12013,323343.0,323343.0,323343.0,61075,...,323343.0,323343.0,323343.0,323343.0,323343.0,323343.0,323343,323343,323343.0,323343.0
unique,,365,,,,68,,,,4267,...,,,,,,,322516,5,,
top,,2022-05-22 00:00:00,,,,UNKNOWN,,,,PAIN,...,,,,,,,2 YOF WITH A BEAD IN NOSE. DX FB NOSE,V,,
freq,,1148,,,,7671,,,,26429,...,,,,,,,10,118205,,
first,,2022-01-01 00:00:00,,,,,,,,,...,,,,,,,,,,
last,,2022-12-31 00:00:00,,,,,,,,,...,,,,,,,,,,
mean,221193700.0,,47.11044,1.45517,1.065157,,1.257244,64.666187,60.85962,,...,0.011109,2157.594626,307.884828,31.713338,0.018318,0.019796,,,45.423108,39.162834
std,2096836.0,,50.534202,0.498248,0.942599,,0.916012,24.016447,6.613404,,...,0.166559,1361.905978,827.715552,273.190511,0.134099,0.1393,,,28.568192,30.257881
min,220100700.0,,0.0,0.0,0.0,,0.0,0.0,41.0,,...,0.0,102.0,0.0,0.0,0.0,0.0,,,2.0,5.8342
25%,220448300.0,,13.0,1.0,0.0,,0.0,36.0,57.0,,...,0.0,1211.0,0.0,0.0,0.0,0.0,,,21.0,17.2223


In [7]:
# check missing values
print(neiss.isnull().sum())

CPSC_Case_Number          0
Treatment_Date            0
Age                       0
Sex                       0
Race                      0
Other_Race           311330
Hispanic                  0
Body_Part                 0
Diagnosis                 0
Other_Diagnosis      262268
Body_Part_2          251073
Diagnosis_2          251073
Other_Diagnosis_2    308000
Disposition               0
Location                  0
Fire_Involvement          0
Product_1                 0
Product_2                 0
Product_3                 0
Alcohol                   0
Drug                      0
Narrative_1               0
Stratum                   0
PSU                       0
Weight                    0
dtype: int64


In [8]:
# correlation matrix
corr_matrix = neiss.corr()


fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=corr_matrix.round(2).values,
    showscale=True
)

fig.show()

  corr_matrix = neiss.corr()


## Exploratory Data Analysis

In [11]:
import plotly.graph_objects as go

In [14]:
sex_mapping = {
    0: 'Unknown',
    1: 'Male',
    2: 'Female',
    3: 'Non-Binary/Other'
}

In [15]:
neiss['Sex_Mapping'] = neiss['Sex'].map(sex_mapping)

In [34]:
#mapping the month values
months_map = {val: 1 for val in range(201, 224)}
neiss['Age_Adjusted'] = neiss['Age'].apply(lambda x: months_map.get(x, x))

In [36]:
neiss['AgeGroup'] = pd.cut(neiss['Age_Adjusted'], bins=range(0, 111, 10), right=False, labels=range(0, 110, 10))
grouped = neiss.groupby(['AgeGroup', 'Sex_Mapping']).size().unstack()

In [39]:
print(neiss['AgeGroup'].value_counts(dropna=True).sort_index())

0      80085
10     64240
20     29852
30     25987
40     21466
50     23904
60     26807
70     24657
80     18623
90      7400
100      321
Name: AgeGroup, dtype: int64


In [26]:
colors = {
    'Unknown': 'gray',
    'Male': 'blue',
    'Female': 'pink',
    'Non-Binary/Other': 'green'
}

In [38]:
fig = go.Figure()

for sex, color in colors.items():
    fig.add_trace(go.Bar(name=sex, x=grouped.index, y=grouped[sex], marker_color=color))

# Update the layout
fig.update_layout(barmode='stack', title="Age Distribution by Sex", xaxis_title="Age Group", yaxis_title="Count")
fig.show()