In [0]:
import pandas as pd
import matplotlib as plt

In [0]:
csv_data = pd.read_csv("https://raw.githubusercontent.com/CSafewright/CS440_autism_datasets/refs/heads/main/autism_prevalence_studies_20250928.csv")
df = pd.DataFrame(csv_data)

Dropping unneeded columns

In [0]:
df.drop(['Title','Link to Publication', 'CDC Calculated Values',
         'Adaptive Score <70 (%)', 'Diagnosis Age Range (months)',
         'Diagnosis Median Age (months)', 'Diagnosis Mean Age (months)',
         'Non-Verbal or Minimally Verbal (%)', 'Percentage of Individual Co-occurring Conditions',
         'IQ Score <70 (%)', 'Non-Hispanic White:Hispanic Prevalence Ratio',
         'White:Black Prevalence Ratio', 'Autism Types Included',
         'Case Criterion', 'Confidence Interval (CI)'], axis=1, inplace=True)

Dropping study where 18 to 64 was the age range. Not important to us, we are looking at autism in children.

In [0]:
df = df[df['Age Range'] != '18 to 64']

In [0]:
df['Area(s)'] = df['Area(s)'].fillna('Unknown')

In [0]:
df.count()

Author                               206
Year Published                       206
Country                              206
Area(s)                              206
Age Range                            201
Study Years                          184
Case Identification Method           206
Sample Size                          191
Number of Cases                      191
ASD Prevalence Estimate per 1,000    206
Male:Female Sex Ratio                138
dtype: int64

Splitting 'Study Years' column and 'Age Range' column into two seperate columns each.  
When the study years only occured for one year, the year started and year ended column are the same.  
When the age range is only one age, the youngest and oldest age columns are the same.

In [0]:
# Split Study Years into start and end
df[['year_started', 'year_ended']] = (
    df['Study Years'].str.extract(r'(\d+)\s*-\s*(\d+)')
)

# If there are " & " cases instead of "-", handle them
mask_alt = df['Study Years'].str.contains('&', na=False)
df.loc[mask_alt, ['year_started', 'year_ended']] = (
    df.loc[mask_alt, 'Study Years'].str.extract(r'(\d+)\s*&\s*(\d+)')
)

df['year_started'] = df['year_started'].fillna(df['Year Published'])

# Fill missing end years with start years
df['year_ended'] = df['year_ended'].fillna(df['year_started'])

# Split Age Range
df[['youngest_age', 'oldest_age']] = (
    df['Age Range'].str.extract(r'(\d+)\s*to\s*(\d+)')
)

df['youngest_age'] = df['youngest_age'].fillna(df['Age Range'])

# Fill missing oldest_age with youngest_age
df['oldest_age'] = df['oldest_age'].fillna(df['youngest_age'])

# Drop original columns
#df = df.drop(columns=['Age Range', 'Study Years'])

# Convert to numeric
df[['year_started', 'year_ended', 'youngest_age', 'oldest_age', 'Sample Size', 'Number of Cases']] = (
    df[['year_started', 'year_ended', 'youngest_age', 'oldest_age', 'Sample Size', 'Number of Cases']].apply(pd.to_numeric, errors='coerce')
)

In [0]:
df.describe()

Unnamed: 0,Year Published,Sample Size,Number of Cases,"ASD Prevalence Estimate per 1,000",Male:Female Sex Ratio,year_started,year_ended,youngest_age,oldest_age
count,206.0,8.0,136.0,206.0,138.0,206.0,206.0,200.0,200.0
mean,2009.26699,679.75,211.448529,8.637704,4.081232,2006.635922,2008.033981,4.355,11.71
std,10.815146,207.304297,262.891467,8.207193,1.554231,11.674382,10.930102,3.004013,5.553386
min,1966.0,374.0,2.0,0.077,1.1,1962.0,1966.0,0.0,2.0
25%,2005.25,617.0,34.75,2.355,3.0,2002.25,2004.0,2.0,8.0
50%,2012.0,707.5,70.5,6.2,4.1,2010.0,2011.0,5.0,11.0
75%,2017.0,759.25,301.25,12.175,4.775,2015.0,2016.0,7.0,17.0
max,2022.0,998.0,987.0,43.6,9.9,2022.0,2022.0,15.0,30.0


In [0]:
avg_youngest_age = df['youngest_age'].mean()
avg_oldest_age = df['oldest_age'].mean()
df['youngest_age'] = df['youngest_age'].fillna(avg_youngest_age)
df['oldest_age'] = df['oldest_age'].fillna(avg_oldest_age)

Changing empty spaces in Area(s), Sample Size, and Number of Cases columns to 'Unknown'

In [0]:
df['Area(s)'] = df['Area(s)'].fillna('Unknown')

In [0]:
df['Sample Size'].mean()

np.float64(679.75)

In [0]:
df['Number of Cases'].mean()
#df['Number of Cases'].fillna(df['Number of Cases'].mean(), inplace=True)']

np.float64(211.4485294117647)

In [0]:
df['ASD Prevalence Estimate per 1,000'].mean()

np.float64(8.637703883495146)

In [0]:
df.count()

Author                               206
Year Published                       206
Country                              206
Area(s)                              206
Age Range                            201
Study Years                          184
Case Identification Method           206
Sample Size                            8
Number of Cases                      136
ASD Prevalence Estimate per 1,000    206
Male:Female Sex Ratio                138
year_started                         206
year_ended                           206
youngest_age                         206
oldest_age                           206
dtype: int64

Male:Female Sex Ratio had 67% of cells. There were only 39 below 3.0 and 59 below 4.0. There were ~75 above 4.0.  
I felt it safe to assign the mean to of 4.07 to the empty cells.
This update changes the USA average from 3.95 to 3.999 (4.0)
For every other study outside of the US, the average changes from 4.114 to 4.1

In [0]:
mf_avg_ratio = df['Male:Female Sex Ratio'].mean()
df['Male:Female Sex Ratio'] = df['Male:Female Sex Ratio'].fillna(mf_avg_ratio)

In [0]:
df.describe()

Unnamed: 0,Year Published,Sample Size,Number of Cases,"ASD Prevalence Estimate per 1,000",Male:Female Sex Ratio,year_started,year_ended,youngest_age,oldest_age
count,206.0,8.0,136.0,206.0,206.0,206.0,206.0,206.0,206.0
mean,2009.26699,679.75,211.448529,8.637704,4.081232,2006.635922,2008.033981,4.355,11.71
std,10.815146,207.304297,262.891467,8.207193,1.270571,11.674382,10.930102,2.959726,5.471514
min,1966.0,374.0,2.0,0.077,1.1,1962.0,1966.0,0.0,2.0
25%,2005.25,617.0,34.75,2.355,3.6,2002.25,2004.0,2.0,8.0
50%,2012.0,707.5,70.5,6.2,4.081232,2010.0,2011.0,4.6775,11.0
75%,2017.0,759.25,301.25,12.175,4.3,2015.0,2016.0,6.0,16.0
max,2022.0,998.0,987.0,43.6,9.9,2022.0,2022.0,15.0,30.0


In [0]:
df['Country'].unique()

array(['England', 'USA', 'Denmark', 'Japan', 'Sweden', 'Ireland',
       'Germany', 'Canada', 'France', 'Indonesia', 'Wales', 'Norway',
       'Finland', 'Iceland', 'United Kingdom', 'Australia', 'Spain',
       'China', 'Scotland', 'Portugal', 'United Arab Emirates',
       'Venezuela', 'Caribbean', 'Sri Lanka', 'Oman', 'Taiwan',
       'South Korea', 'Brazil', 'Iran', 'Bahrain', 'Israel', 'Uganda',
       'United Kindgom', 'United Kindgon', 'Ecuador', 'Nigeria',
       'Lebanon', 'Mexico', 'India', 'Poland', 'Nepal', 'Italy',
       'Bangladesh', 'Qatar', 'Vietnam', 'Basque Country, Spain',
       'Greece', 'Saudi Arabia', 'Korea'], dtype=object)

In [0]:
print(df[df['Country'] == 'USA'])

                                                Author  ...  oldest_age
1                                             Treffert  ...       12.00
12                                        Burd, et al.  ...       18.00
17                                       Ritvo, et al.  ...       12.00
28                                      Halfon, et al.  ...       17.00
30                                     Hillman, et al.  ...       15.00
36                                    Bertrand, et al.  ...       10.00
37                                       Croen, et al.  ...       12.00
39                             Yeargin-Allsopp, et al.  ...       10.00
40                                      Gurney, et al.  ...       17.00
43                                       Geier, et al.  ...       11.71
49                                 Newschaffer, et al.  ...        6.00
53                                   Blanchard, et al.  ...       17.00
59                                    Shattuck, et al.  ...     

In [0]:
df['oldest_age'].mean() - df['youngest_age'].mean()
print(df['youngest_age']).mean()
print(df['oldest_age']).mean()

0      8.0
1      2.0
2      2.0
3      5.0
4      0.0
      ... 
202    8.0
203    5.0
204    8.0
205    6.0
206    0.0
Name: youngest_age, Length: 206, dtype: float64


[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-5147349549171067>, line 2[0m
[1;32m      1[0m df[[38;5;124m'[39m[38;5;124moldest_age[39m[38;5;124m'[39m][38;5;241m.[39mmean() [38;5;241m-[39m df[[38;5;124m'[39m[38;5;124myoungest_age[39m[38;5;124m'[39m][38;5;241m.[39mmean()
[0;32m----> 2[0m [38;5;28mprint[39m(df[[38;5;124m'[39m[38;5;124myoungest_age[39m[38;5;124m'[39m])[38;5;241m.[39mmean()
[1;32m      3[0m [38;5;28mprint[39m(df[[38;5;124m'[39m[38;5;124moldest_age[39m[38;5;124m'[39m])[38;5;241m.[39mmean()

[0;31mAttributeError[0m: 'NoneType' object has no attribute 'mean'

In [0]:
print(df[df['Male:Female Sex Ratio'] < 4.0])
df.describe()




In [0]:
import matplotlib.pyplot as plt
import numpy as np

plt.boxplot(df['Male:Female Sex Ratio'])

