In [0]:
import pandas as pd
import matplotlib as plt

In [0]:
csv_data = pd.read_csv("https://raw.githubusercontent.com/CSafewright/CS440_autism_datasets/refs/heads/main/autism_prevalence_studies_20250928.csv")
df = pd.DataFrame(csv_data)

Dropping unneeded columns

In [0]:
df.drop(['Title','Link to Publication', 'CDC Calculated Values',
         'Adaptive Score <70 (%)', 'Diagnosis Age Range (months)',
         'Diagnosis Median Age (months)', 'Diagnosis Mean Age (months)',
         'Non-Verbal or Minimally Verbal (%)', 'Percentage of Individual Co-occurring Conditions',
         'IQ Score <70 (%)', 'Non-Hispanic White:Hispanic Prevalence Ratio',
         'White:Black Prevalence Ratio', 'Autism Types Included',
         'Case Criterion', 'Confidence Interval (CI)'], axis=1, inplace=True)

Dropping study where 18 to 64 was the age range. Not important to us, we are looking at autism in children.

In [0]:
df = df[df['Age Range'] != '18 to 64']

5 entries did not have an area. Changed those to be 'Unknown'

In [0]:
df['Area(s)'] = df['Area(s)'].fillna('Unknown')

Dropped 15 rows where the was no sample size given. Resulting in 191 rows.  
Removed commas from Sample Size so it can be changed to numbers.

In [0]:
df = df.dropna(subset=['Sample Size'])
df['Sample Size'] = df['Sample Size'].replace(',', '', regex=True)

Dropped 5 rows where Number of Cases was not given, resulting in 186 rows.
Removed commas from Number of Casess so it can be changed to numbers.

In [0]:
df = df.dropna(subset=['Number of Cases'])
df['Number of Cases'] = df['Number of Cases'].replace(',', '', regex=True)
df.count()

Author                               186
Year Published                       186
Country                              186
Area(s)                              186
Age Range                            181
Study Years                          164
Case Identification Method           186
Sample Size                          186
Number of Cases                      186
ASD Prevalence Estimate per 1,000    186
Male:Female Sex Ratio                129
dtype: int64

Splitting 'Study Years' column and 'Age Range' column into two seperate columns each.  
When the study years only occured for one year, the year started and year ended column are the same.  
When the age range is only one age, the youngest and oldest age columns are the same.

In [0]:
# Split Study Years into start and end
df[['year_started', 'year_ended']] = (
    df['Study Years'].str.extract(r'(\d+)\s*-\s*(\d+)')
)

# If there are " & " cases instead of "-", handle them
mask_alt = df['Study Years'].str.contains('&', na=False)
df.loc[mask_alt, ['year_started', 'year_ended']] = (
    df.loc[mask_alt, 'Study Years'].str.extract(r'(\d+)\s*&\s*(\d+)')
)

df['year_started'] = df['year_started'].fillna(df['Year Published'])

# Fill missing end years with start years
df['year_ended'] = df['year_ended'].fillna(df['year_started'])

# Split Age Range
df[['youngest_age', 'oldest_age']] = (
    df['Age Range'].str.extract(r'(\d+)\s*to\s*(\d+)')
)

df['youngest_age'] = df['youngest_age'].fillna(df['Age Range'])

# Fill missing oldest_age with youngest_age
df['oldest_age'] = df['oldest_age'].fillna(df['youngest_age'])

# Drop original columns
df = df.drop(columns=['Age Range', 'Study Years'])

# Convert to numeric;  UPDATE THIS NEXT TIME: NEED TO REMOVE , FROM SAMPLE SIZE AND NUMBER OF CASES
df[['year_started', 'year_ended', 'youngest_age', 'oldest_age', 'Sample Size', 'Number of Cases']] = (
    df[['year_started', 'year_ended', 'youngest_age', 'oldest_age', 'Sample Size', 'Number of Cases']].apply(pd.to_numeric, errors='coerce')
)

In [0]:
df.describe()

Unnamed: 0,Year Published,Sample Size,Number of Cases,"ASD Prevalence Estimate per 1,000",Male:Female Sex Ratio,year_started,year_ended,youngest_age,oldest_age
count,186.0,186.0,186.0,186.0,129.0,186.0,186.0,180.0,180.0
mean,2009.005376,948550.9,5679.408602,8.592833,4.084574,2006.451613,2007.752688,4.316667,11.533333
std,11.195316,5345721.0,46664.636275,8.256713,1.597605,11.911853,11.303342,3.049819,5.444899
min,1966.0,374.0,2.0,0.077,1.1,1962.0,1966.0,0.0,2.0
25%,2005.0,11731.25,46.0,2.3075,2.9,2002.0,2003.0,2.0,8.0
50%,2012.0,58467.0,191.5,6.2,4.1,2010.0,2011.0,4.5,11.0
75%,2017.0,278566.8,1188.75,12.075,4.8,2015.0,2016.0,7.0,15.25
max,2022.0,51529340.0,625215.0,43.6,9.9,2022.0,2022.0,15.0,30.0


In [0]:
avg_youngest_age = df['youngest_age'].mean()
avg_oldest_age = df['oldest_age'].mean()
df['youngest_age'] = df['youngest_age'].fillna(avg_youngest_age)
df['oldest_age'] = df['oldest_age'].fillna(avg_oldest_age)

Changing empty spaces in Area(s), Sample Size, and Number of Cases columns to 'Unknown'

In [0]:
df.count()

Author                               186
Year Published                       186
Country                              186
Area(s)                              186
Case Identification Method           186
Sample Size                          186
Number of Cases                      186
ASD Prevalence Estimate per 1,000    186
Male:Female Sex Ratio                129
year_started                         186
year_ended                           186
youngest_age                         186
oldest_age                           186
dtype: int64

Male:Female Sex Ratio had 67% of cells. There were only 39 below 3.0 and 59 below 4.0. There were ~75 above 4.0.  
I felt it safe to assign the mean to of 4.07 to the empty cells.
This update changes the USA average from 3.95 to 3.999 (4.0)
For every other study outside of the US, the average changes from 4.114 to 4.1

In [0]:
mf_avg_ratio = df['Male:Female Sex Ratio'].mean()
df['Male:Female Sex Ratio'] = df['Male:Female Sex Ratio'].fillna(mf_avg_ratio)

In [0]:
df.describe()

Unnamed: 0,Year Published,Sample Size,Number of Cases,"ASD Prevalence Estimate per 1,000",Male:Female Sex Ratio,year_started,year_ended,youngest_age,oldest_age
count,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0
mean,2009.005376,948550.9,5679.408602,8.592833,4.084574,2006.451613,2007.752688,4.316667,11.533333
std,11.195316,5345721.0,46664.636275,8.256713,1.328889,11.911853,11.303342,2.999955,5.355876
min,1966.0,374.0,2.0,0.077,1.1,1962.0,1966.0,0.0,2.0
25%,2005.0,11731.25,46.0,2.3075,3.5,2002.0,2003.0,2.0,8.0
50%,2012.0,58467.0,191.5,6.2,4.084574,2010.0,2011.0,4.316667,11.0
75%,2017.0,278566.8,1188.75,12.075,4.45,2015.0,2016.0,6.75,15.0
max,2022.0,51529340.0,625215.0,43.6,9.9,2022.0,2022.0,15.0,30.0


In [0]:
print(df.describe())
print(df.count())

       Year Published   Sample Size  ...  youngest_age  oldest_age
count      186.000000  1.860000e+02  ...    186.000000  186.000000
mean      2009.005376  9.485509e+05  ...      4.316667   11.533333
std         11.195316  5.345721e+06  ...      2.999955    5.355876
min       1966.000000  3.740000e+02  ...      0.000000    2.000000
25%       2005.000000  1.173125e+04  ...      2.000000    8.000000
50%       2012.000000  5.846700e+04  ...      4.316667   11.000000
75%       2017.000000  2.785668e+05  ...      6.750000   15.000000
max       2022.000000  5.152934e+07  ...     15.000000   30.000000

[8 rows x 9 columns]
Author                               186
Year Published                       186
Country                              186
Area(s)                              186
Case Identification Method           186
Sample Size                          186
Number of Cases                      186
ASD Prevalence Estimate per 1,000    186
Male:Female Sex Ratio                186
year_s

In [0]:
df.to_csv('autism_cleaned_data.csv', index=False)