In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import explore_ml results file
explore_df = pd.read_csv("./Resources/explore_ML_results.csv")
explore_df.head(119)

Unnamed: 0,State_County,StateDesc,LocationName,Levels_Smokers,Levels_COPD,Low CL COPD,High CL COPD
0,"Iowa, Cass",Iowa,Cass,19.3,8.3,7.0,9.8
1,"Iowa, Monona",Iowa,Monona,18.5,8.3,7.0,9.5
2,"Alaska, Dillingham",Alaska,Dillingham,31.1,8.9,7.7,10.0
3,"Colorado, Custer",Colorado,Custer,13.8,7.5,6.1,9.0
4,"Alaska, Ketchikan Gateway",Alaska,Ketchikan Gateway,19.4,6.6,5.8,7.5
...,...,...,...,...,...,...,...
114,"Alabama, Madison",Alabama,Madison,15.7,6.7,5.7,7.7
115,"California, Sutter",California,Sutter,14.4,6.2,5.5,7.0
116,"Colorado, Gunnison",Colorado,Gunnison,13.1,4.5,3.8,5.4
117,"Arizona, Yavapai",Arizona,Yavapai,15.5,9.3,7.8,10.9


In [3]:
explore_df.shape

(3122, 7)

In [4]:
explore_df[explore_df['LocationName'].isnull()]

Unnamed: 0,State_County,StateDesc,LocationName,Levels_Smokers,Levels_COPD,Low CL COPD,High CL COPD
33,,United States,,15.3,6.6,6.5,6.7


In [5]:
# Drop the row containing "United States"
explore_df = explore_df[explore_df.LocationName != "United States"]

In [6]:
# Checking that this dataframe was merged correctly and that all Levels of COPD fall in CL range
accurate_predictions = explore_df[(explore_df['Levels_COPD'] >= explore_df['Low CL COPD']) & 
           (explore_df['Levels_COPD'] <= explore_df['High CL COPD'])].count()
accurate_predictions

State_County      3121
StateDesc         3122
LocationName      3121
Levels_Smokers    3122
Levels_COPD       3122
Low CL COPD       3122
High CL COPD      3122
dtype: int64

In [7]:
explore_df.shape

(3122, 7)

In [8]:
# Import RandomForestRegression-Complex Results
results_df = pd.read_csv("./Resources/RandomForest_Pred_vs_Actual_y_Results.csv")

results_df = results_df.set_index("Unnamed: 0")
results_df.index.name = None

results_df.head(119)

Unnamed: 0,Real Values,Predicted Values
1095,14.0,14.00
642,6.9,8.92
1573,7.3,7.70
576,6.8,7.77
118,4.9,5.17
...,...,...
2484,7.9,7.92
2577,6.3,7.45
2048,10.1,9.35
1784,13.9,13.20


In [9]:
results_df.shape

(954, 2)

In [10]:
explore_df.columns.tolist()

['State_County',
 'StateDesc',
 'LocationName',
 'Levels_Smokers',
 'Levels_COPD',
 'Low CL COPD',
 'High CL COPD']

In [11]:
explore_df = explore_df[['State_County',
                         'StateDesc',
                         'LocationName',
                         'Levels_Smokers',
                         'Levels_COPD',
                         'Low CL COPD',
                         'High CL COPD'
                        ]]
explore_df

Unnamed: 0,State_County,StateDesc,LocationName,Levels_Smokers,Levels_COPD,Low CL COPD,High CL COPD
0,"Iowa, Cass",Iowa,Cass,19.3,8.3,7.0,9.8
1,"Iowa, Monona",Iowa,Monona,18.5,8.3,7.0,9.5
2,"Alaska, Dillingham",Alaska,Dillingham,31.1,8.9,7.7,10.0
3,"Colorado, Custer",Colorado,Custer,13.8,7.5,6.1,9.0
4,"Alaska, Ketchikan Gateway",Alaska,Ketchikan Gateway,19.4,6.6,5.8,7.5
...,...,...,...,...,...,...,...
3117,"Wisconsin, Sauk",Wisconsin,Sauk,16.8,6.5,5.5,7.6
3118,"Wisconsin, Brown",Wisconsin,Brown,15.1,5.5,4.7,6.5
3119,"West Virginia, Webster",West Virginia,Webster,26.8,15.3,13.1,17.3
3120,"Wisconsin, Winnebago",Wisconsin,Winnebago,17.1,5.8,4.9,6.7


In [12]:
explore_df = explore_df.rename(columns={'StateDesc': 'State', 
                                        'LocationName': 'County'})

In [13]:
explore_df

Unnamed: 0,State_County,State,County,Levels_Smokers,Levels_COPD,Low CL COPD,High CL COPD
0,"Iowa, Cass",Iowa,Cass,19.3,8.3,7.0,9.8
1,"Iowa, Monona",Iowa,Monona,18.5,8.3,7.0,9.5
2,"Alaska, Dillingham",Alaska,Dillingham,31.1,8.9,7.7,10.0
3,"Colorado, Custer",Colorado,Custer,13.8,7.5,6.1,9.0
4,"Alaska, Ketchikan Gateway",Alaska,Ketchikan Gateway,19.4,6.6,5.8,7.5
...,...,...,...,...,...,...,...
3117,"Wisconsin, Sauk",Wisconsin,Sauk,16.8,6.5,5.5,7.6
3118,"Wisconsin, Brown",Wisconsin,Brown,15.1,5.5,4.7,6.5
3119,"West Virginia, Webster",West Virginia,Webster,26.8,15.3,13.1,17.3
3120,"Wisconsin, Winnebago",Wisconsin,Winnebago,17.1,5.8,4.9,6.7


In [14]:
# Merge dataframes
merged_df = pd.merge(explore_df, results_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,State_County,State,County,Levels_Smokers,Levels_COPD,Low CL COPD,High CL COPD,Real Values,Predicted Values
1,"Iowa, Monona",Iowa,Monona,18.5,8.3,7.0,9.5,8.3,7.96
9,"Arkansas, Perry",Arkansas,Perry,22.6,11.1,9.6,12.6,11.1,10.83
10,"Arkansas, Carroll",Arkansas,Carroll,19.2,9.8,8.4,11.2,9.8,8.09
14,"Arkansas, Madison",Arkansas,Madison,23.5,11.3,9.8,13.0,11.3,11.62
15,"Colorado, Otero",Colorado,Otero,16.3,7.1,6.3,7.9,7.1,7.19
...,...,...,...,...,...,...,...,...,...
3105,"Wisconsin, Bayfield",Wisconsin,Bayfield,15.9,7.5,6.3,8.7,3.9,4.04
3116,"Wyoming, Platte",Wyoming,Platte,18.2,8.5,7.3,9.8,5.8,6.58
3117,"Wisconsin, Sauk",Wisconsin,Sauk,16.8,6.5,5.5,7.6,11.5,10.72
3119,"West Virginia, Webster",West Virginia,Webster,26.8,15.3,13.1,17.3,13.1,12.17


In [15]:
# Check that merge occurred correctly
merged_df[(merged_df['Levels_COPD'] == merged_df['Real Values'])].count()

State_County        59
State               59
County              59
Levels_Smokers      59
Levels_COPD         59
Low CL COPD         59
High CL COPD        59
Real Values         59
Predicted Values    59
dtype: int64

In [16]:
# Drop unneeded columns
merged_df = merged_df.drop(['Levels_Smokers', 'Levels_COPD'], axis=1)

In [17]:
# Check merge worked properly by seeing that all rows have a predicted value
merged_df["Predicted Values"].isnull().sum()

0

In [18]:
# View a random sample of data
merged_df.sample(5)

Unnamed: 0,State_County,State,County,Low CL COPD,High CL COPD,Real Values,Predicted Values
3084,"West Virginia, Tyler",West Virginia,Tyler,10.6,13.9,6.8,7.09
1450,"Minnesota, Martin",Minnesota,Martin,6.0,8.4,7.4,7.48
1095,"Louisiana, Jackson",Louisiana,Jackson,9.3,11.9,14.0,14.0
1684,"Missouri, Knox",Missouri,Knox,10.0,13.7,11.1,8.54
1010,"Kansas, Phillips",Kansas,Phillips,7.1,9.8,11.8,12.39


In [19]:
total_predictions = merged_df["Predicted Values"].count()

In [20]:
accurate_predictions = merged_df[(merged_df['Predicted Values'] >= merged_df['Low CL COPD']) & 
           (merged_df['Predicted Values'] <= merged_df['High CL COPD'])].count()


In [21]:
num_accurate_pred = (accurate_predictions[6] / total_predictions) * 100
print(f"The RandomForestRegression model had {num_accurate_pred:.2f}% of predictions within the confidence interval")

The RandomForestRegression model had 34.48% of predictions within the confidence interval


In [22]:
accurate_predictions[6]

322

In [23]:
merged_df[(merged_df['Predicted Values'] >= merged_df['Low CL COPD'])].count()

State_County        649
State               649
County              649
Low CL COPD         649
High CL COPD        649
Real Values         649
Predicted Values    649
dtype: int64

In [24]:
merged_df[(merged_df['Predicted Values'] <= merged_df['Low CL COPD'])].count()

State_County        286
State               286
County              286
Low CL COPD         286
High CL COPD        286
Real Values         286
Predicted Values    286
dtype: int64

In [25]:
merged_df[(merged_df['Predicted Values'] >= merged_df['High CL COPD'])].count()

State_County        329
State               329
County              329
Low CL COPD         329
High CL COPD        329
Real Values         329
Predicted Values    329
dtype: int64

In [26]:
merged_df[(merged_df['Predicted Values'] <= merged_df['High CL COPD'])].count()

State_County        607
State               607
County              607
Low CL COPD         607
High CL COPD        607
Real Values         607
Predicted Values    607
dtype: int64