# Machine Learning in Scikit-learn

If you are a self-guided learner:      
https://scikit-learn.org/stable/auto_examples/index.html   
Scikit-learn documentation has a variety of examples covering the broad scope of ML

__Steps__

1. Choose a model by importing its class from Scikit-Learn
2. Choose (non-data) parameters by creating an instance of the above class
3. Arrange data into the labels matrix and features matrix
4. Fit the data to the instance of the model
5. Examine results (e.g. predict, plot)

In [8]:
import os
import pandas as pd
import requests

In [17]:
path = '/Users/Sarah/Documents/GitHub/Sarah-Discussion-Notebooks/Data'

base_url = 'https://data.cityofchicago.org/api/views/{}/rows.csv?accessType=DOWNLOAD'
file_name = [('kn9c-c2s2', 'Chicago_SES.csv'),
            ('j6cj-r444','Chicago_Death.csv')]



def download_data(url, filename):
    if filename not in os.listdir():
        response = requests.get(url)
        output = response.text
        with open(filename, 'w') as ofile:
            ofile.write(output)

def read_data(path, filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(os.path.join(path, filename))
    elif filename.endswith('.xls'):
        df = pd.read_excel(os.path.join(path, filename))
    else:
        return 'unexpected file type in read_data'
    return df

def build_urls(base_url, num):
    return base_url.format(num)    
    
        
def parse_death(death_df):
    death_df.rename(columns = {'Community Area': 'Community Area Number'}, inplace=True)
    avg_an_death = death_df.pivot(index = 'Community Area Number', columns='Cause of Death', 
                                  values='Average Adjusted Rate 2006 - 2010')
    avg_an_death.drop(0, axis = 0, inplace = True) #drop the Chicago Total
    avg_an_death.reset_index(inplace = True)
    # create a colomn for Quartile of Avearge Annual Deaths by Coronary Heart Disease
    avg_an_death['CVD_quartile'] = (pd.qcut(avg_an_death['Coronary heart disease'], 
                                            4, labels=['Q1', 'Q2', 'Q3', 'Q4']))
    # cite: https://stackoverflow.com/questions/62610541/calculated-quartile-category-of-a-column-in-pandas
    
    # drop total dealths
    avg_an_death.drop(columns = ['All Causes'], inplace = True)
    return avg_an_death

def merge_dfs(SES_df, death_df):     
    SES_death = SES_df.merge(death_df, on='Community Area Number', how = 'inner')
    
    #drop colums with Nan (all cols dropped for this df are completely empty)
    SES_death.dropna(axis=1,inplace=True)

    return SES_death


def prep_df(base_url, file_name):
    urls = [] 
    for n, f in file_name:
        urls.append((build_urls(base_url, n), f)) 
 
    # download if not present andread in df
    df_contents = []
    for url, filename in urls:
        download_data(url,filename)
        df = read_data(path, filename)
        if filename == 'Chicago_Death.csv':
                df_contents.append(parse_death(df))
        else:
            df_contents.append(df)

    merged = merge_dfs(df_contents[0], df_contents[1])
    
    return merged

use_df = prep_df(base_url, file_name)

In [18]:
use_df.head()

Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX,All causes in females,...,Diabetes-related,Firearm-related,"Injury, unintentional","Kidney disease (nephritis, nephrotic syndrome and nephrosis)",Liver disease and cirrhosis,Lung cancer,Prostate cancer in males,Stroke (cerebrovascular disease),Suicide (intentional self-harm),CVD_quartile
0,1.0,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0,744.0,...,31.4,4.8,25.7,21.3,11.5,41.7,26.3,32.3,8.4,Q4
1,2.0,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0,570.2,...,24.5,3.9,21.1,21.0,7.7,37.8,16.9,34.3,6.1,Q2
2,3.0,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0,749.7,...,37.1,4.8,26.1,24.6,14.6,48.1,25.9,39.4,9.1,Q4
3,4.0,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17.0,528.7,...,20.3,2.9,23.9,25.5,10.4,42.9,23.7,30.6,6.1,Q2
4,5.0,North Center,0.3,7.5,5.2,4.5,26.2,57123,6.0,602.6,...,19.4,1.5,23.2,21.9,11.2,41.2,15.2,39.8,9.2,Q3


__Steps__

1. Choose a model by importing its class from Scikit-Learn
2. Choose (non-data) parameters by creating an instance of the above class
3. Arrange data into the labels matrix and features matrix
4. Fit the data to the instance of the model
5. Examine results (e.g. predict, plot)

In [21]:
use_df.columns

Index(['Community Area Number', 'COMMUNITY AREA NAME',
       'PERCENT OF HOUSING CROWDED', 'PERCENT HOUSEHOLDS BELOW POVERTY',
       'PERCENT AGED 16+ UNEMPLOYED',
       'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA',
       'PERCENT AGED UNDER 18 OR OVER 64', 'PER CAPITA INCOME ',
       'HARDSHIP INDEX', 'All causes in females', 'All causes in males',
       'Alzheimers disease', 'Assault (homicide)', 'Breast cancer in females',
       'Cancer (all sites)', 'Colorectal cancer', 'Coronary heart disease',
       'Diabetes-related', 'Firearm-related', 'Injury, unintentional',
       'Kidney disease (nephritis, nephrotic syndrome and nephrosis)',
       'Liver disease and cirrhosis', 'Lung cancer',
       'Prostate cancer in males', 'Stroke (cerebrovascular disease)',
       'Suicide (intentional self-harm)', 'CVD_quartile'],
      dtype='object')

In [29]:
X = use_df.drop(columns = ['CVD_quartile', 'COMMUNITY AREA NAME'])
Y = use_df['CVD_quartile']

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=123)

In [31]:
print(X_train.shape)
print(Y_train.shape)

print(X_test.shape)
print(Y_test.shape)

(61, 25)
(61,)
(16, 25)
(16,)


In [74]:
X_train.head()

Unnamed: 0,Community Area Number,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX,All causes in females,All causes in males,...,Coronary heart disease,Diabetes-related,Firearm-related,"Injury, unintentional","Kidney disease (nephritis, nephrotic syndrome and nephrosis)",Liver disease and cirrhosis,Lung cancer,Prostate cancer in males,Stroke (cerebrovascular disease),Suicide (intentional self-harm)
71,72.0,0.9,5.1,8.0,3.7,40.5,39523,12.0,689.1,792.9,...,132.6,19.8,5.7,18.4,14.2,5.3,44.1,44.0,49.7,7.4
43,44.0,3.3,27.8,24.0,14.5,40.3,18881,60.0,740.8,1229.5,...,133.7,37.7,36.3,35.3,30.1,5.9,51.1,54.3,42.1,7.6
51,52.0,6.8,19.2,12.1,31.9,42.8,17104,64.0,639.2,997.8,...,106.5,35.1,9.6,37.0,25.2,16.2,42.2,26.3,32.3,5.6
1,2.0,7.8,17.2,8.8,20.8,38.5,23040,46.0,570.2,843.0,...,136.3,24.5,3.9,21.1,21.0,7.7,37.8,16.9,34.3,6.1
37,38.0,3.3,29.3,24.3,15.9,39.5,23472,57.0,775.6,1356.0,...,157.6,31.6,26.3,42.9,21.3,12.2,65.7,35.9,49.0,6.0


In [75]:
Y_train.head()

71    Q2
43    Q2
51    Q1
1     Q2
37    Q3
Name: CVD_quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [44]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [76]:
model = GaussianNB()
model.fit(X_train, Y_train)
predict = model.predict(X_test)

In [77]:
predict

array(['Q4', 'Q1', 'Q4', 'Q1', 'Q4', 'Q1', 'Q1', 'Q3', 'Q1', 'Q2', 'Q1',
       'Q1', 'Q4', 'Q4', 'Q1', 'Q3'], dtype='<U2')

In [46]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [78]:
accuracy_score(Y_test, predict)

0.4375

In [79]:
confusion_matrix(Y_test, predict)

array([[3, 0, 0, 2],
       [2, 1, 0, 1],
       [2, 0, 1, 0],
       [1, 0, 1, 2]])

In [80]:
print(classification_report(Y_test, predict))

              precision    recall  f1-score   support

          Q1       0.38      0.60      0.46         5
          Q2       1.00      0.25      0.40         4
          Q3       0.50      0.33      0.40         3
          Q4       0.40      0.50      0.44         4

    accuracy                           0.44        16
   macro avg       0.57      0.42      0.43        16
weighted avg       0.56      0.44      0.43        16



In [71]:
model =  LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
predict = model.predict(X_test)

In [72]:
confusion_matrix(Y_test, predict)

array([[3, 2, 0, 0],
       [1, 1, 2, 0],
       [0, 3, 0, 0],
       [0, 0, 2, 2]])

In [73]:
print(classification_report(Y_test, predict))

              precision    recall  f1-score   support

          Q1       0.75      0.60      0.67         5
          Q2       0.17      0.25      0.20         4
          Q3       0.00      0.00      0.00         3
          Q4       1.00      0.50      0.67         4

    accuracy                           0.38        16
   macro avg       0.48      0.34      0.38        16
weighted avg       0.53      0.38      0.42        16



In [40]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

# create instances of the models
models = [('Dec Tree', DecisionTreeClassifier()), 
          ('Lin Disc', LinearDiscriminantAnalysis()), 
          ('Gauss', GaussianNB()), 
          ('SVC', SVC(gamma='auto'))]

In [61]:
results = []

for name, model in models:
    kf = StratifiedKFold(n_splits=10, random_state=123)
    res = cross_val_score(model, X_train, Y_train, cv=kf, scoring='accuracy')
    res_mean = round(res.mean(), 4)
    res_std  = round(res.std(), 4)
    results.append((name, res_mean, res_std))



In [62]:
results #model, mean accuracy, std of accuracy

[('Dec Tree', 0.9333, 0.1106),
 ('Lin Disc', 0.4762, 0.1899),
 ('Gauss', 0.5405, 0.2333),
 ('SVC', 0.1786, 0.0357)]

In [56]:
cv = cross_val_score(model, X_train, Y_train, cv=kf, scoring='accuracy')
cv
# What is this? 
# see here: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation 

array([0.28571429, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
       0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667])

In [1]:

from pandas_datareader import wb

In [2]:
indicator = 'NY.GDP.MKTP.CD'
country = 'CL'

In [3]:
df = wb.download(indicator=indicator, country=country, start=2000, end=2010)