# Loading Libraries

In [1]:
import numpy as np #Library for Linear Algebra
import pandas as pd #Library for Data Manipulation

# Importing Standard Metropolitan Dataset

In [2]:
# Colab-only step: files.upload() lets the user pick a local file and saves it into the runtime's working directory
from google.colab import files
uploaded = files.upload() # the return value is kept but not referenced again in the cells shown; the CSV is read back by file name

Saving Standard Metropolitan Areas Dataset.csv to Standard Metropolitan Areas Dataset.csv


In [3]:
# Load the uploaded CSV (relative path, i.e. the current working directory) into the SMA DataFrame
SMA = pd.read_csv('Standard Metropolitan Areas Dataset.csv')

# Checking for missingness in variables using the info() function

In [4]:
SMA.info() # Prints each column's non-null count and dtype, plus the row count and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   land_area       99 non-null     int64  
 1   percent_city    99 non-null     float64
 2   percent_senior  99 non-null     float64
 3   physicians      99 non-null     int64  
 4   hospital_beds   99 non-null     int64  
 5   graduates       99 non-null     float64
 6   work_force      99 non-null     float64
 7   income          99 non-null     int64  
 8   region          99 non-null     int64  
 9   crime_rate      99 non-null     float64
dtypes: float64(5), int64(5)
memory usage: 7.9 KB


## There are no missing values in the SMA dataset
### There are ten (10) columns/features/variables in the SMA dataset and ninety-nine (99) rows
#### crime_rate is the outcome or target variable (y) and other variables excluding crime_rate are the input variables (x)

# Using the head() and tail() function on the SMA dataset
### These functions will further help in observing and understanding the SMA dataset

In [6]:
SMA.head() # Returns the first five rows of the SMA dataset

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,work_force,income,region,crime_rate
0,1384,78.1,12.3,25627,69678,50.1,4083.9,72100,1,75.55
1,3719,43.9,9.4,13326,43292,53.9,3305.9,54542,2,56.03
2,3553,37.4,10.7,9724,33731,50.6,2066.3,33216,1,41.32
3,3916,29.9,8.8,6402,24167,52.2,1966.7,32906,2,67.38
4,2480,31.5,10.5,8502,16751,66.1,1514.5,26573,4,80.19


#### It can be observed that the numeric values above are either int (whole numbers) or float (decimal numbers).
#### It is important to note that the region is a categorical variable.

# Checking for the unique value of the categorical variable (region)

In [7]:
SMA['region'].unique() # Returns the distinct values found in the region column

array([1, 2, 4, 3])

# The unique values above show that there are four regions in the SMA dataset
### From kaggle.com these regions include:
#### 1 = North-East, 2 = North-Central, 3 = South, 4 = West.

## In the case of categorical variables, it is usually necessary to do one-hot encoding, i.e. converting each category into binary (0/1) columns, but the instruction asks for a non-technical decision tree model, so region is left as its integer code here

In [10]:
SMA.tail() # Returns the last five rows of the SMA dataset (index 94 to 98 here)

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,work_force,income,region,crime_rate
94,1511,38.7,10.7,348,1093,50.4,127.2,1452,4,70.66
95,1543,39.6,8.1,159,481,30.3,80.6,769,3,36.36
96,1011,37.8,10.5,264,964,70.7,93.2,1337,3,60.16
97,813,13.4,10.9,371,4355,58.0,97.0,1589,1,36.33
98,654,28.8,3.9,140,1296,55.1,66.9,1148,3,68.76


#### It can be reconfirmed from the tail function that there are 99 rows in the SMA dataset 

# Splitting features/variables into outcome/target variable (y) and input variables (x)

In [11]:
# crime_rate is the outcome/target (y); every other column is an input variable (x)
y = SMA['crime_rate']
x = SMA.drop(columns=['crime_rate']) # columns= drops by column label, same as passing the labels with axis=1

In [13]:
x.info() # Re-checks the non-null counts and dtypes, this time for the input variables only

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   land_area       99 non-null     int64  
 1   percent_city    99 non-null     float64
 2   percent_senior  99 non-null     float64
 3   physicians      99 non-null     int64  
 4   hospital_beds   99 non-null     int64  
 5   graduates       99 non-null     float64
 6   work_force      99 non-null     float64
 7   income          99 non-null     int64  
 8   region          99 non-null     int64  
dtypes: float64(4), int64(5)
memory usage: 7.1 KB


## There are no missing values in the input variables.


### Describing the input variables

In [14]:
x.describe() # Summary statistics (count, mean, std, min, quartiles, max) for each input variable

Unnamed: 0,land_area,percent_city,percent_senior,physicians,hospital_beds,graduates,work_force,income,region
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,2615.727273,42.518182,9.781818,1828.333333,6345.868687,54.463636,449.366667,6762.505051,2.494949
std,3045.82621,17.348277,2.524547,3192.199763,9136.202716,7.773286,610.990885,10393.34966,1.013921
min,47.0,13.4,3.9,140.0,481.0,30.3,66.9,769.0,1.0
25%,1408.0,30.1,8.35,459.0,2390.0,50.25,150.3,2003.0,2.0
50%,1951.0,39.5,9.7,774.0,3472.0,54.0,257.2,3510.0,3.0
75%,2890.5,52.6,10.75,1911.5,6386.5,58.3,436.5,6283.5,3.0
max,27293.0,100.0,21.8,25627.0,69678.0,72.8,4083.9,72100.0,4.0


# This will be explained thoroughly in my next code as the project instruction focuses on predicting crime_rate

# Building the Decision Tree Model
## A Decision Tree can be used for either a classification or a regression problem in machine learning
### In this case, crime rate is a continuous variable thus it is a regression problem.
#### Import the DecisionTreeRegressor for regression problem and DecisionTreeClassifier for classification problem
#### In this case, the DecisionTreeRegressor would be imported.


In [15]:
from sklearn.tree import DecisionTreeRegressor # regression variant of the decision tree, since crime_rate is continuous

# A fixed random_state makes the tree come out the same every time the cell is re-run.
# fit() returns the estimator itself, so construction and training can be chained.
SMA_model = DecisionTreeRegressor(random_state=1).fit(x, y)

SMA_model # last expression of the cell: shows the fitted estimator as the cell output

DecisionTreeRegressor(random_state=1)

#### We now have a fitted model that can be used to make predictions.
#### Make predictions for the first five rows of the training data

In [18]:
# Show the first five input rows followed by the tree's predictions for them.
# These rows were part of the data the model was fitted on, so this is an in-sample check.
print(
    "Predicting Crime Rate",
    x.head(),
    "The predictions are",
    SMA_model.predict(x.head()),
    sep="\n",
)

Predicting Crime Rate
   land_area  percent_city  percent_senior  physicians  hospital_beds  \
0       1384          78.1            12.3       25627          69678   
1       3719          43.9             9.4       13326          43292   
2       3553          37.4            10.7        9724          33731   
3       3916          29.9             8.8        6402          24167   
4       2480          31.5            10.5        8502          16751   

   graduates  work_force  income  region  
0       50.1      4083.9   72100       1  
1       53.9      3305.9   54542       2  
2       50.6      2066.3   33216       1  
3       52.2      1966.7   32906       2  
4       66.1      1514.5   26573       4  
The predictions are
[75.55 56.03 41.32 67.38 80.19]


## The predicted crime rates according to the Decision Tree Model are:
## 75.55, 56.03, 41.32, 67.38, & 80.19
## Note: the above predictions are for the first five rows of the training data, so they reproduce the existing crime_rate values seen in SMA.head() in the earlier command; they are not predictions for new, unseen data.

# Note: This ML Model is a non-technical one as instructed by the teacher. A technical ML Model would take into consideration splitting the SMA dataset into training and test datasets, evaluating the model on the training dataset as well as the test dataset with a regression metric (for example mean absolute error or R², since accuracy_score applies to classification problems), and other important things.


## This will be covered in my next ML Algorithm.