In [174]:
#import modules and set parameters
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

In [86]:
#read the training data
data = pd.read_csv(r'C:\Users\tolagu\Documents\lbr-train.csv')

In [87]:
#do a quick check of the training data
data.head()

Unnamed: 0.1,Unnamed: 0,ID,LOW,AGE,LWT,RACE,SMOKE,PTL,HT,UI,FTV,BWT
0,120,216,0,16,95,3,0,0,0,0,1,3997
1,156,37,1,17,130,3,1,1,0,1,0,2125
2,77,169,0,25,140,1,0,0,0,0,1,3416
3,140,20,1,21,165,1,1,0,1,0,1,1790
4,124,220,0,22,129,1,0,0,0,0,0,4111


In [88]:
#do another quick check of the training data
data.shape

(151, 12)

In [89]:
#check for null values in the training data
data.isnull().sum()

Unnamed: 0    0
ID            0
LOW           0
AGE           0
LWT           0
RACE          0
SMOKE         0
PTL           0
HT            0
UI            0
FTV           0
BWT           0
dtype: int64

In [90]:
#The 'RACE'feature is unordered. Create new column to encode 'White'
data['White'] = data.RACE.map({1:1, 2:0, 3:0})

In [91]:
#The 'RACE'feature is unordered. Create new column to encode 'Black'
data['Black']= data.RACE.map({1:0, 2:1, 3:0})

In [92]:
#The 'RACE'feature is unordered. Create new column to encode 'Other'
data['Other']= data.RACE.map({1:0, 2:0, 3:1})

In [93]:
#check new dataframe for the encoded features
data.head()

Unnamed: 0.1,Unnamed: 0,ID,LOW,AGE,LWT,RACE,SMOKE,PTL,HT,UI,FTV,BWT,White,Black,Other
0,120,216,0,16,95,3,0,0,0,0,1,3997,0,0,1
1,156,37,1,17,130,3,1,1,0,1,0,2125,0,0,1
2,77,169,0,25,140,1,0,0,0,0,1,3416,1,0,0
3,140,20,1,21,165,1,1,0,1,0,1,1790,1,0,0
4,124,220,0,22,129,1,0,0,0,0,0,4111,1,0,0


In [94]:
#The 'Unnamed' column not useful. Delete column
del data['Unnamed: 0']

In [95]:
#check that 'Unnamed' column has been deleted
data.head()

Unnamed: 0,ID,LOW,AGE,LWT,RACE,SMOKE,PTL,HT,UI,FTV,BWT,White,Black,Other
0,216,0,16,95,3,0,0,0,0,1,3997,0,0,1
1,37,1,17,130,3,1,1,0,1,0,2125,0,0,1
2,169,0,25,140,1,0,0,0,0,1,3416,1,0,0
3,20,1,21,165,1,1,0,1,0,1,1790,1,0,0
4,220,0,22,129,1,0,0,0,0,0,4111,1,0,0


In [96]:
#Generate summary stats for the training data
data.describe()

Unnamed: 0,ID,LOW,AGE,LWT,RACE,SMOKE,PTL,HT,UI,FTV,BWT,White,Black,Other
count,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0
mean,120.741722,0.317881,23.423841,129.172185,1.847682,0.370861,0.192053,0.059603,0.145695,0.847682,2946.13245,0.509934,0.13245,0.357616
std,63.37912,0.467202,5.454585,29.140067,0.921942,0.484643,0.472091,0.237537,0.353975,1.024684,717.087346,0.501565,0.340108,0.480893
min,10.0,0.0,14.0,80.0,1.0,0.0,0.0,0.0,0.0,0.0,1021.0,0.0,0.0,0.0
25%,67.5,0.0,19.0,110.0,1.0,0.0,0.0,0.0,0.0,0.0,2412.0,0.0,0.0,0.0
50%,116.0,0.0,23.0,120.0,1.0,0.0,0.0,0.0,0.0,1.0,2920.0,1.0,0.0,0.0
75%,178.0,1.0,26.5,140.0,3.0,1.0,0.0,0.0,0.0,1.0,3515.5,1.0,0.0,1.0
max,226.0,1.0,45.0,241.0,3.0,1.0,2.0,1.0,1.0,4.0,4990.0,1.0,1.0,1.0


In [97]:
#I'll be fitting the Linear Regression model first. 
#For Linear Regression, I will try and predict 'BWT' as the target feature 
#since 'BWT' is a continous feature
#First, create the target variable Y and features X
feature_cols = ['AGE', 'LWT', 'SMOKE', 'PTL', 'HT', 'UI', 'FTV', 'White', 'Black', 'Other']
X = data[feature_cols]
Y = data['BWT']

In [98]:
#create a variable for the Linear Regression model and fit the model
linreg = LinearRegression()
linreg.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [99]:
#print the coefficients
print linreg.intercept_
print linreg.coef_

2439.37677394
[  -1.97093149    5.58547823 -282.77279277 -103.87223949 -625.53761402
 -608.10756364  -11.11181041  251.01731928 -236.80084241  -14.21647687]


In [100]:
#pair feature names with the coefficients
zip(feature_cols, linreg.coef_)

[('AGE', -1.9709314887938811),
 ('LWT', 5.5854782273527999),
 ('SMOKE', -282.77279277399873),
 ('PTL', -103.87223948901413),
 ('HT', -625.53761401989163),
 ('UI', -608.10756363765063),
 ('FTV', -11.11181040887702),
 ('White', 251.01731927529019),
 ('Black', -236.80084240855859),
 ('Other', -14.216476866731938)]

In [101]:
#import test data set
data_test = pd.read_csv(r'C:\Users\tolagu\Documents\lbr-test.csv')

In [102]:
data_test.head()

Unnamed: 0.1,Unnamed: 0,ID,LOW,AGE,LWT,RACE,SMOKE,PTL,HT,UI,FTV,BWT
0,5,89,0,18,107,1,1,0,0,1,0,2600
1,13,98,0,22,95,3,0,0,1,0,0,2750
2,31,117,0,17,113,2,0,0,0,0,1,2920
3,37,124,0,19,138,1,1,0,0,0,2,2977
4,38,125,0,27,124,1,1,0,0,0,0,2992


In [103]:
#perform necessary transformation on test data
data_test['White'] = data.RACE.map({1:1, 2:0, 3:0})
data_test['Black']= data.RACE.map({1:0, 2:1, 3:0})
data_test['Other']= data.RACE.map({1:0, 2:0, 3:1})

In [104]:
#check test data
data_test.head()

Unnamed: 0.1,Unnamed: 0,ID,LOW,AGE,LWT,RACE,SMOKE,PTL,HT,UI,FTV,BWT,White,Black,Other
0,5,89,0,18,107,1,1,0,0,1,0,2600,0,0,1
1,13,98,0,22,95,3,0,0,1,0,0,2750,0,0,1
2,31,117,0,17,113,2,0,0,0,0,1,2920,1,0,0
3,37,124,0,19,138,1,1,0,0,0,2,2977,1,0,0
4,38,125,0,27,124,1,1,0,0,0,0,2992,1,0,0


In [105]:
#delete the Unnamed column
del data_test['Unnamed: 0']

In [106]:
#check test data
data_test.head()

Unnamed: 0,ID,LOW,AGE,LWT,RACE,SMOKE,PTL,HT,UI,FTV,BWT,White,Black,Other
0,89,0,18,107,1,1,0,0,1,0,2600,0,0,1
1,98,0,22,95,3,0,0,1,0,0,2750,0,0,1
2,117,0,17,113,2,0,0,0,0,1,2920,1,0,0
3,124,0,19,138,1,1,0,0,0,2,2977,1,0,0
4,125,0,27,124,1,1,0,0,0,0,2992,1,0,0


In [107]:
#set a parameter for values of 'BWT' in the test data
true_test = data_test['BWT']

In [108]:
#set the features
test_feature_cols = ['AGE', 'LWT', 'SMOKE', 'PTL', 'HT', 'UI', 'FTV', 'White', 'Black', 'Other']
X_test = data_test[test_feature_cols]

In [109]:
#use fitted model to make prediction on the test data
Y_test_predict = linreg.predict(X_test)

In [110]:
#derive Mean Squared Error for the Y_test_predict vs true_test
mnse = np.sqrt(metrics.mean_squared_error(true_test, Y_test_predict))

In [111]:
#MNSE
mnse

810.32704477164373

In [112]:
#Repeat process above for the K-Nearest Neighbor (KNN)
#Since KNN is classification, the attribute to predict is LOW
#define X and Y for knn on the training dataset
feature_k = ['AGE', 'LWT', 'SMOKE', 'PTL', 'HT', 'UI', 'FTV', 'White', 'Black', 'Other']
X_k = data[feature_k]
Y_k = data['LOW']

In [113]:
#write the knn model into a variable and use it to train the training data, where number of neighbors (k) = 5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_k, Y_k)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [114]:
#format test data
#set a parameter for values of 'LOW' in the test data
true_test_k = data_test['LOW']

In [126]:
#set the features in the test data
test_feature_cols_k = ['AGE', 'LWT', 'SMOKE', 'PTL', 'HT', 'UI', 'FTV', 'White', 'Black', 'Other']
X_test_k = data_test[test_feature_cols_k]

In [127]:
#make predictions on the test model using trained model
k_predict = knn.predict(X_test_k)

In [128]:
k_predict

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [129]:
#compute accuracy for the knn model
print metrics.accuracy_score(true_test_k,k_predict)*100

55.2631578947


In [167]:
#Repeat process above for the K-Means (KM)
#I will use K-Means as an unsupervised learning model to cluster BWT into 5 clusters on the test data set
#define X for kmeans on the training dataset
feature_km = ['BWT']
X_km = data_test[feature_km]

In [168]:
#write the kmeans model into a variable and use it to train the training data, where number of clusters (k) = 5
kmeans = KMeans(n_clusters=5)
kmeans.fit(X_km)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [169]:
centroid = kmeans.cluster_centers_
labels = kmeans.labels_
print (centroid)
print(labels)

[[ 1962.25      ]
 [ 3502.09090909]
 [ 4238.33333333]
 [ 2935.26666667]
 [  709.        ]]
[3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 2 2 2 4 0 0 0 0 0 0 0 0 3
 3]


In [141]:
#Decision Tree
#Since Decision Tree is a classification model, the attribute to classify is LOW
#define X and Y on the training dataset
feature_cols_dt = ['AGE', 'LWT', 'SMOKE', 'PTL', 'HT', 'UI', 'FTV', 'White', 'Black', 'Other']
X_dt = data_test[feature_cols_dt]
Y_dt = data_test['LOW']

In [142]:
#write the decision tree model into a variable and use it to train the training data, with the tree depth = 3
treeclf = DecisionTreeClassifier(max_depth = 3, random_state=1)
treeclf.fit(X_dt, Y_dt)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [143]:
#set the features in the test data
test_feature_cols_dt = ['LOW']
X_test_dt = data_test[test_feature_cols_km]

In [171]:
#make predictions on the test data using trained decision tree model and print result
dt_X_test = treeclf.predict(X_dt)
dt_X_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1], dtype=int64)

In [153]:
#to determine performanace of the decision tree model, count the number of 1s and 0s in the test data set 
#and compare with 1s and 0s in the prediction results
#count of 1s and 0s in test data set
Y_dt.value_counts()

0    27
1    11
Name: LOW, dtype: int64

In [173]:
#count the number of 1s and 0s in the predicted data
counts_of_dt = dt_X_test
df_dt = pd.DataFrame(counts_of_dt)
df_dt.columns = ['Result']
df_dt_result = df_dt['Result']
df_dt_result.value_counts()

0    34
1     4
Name: Result, dtype: int64