In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.cluster import KMeans



In [2]:
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data')
df = np.asarray(df)
df

array([['M', 0.35, 0.265, ..., 0.0485, 0.07, 7],
       ['F', 0.53, 0.42, ..., 0.1415, 0.21, 9],
       ['M', 0.44, 0.365, ..., 0.114, 0.155, 10],
       ..., 
       ['M', 0.6, 0.475, ..., 0.2875, 0.308, 9],
       ['F', 0.625, 0.485, ..., 0.261, 0.29600000000000004, 10],
       ['M', 0.71, 0.555, ..., 0.3765, 0.495, 12]], dtype=object)

In [3]:
Y = df[:,0]
X = df[:, 1:]
#scale 
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)



# Classification model. Decision tree.

In [4]:
future_tree = RandomForestClassifier()
future_tree.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [5]:
prediction = future_tree.predict(X_test)
confusion_matrix(Y_test,prediction)

array([[135,  37, 152],
       [ 45, 260,  42],
       [153,  77, 143]])

In [6]:
print classification_report(Y_test,prediction)

             precision    recall  f1-score   support

          F       0.41      0.42      0.41       324
          I       0.70      0.75      0.72       347
          M       0.42      0.38      0.40       373

avg / total       0.51      0.52      0.51      1044



In [7]:
accuracy_score(Y_test, prediction)

0.51532567049808431

# K-means

In [8]:
kmeans = KMeans(n_clusters = 5, init='k-means++')
kmeans.fit(X)
prediction_clusters = kmeans.predict(X)

In [9]:
#Train a model using the same technique as above for each cluster
X=np.column_stack((X,prediction_clusters))

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.75)
c_train=X_train[:,-1]
c_test=X_test[:,-1]

#scale
scalerx = preprocessing.StandardScaler().fit(X_train)
X_train=scalerx.transform(X_train)
X_test=scalerx.transform(X_test)

In [10]:
X_train0,y_train0=X_train[np.where(c_train==0)],Y_train[np.where(c_train==0)]
X_train1,y_train1=X_train[np.where(c_train==1)],Y_train[np.where(c_train==1)]
X_train2,y_train2=X_train[np.where(c_train==2)],Y_train[np.where(c_train==2)]
X_train3,y_train3=X_train[np.where(c_train==3)],Y_train[np.where(c_train==3)]
X_train4,y_train4=X_train[np.where(c_train==4)],Y_train[np.where(c_train==4)]

X_test0,y_test0=X_test[np.where(c_test==0)],Y_test[np.where(c_test==0)]
X_test1,y_test1=X_test[np.where(c_test==1)],Y_test[np.where(c_test==1)]
X_test2,y_test2=X_test[np.where(c_test==2)],Y_test[np.where(c_test==2)]
X_test3,y_test3=X_test[np.where(c_test==3)],Y_test[np.where(c_test==3)]
X_test4,y_test4=X_test[np.where(c_test==4)],Y_test[np.where(c_test==4)]

In [11]:
future_tree = RandomForestClassifier()

future_tree.fit(X_train0,y_train0)
pred0=future_tree.predict(X_test0)

future_tree.fit(X_train1,y_train1)
pred1=future_tree.predict(X_test1)

future_tree.fit(X_train2,y_train2)
pred2=future_tree.predict(X_test2)

future_tree.fit(X_train3,y_train3)
pred3=future_tree.predict(X_test3)

future_tree.fit(X_train4,y_train4)
pred4=future_tree.predict(X_test4)

In [12]:
Y_test=np.concatenate((y_test0, y_test1,y_test2, y_test3, y_test4), axis=0)
prediction=np.concatenate((pred0, pred1, pred2, pred3, pred4), axis=0)

In [13]:
confusion_matrix(Y_test,prediction)

array([[162,  42, 136],
       [ 43, 232,  41],
       [163,  75, 150]])

In [14]:
print classification_report(Y_test,prediction)

             precision    recall  f1-score   support

          F       0.44      0.48      0.46       340
          I       0.66      0.73      0.70       316
          M       0.46      0.39      0.42       388

avg / total       0.52      0.52      0.52      1044



In [15]:
accuracy_score(Y_test, prediction)

0.52107279693486586