In [24]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union

from sklearn import metrics

resp = pd.read_csv('../project/data/resp.csv', index_col=0)

In [25]:
features = resp[['exercise', 'food_amount', 'cat_occ', 'stores']]
response = resp.income_lvl

X_train, X_test, y_train, y_test = train_test_split(features, response, random_state=1)

X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

## KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

preds = knn.predict(X_test)
metrics.accuracy_score(y_test, preds)

0.81379310344827582

In [27]:
knn.score(X_test_scaled, y_test)

0.69310344827586212

In [28]:
cross_val_score(knn, X_test_scaled, y_test, cv=5, scoring='accuracy').mean()

0.77616222760290565

In [29]:
# knn.predict([[2, 1, 3, 3, 2], [2, 3, 5, 2, 2], [1, 1, 9, 5, 3]])

## Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

preds = logreg.predict(X_test)
metrics.accuracy_score(y_test, preds)

0.81379310344827582

In [31]:
logreg.score(X_test_scaled, y_test)

0.80689655172413788

In [32]:
cross_val_score(logreg, X_test_scaled, y_test, cv=5, scoring='accuracy').mean()

0.82113801452784507

## Pipeline

In [33]:
logreg = LogisticRegression()
filter1 = SelectKBest(k=2)
pipe = Pipeline([('anova', filter1), ('logistic', logreg)])

pipe.fit(X_train_scaled, y_train)

pipe.score(X_test_scaled, y_test)

0.81034482758620685

In [34]:
cross_val_score(pipe, X_test_scaled, y_test, cv=5, scoring='accuracy').mean()

0.82113801452784507

## Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X_train, y_train)

treeclf.score(X_test, y_test)

0.80689655172413788

In [36]:
cross_val_score(treeclf, X_test, y_test, cv=5, scoring='accuracy').mean()

0.79686440677966108

## Bagged Descision Tree

In [37]:
from sklearn.ensemble import BaggingClassifier
bagclf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, bootstrap=True, oob_score=True, random_state=1)

bagclf.fit(X_train, y_train)

bagclf.score(X_test, y_test)

0.79655172413793107

In [38]:
cross_val_score(bagclf, X_test, y_test, cv=5, scoring='accuracy').mean()

0.78347457627118633

## Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

rfclf = RandomForestClassifier(n_estimators=1000)
rfclf.fit(X_train, y_train)

rfclf.score(X_test, y_test)

0.79655172413793107

In [40]:
cross_val_score(rfclf, X_test, y_test, cv=5, scoring='accuracy').mean()

0.79014124293785315

In [41]:
from sklearn.ensemble import VotingClassifier

voteclf = VotingClassifier(estimators=[('lr', logreg), ('rf', rfclf), ('dt', treeclf)], voting='hard')
voteclf.fit(X_train, y_train)

voteclf.score(X_test, y_test)

0.80344827586206902

## Clustering Weekly Income

In [42]:
resp_cluster = resp[['exercise', 'food_amount', 'stores' , 'cat_occ', 'income_weekly']]
X = resp_cluster.drop('income_weekly', axis=1)

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

km = KMeans(n_clusters=5, random_state=1)
km.fit(X_scaled)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001,
    verbose=0)

In [43]:
resp_cluster['cluster'] = km.labels_
resp_cluster.groupby('cluster').mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,exercise,food_amount,stores,cat_occ,income_weekly
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.703125,2.984375,4.578125,2.328125,1026.26771
1,1.577143,3.0,1.36,5.594286,828.378144
2,1.455882,1.573529,1.5,2.735294,691.386437
3,2.0,3.0,1.194958,1.593277,1063.648576
4,0.992188,3.0,1.257812,1.886719,783.014386


In [44]:
from sklearn import metrics
metrics.silhouette_score(X_scaled, km.labels_)

0.56934269199958576

In [22]:
k_range = range(2, 20)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_scaled)
    scores.append(metrics.silhouette_score(X_scaled, km.labels_))

In [23]:
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)

NameError: name 'plt' is not defined

## Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB(fit_prior=False)
nb.fit(X_train, y_train)

In [None]:
nb.score(X_test, y_test)