In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv("../input/obezite/yenidata.csv")

In [3]:
data.head()
#FAVC - frequency of consuming high-calorie foods: "no" and "yes"
#FCVC - frequency of vegetable intake in meals
#NCP - number of main meals
#CAEC - frequency of sub-meals between main meals: "no", "Sometimes", "Frequently" and "Always"
#CH2O - amount of daily water intake in liters
#SCC - a binary indicator of food calorie monitoring: "no" and "yes"
#FAF - frequency of physical activity
#TUE - time spent on technological devices
#CALC - a binary indicator of alcohol intake: "no", "Sometimes", "Frequently" and "Always"
#MTRANS - the usual means of transportation: "Automobile", "Bike", "Motorbike", "Public_Transportation" and "Walking"
#NObeyesdad - NObeyesdad, the obesity level as a factor with 7 levels: "Insufficient_Weight", "Normal_Weight", "Overweight_Level_I", "Overweight_Level_II", "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III".

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,1
1,0,21.0,1.52,56.0,1,0,3.0,3.0,2,1,3.0,1,3.0,0.0,2,3,1
2,1,23.0,1.8,77.0,1,0,2.0,3.0,2,0,2.0,0,2.0,1.0,1,3,1
3,1,27.0,1.8,87.0,0,0,3.0,3.0,2,0,2.0,0,2.0,0.0,1,4,5
4,1,22.0,1.78,89.8,0,0,2.0,1.0,2,0,2.0,0,0.0,0.0,2,3,6


In [4]:
x, y = data.iloc[:,:-1].values, data.iloc[:,-1].values

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
ns = np.arange(1, 20)
train_accuracy = []
test_accuracy = []
for i, k in enumerate(ns):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train,y_train)
    train_accuracy.append(knn.score(x_train, y_train))
    test_accuracy.append(knn.score(x_test, y_test))
print("Best accuracy is {} with K = {}".format(np.max(test_accuracy),1+test_accuracy.index(np.max(test_accuracy))))

Best accuracy is 0.851063829787234 with K = 1


In [8]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train,y_train)
print(knn.score(x_test, y_test))

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
print(knn.score(x_test, y_test))

0.851063829787234
0.8132387706855791


In [13]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)
print(lin_reg.score(x_test, y_test))

0.2753011137101834


In [14]:
y_train_bin = np.array([1 if i>4 else 0 for i in y_train])
y_test_bin = np.array([1 if i>4 else 0 for i in y_test])

In [15]:
x_train.shape

(1688, 16)

In [16]:
y_train_bin.shape

(1688,)

In [17]:
lin_reg.fit(x_train, y_train_bin)
y_pred = lin_reg.predict(x_test)
print(lin_reg.score(x_test, y_test_bin))

0.18235887005861828


In [54]:
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.transform(x_test)

poly_reg = LinearRegression()
poly_reg.fit(x_train_poly, y_train_bin)
print(lin_reg.intercept_, lin_reg.coef_)

y_pred = poly_reg.predict(x_test_poly)
print(poly_reg.score(x_test_poly, y_test_bin))

0.27191943127961926 [ 0.02236648  0.06554701  0.04132034 -0.16843933  0.06656982 -0.03521648
 -0.03840679 -0.04466768  0.09150002 -0.02280638  0.02187326  0.03341098
 -0.0136521   0.00694881 -0.05262706  0.01246197]
0.5192598852593359


In [18]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 0.1)
ridge.fit(x_train,y_train)
ridge_predict = ridge.predict(x_test)
print('Ridge score: ',ridge.score(x_test,y_test))

Ridge score:  0.2753026618932257


In [19]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.1)
lasso.fit(x_train,y_train)
lasso_predict = lasso.predict(x_test)
print('Lasso score: ',lasso.score(x_test,y_test))
print('Lasso coefficients: ',lasso.coef_)

Lasso score:  0.25297796997287647
Lasso coefficients:  [-0.          0.18208589 -0.07209884  0.43817941  0.22232296 -0.03237175
 -0.         -0.05223111  0.33644397 -0.          0.          0.
 -0.06097992 -0.         -0.10070389  0.        ]


In [20]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(x_train,y_train_bin)
y_pred = log_reg.predict(x_test)
print(log_reg.score(x_test,y_test_bin))

0.7494089834515366


In [22]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train,y_train_bin)
y_pred = svm.predict(x_test)
print(svm.score(x_test,y_test_bin))

0.9196217494089834


In [23]:
from sklearn.model_selection import GridSearchCV
grid = {'C': np.arange(1,30)}
svm_gscv = GridSearchCV(svm, grid, cv=5) 
svm_gscv.fit(x_train,y_train_bin)

print("Tuned hyperparameter c: {}".format(svm_gscv.best_params_)) 
print("Best score: {}".format(svm_gscv.best_score_))


Tuned hyperparameter c: {'C': 23}
Best score: 0.9283145751760223


In [26]:
svm = SVC(C=23)
svm.fit(x_train,y_train_bin)
y_pred = svm.predict(x_test)
print(svm.score(x_test,y_test_bin))

0.9432624113475178


In [24]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train,y_train)
y_pred = nb.predict(x_test)
print(nb.score(x_test,y_test))

0.5886524822695035


In [29]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
grid = {'criterion': ['entropy','gini','log_loss'],'max_depth': np.arange(1,50)}
dtree_gscv = GridSearchCV(dtree, grid, cv=5) 
dtree_gscv.fit(x_train,y_train)

print("Tuned hyperparameters: {}".format(dtree_gscv.best_params_)) 
print("Best score: {}".format(dtree_gscv.best_score_))

Tuned hyperparameter c: {'criterion': 'entropy', 'max_depth': 40}
Best score: 0.9472723122574755


In [30]:
dtree = DecisionTreeClassifier()
dtree.fit(x_train,y_train)
y_pred = dtree.predict(x_test)
print(dtree.score(x_test,y_test))

0.9408983451536643


In [31]:
dtree = DecisionTreeClassifier(criterion='entropy', max_depth=40)
dtree.fit(x_train,y_train)
y_pred = dtree.predict(x_test)
print(dtree.score(x_test,y_test))

0.9432624113475178


In [35]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
grid = {'criterion': ['entropy','gini','log_loss'],'n_estimators': np.arange(1,50)}
rf_gscv = GridSearchCV(rf, grid, cv=5) 
rf_gscv.fit(x_train,y_train)

print("Tuned hyperparameters: {}".format(rf_gscv.best_params_)) 
print("Best score: {}".format(rf_gscv.best_score_))

Tuned hyperparameters: {'criterion': 'log_loss', 'n_estimators': 48}
Best score: 0.9573437746914122


In [36]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print(rf.score(x_test,y_test))

0.9408983451536643


In [37]:
rf = RandomForestClassifier(criterion='log_loss', n_estimators=48)
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print(rf.score(x_test,y_test))

0.950354609929078


In [53]:
for name, score in zip(data.columns,rf.feature_importances_):
    print(name,score)

Gender 0.0671814395589035
Age 0.07380876651896139
Height 0.0919996667852228
Weight 0.4261019189428352
family_history_with_overweight 0.03148270538175101
FAVC 0.016089087337162786
FCVC 0.08468519863431935
NCP 0.045910100613393456
CAEC 0.026833710944253675
SMOKE 0.0013574320848386045
CH2O 0.03106297707120054
SCC 0.0033729620105899787
FAF 0.02728851160977695
TUE 0.03145556867587934
CALC 0.026545102918525073
MTRANS 0.014824850912386417


In [41]:
from sklearn.ensemble import VotingClassifier
vot_clf = VotingClassifier(
estimators=[('knn',knn),('nb',nb),('dt',dtree),('rf',rf)],
voting='hard'
)
vot_clf.fit(x_train,y_train)
y_pred = vot_clf.predict(x_test)
print(vot_clf.score(x_test,y_test))

0.9078014184397163


In [43]:
vot_clf = VotingClassifier(
estimators=[('knn',knn),('nb',nb),('dt',dtree),('rf',rf)],
voting='soft'
)
vot_clf.fit(x_train,y_train)
y_pred = vot_clf.predict(x_test)
print(vot_clf.score(x_test,y_test))

0.950354609929078


In [44]:
from sklearn.metrics import accuracy_score
for clf in (knn, nb, dtree, rf, vot_clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

KNeighborsClassifier 0.8132387706855791
GaussianNB 0.5886524822695035
DecisionTreeClassifier 0.950354609929078
RandomForestClassifier 0.9479905437352246
VotingClassifier 0.9456264775413712


In [45]:
#Bagging
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators=500, 
    max_samples=1000, 
    bootstrap=True, 
    n_jobs=-1
)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))



0.9550827423167849


In [46]:
#Pasting
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators=500, 
    max_samples=1000, 
    bootstrap=False, 
    n_jobs=-1
)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.9621749408983451


In [None]:
from sklearn.ensemble import AdaBoostClassifier
ada_boost = AdaBoostClassifier(
    DecisionTreeClassifier(criterion='entropy',max_depth=40), 
    n_estimators=200,
    algorithm="SAMME.R", 
    learning_rate=0.5
)
ada_boost.fit(x_train, y_train)
y_pred = ada_boost.predict(x_test)
print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
grad_boost = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
grad_boost.fit(x, y)
y_pred = grad_boost.predict(x_test)
print(accuracy_score(y_test, y_pred))