In [107]:
#Standard libs
import numpy as np 
import pandas as pd 

#Feature engineering, metrics and modeling libs
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

In [108]:
abalone = pd.read_csv("Abalone.csv")

In [109]:
abalone.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [110]:
abalone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [111]:
abalone.describe()
abalone.shape

(4177, 9)

In [112]:
len(abalone.isnull())

4177

In [113]:
#menganalisa variabel yang memiliki korelasi dengan variabel lain
numeric_features = abalone.select_dtypes(include=[np.number])
correlation = numeric_features.corr()
print(correlation['Rings'].sort_values(ascending=False))

Rings             1.000000
Shell weight      0.627574
Diameter          0.574660
Height            0.557467
Length            0.556720
Whole weight      0.540390
Viscera weight    0.503819
Shucked weight    0.420884
Name: Rings, dtype: float64


In [114]:
#Data Cleaning
from scipy import stats
z= np.abs(stats.zscore(abalone.select_dtypes(include=[np.number])))
print(z)
abalone_o = abalone[(z < 3).all(axis=1)]

        Length  Diameter    Height  Whole weight  Shucked weight  \
0     0.574558  0.432149  1.064424      0.641898        0.607685   
1     1.448986  1.439929  1.183978      1.230277        1.170910   
2     0.050033  0.122130  0.107991      0.309469        0.463500   
3     0.699476  0.432149  0.347099      0.637819        0.648238   
4     1.615544  1.540707  1.423087      1.272086        1.215968   
...        ...       ...       ...           ...             ...   
4172  0.341509  0.424464  0.609334      0.118813        0.047908   
4173  0.549706  0.323686  0.107991      0.279929        0.358808   
4174  0.632985  0.676409  1.565767      0.708212        0.748559   
4175  0.841182  0.777187  0.250672      0.541998        0.773341   
4176  1.549052  1.482634  1.326659      2.283681        2.640993   

      Viscera weight  Shell weight     Rings  
0           0.726212      0.638217  1.571544  
1           1.205221      1.212987  0.910013  
2           0.356690      0.207139  0.2896

In [115]:
#encoding
#Membuat binary kolom yang menunjukkan setiap nilai pada dataset
#dikarenakan model tidak bisa membaca  'M', 'F' atau 'I' ketika proses ini berlangsung
low_cardinality_cols = [cname for cname in abalone_o.columns if
                        abalone_o[cname].nunique() < 10 and 
                       abalone_o[cname].dtype == "object"]
numeric_cols = [cname for cname in abalone_o.columns if
                                 abalone_o[cname].dtype in ['int64','float64']]

my_cols = low_cardinality_cols + numeric_cols
abalone_predictors = abalone_o[my_cols]

In [116]:
abalone_predictors.dtypes.sample(7)

Sex                object
Length            float64
Rings               int64
Whole weight      float64
Shucked weight    float64
Diameter          float64
Shell weight      float64
dtype: object

In [141]:
#membuat 3 kolom baru yaitu fitur M', 'F' dan 'I'
abalone_encoded_predictors = pd.get_dummies(abalone_predictors)

In [118]:
from sklearn.pipeline import make_pipeline
cross_cols = ['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Sex_F','Sex_I','Sex_M']
X = abalone_encoded_predictors[cross_cols]
y = abalone_encoded_predictors.Rings

decision_pipeline = make_pipeline(DecisionTreeRegressor())
decision_scores = cross_val_score(decision_pipeline, X,y,scoring='neg_mean_absolute_error')

print('MAE %2f' %(-1 * decision_scores.mean()))

MAE 1.950023


In [119]:
#melakukan pembagian data set training dan testing
dt_train_X,dt_test_X,dt_train_y,dt_test_y = train_test_split(X,y)

In [120]:
#Decision tree mampu menentukan variasi dan berapa panyak leaf nodes yang diproduksi dan mennghasilkan hasil yang terbaik
def get_mae(max_leaf_nodes,dt_train_X,dt_test_X,dt_train_y,dt_test_y ):
    model_pipeline = make_pipeline(DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes,random_state=0))
    model_pipeline.fit(dt_train_X,dt_train_y)
    preds_val = model_pipeline.predict(dt_test_X)
    mae = mean_absolute_error(dt_test_y,preds_val)
    return(mae)

In [121]:
#Membuat pipeline untuk menentukan return mean absolute error
decision_split_pipeline = make_pipeline(DecisionTreeRegressor(max_leaf_nodes=5))
decision_split_pipeline.fit(dt_train_X,dt_train_y)
decision_tree_prediction = decision_split_pipeline.predict(dt_test_X)
print("MAE: " + str(mean_absolute_error(decision_tree_prediction,dt_test_y)))

MAE: 1.688271496150728


In [122]:
acc_decision = decision_split_pipeline.score(dt_test_X,dt_test_y)
print("Acc:", acc_decision )

Acc: 0.3714352010728543


In [123]:
#Random Forest
forest_pipeline = make_pipeline(RandomForestRegressor(random_state=1))
forest_scores = cross_val_score(forest_pipeline, X,y,scoring="neg_mean_absolute_error")
print('MAE %2f' %(-1 * forest_scores.mean()))

MAE 1.473488


In [124]:
#Pada Random Forest memiliki akurasi 55%, lebih tinggi daripada Decision tree
f_train_X,f_test_X,f_train_y,f_test_y = train_test_split(X,y)
forest_split_pipeline = make_pipeline(RandomForestRegressor(random_state=1))
forest_split_pipeline.fit(f_train_X,f_train_y)
forest_predictions = forest_split_pipeline.predict(f_test_X)
print("Accuracy:",forest_split_pipeline.score(f_test_X,f_test_y))
print("MAE:",str(mean_absolute_error(forest_predictions,f_test_y)))

Accuracy: 0.5516220619492421
MAE: 1.444498510427011


In [125]:
#Logistic Regresi
one_hot_encoders_abalone_df =  pd.get_dummies(abalone)
cols = one_hot_encoders_abalone_df.columns
abalone_clean_data = pd.DataFrame(one_hot_encoders_abalone_df,columns= cols)
abalone_clean_data.head(1)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1


In [126]:
#Scaling
#mengubah data menjadi data yang memiliki nilai rata-rata distribusi=0 dan standar deviasi=1
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
scaled_data =  pd.DataFrame(
    sc_X.fit_transform(abalone_clean_data[['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight']]),
                           columns=['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight'],
                           index=abalone_clean_data.index)

In [128]:
scaled_data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight
0,-0.574558,-0.432149,-1.064424,-0.641898,-0.607685,-0.726212
1,-1.448986,-1.439929,-1.183978,-1.230277,-1.17091,-1.205221
2,0.050033,0.12213,-0.107991,-0.309469,-0.4635,-0.35669
3,-0.699476,-0.432149,-0.347099,-0.637819,-0.648238,-0.6076
4,-1.615544,-1.540707,-1.423087,-1.272086,-1.215968,-1.287337


In [129]:
#Data Cleaning
abalone_clean_data_standard = abalone_clean_data.copy(deep=True)
abalone_clean_data_standard[['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight']] = scaled_data[['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight']]
abalone_clean_data_standard.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
0,-0.574558,-0.432149,-1.064424,-0.641898,-0.607685,-0.726212,0.15,15,0,0,1
1,-1.448986,-1.439929,-1.183978,-1.230277,-1.17091,-1.205221,0.07,7,0,0,1
2,0.050033,0.12213,-0.107991,-0.309469,-0.4635,-0.35669,0.21,9,1,0,0
3,-0.699476,-0.432149,-0.347099,-0.637819,-0.648238,-0.6076,0.155,10,0,0,1
4,-1.615544,-1.540707,-1.423087,-1.272086,-1.215968,-1.287337,0.055,7,0,1,0


In [130]:
x = abalone_clean_data_standard.drop(["Rings"],axis=1)
y = abalone_clean_data_standard.Rings
# y is float value and we will categorize ouput in two categories 0 and 1
y = np.where(y > 10,1,0)

In [131]:
train_x, test_x, train_y, test_y = train_test_split(x, y,random_state = 23,test_size=0.3)
train_y

array([0, 0, 0, ..., 0, 0, 1])

In [132]:
#Model Training
logreg = LogisticRegression()
logreg.fit(train_x,train_y)
y_pred = logreg.predict(test_x) 

In [133]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score , classification_report
print("accuracy: "+ str(accuracy_score(test_y,y_pred)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(test_y,y_pred)))
print("Mean squared error: {}".format(mean_squared_error(test_y,y_pred)))
print("R2 score: {}".format(r2_score(test_y, y_pred)))
print("intercept: {}".format(logreg.intercept_))

accuracy: 77.75119617224881%
Mean absolute error: 0.22248803827751196
Mean squared error: 0.22248803827751196
R2 score: 0.009240249313990034
intercept: [-1.5174329]


In [134]:
print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       827
           1       0.71      0.58      0.64       427

    accuracy                           0.78      1254
   macro avg       0.76      0.73      0.74      1254
weighted avg       0.77      0.78      0.77      1254



In [135]:
#Linear Regresi
xn = ['Sex']
xq = ['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']
x = xn + xq
# y data labels
y = ['Rings']

print(x)

['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']


In [136]:
print(y)

['Rings']


In [137]:
#Membuat model
from sklearn import linear_model
lmreg = linear_model.LinearRegression( fit_intercept = False )
lmreg.fit( abalone_clean_data_standard[xq], abalone_clean_data_standard[y] )

LinearRegression(fit_intercept=False)

In [138]:
lmreg.coef_

array([[ 2.73007582e-03,  6.25210601e-01,  3.00904926e-01,
        -3.63184386e+00, -1.22179099e+00,  2.78413384e-01,
         4.10737716e+01]])

In [139]:
lmreg.score(abalone_clean_data_standard[xq],abalone_clean_data_standard[y])

0.43354598461469307

In [144]:
mean = abalone_clean_data_standard['Rings'].mean()
var = np.sqrt(abalone_clean_data_standard['Rings'].var())
print ("scikit-learn linear regression model:")
print ("mean response value = %0.2f"%(mean))
print ("L2 residual = %0.2f"%(var))

scikit-learn linear regression model:
mean response value = 9.93
L2 residual = 3.22


In [None]:
#Dari keempat metode tersebut logistik regression memiliki tingkat akurasi paling tinggi untuk kasus data abalone dengan akurasi
# 77% dan decision tree memiliki akurasi paling rendah yaitu 37%