In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.proportion import proportions_ztest
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)


def load():
    data = pd.read_csv("diabetes.csv")
    data["Insulin"] = data["Insulin"].replace(0, np.nan)
    data["Glucose"] = data["Glucose"].replace(0, np.nan)
    return data


df = load()
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin    BMI  DiabetesPedigreeFunction  Age  Outcome
0            6  148.000             72             35      NaN 33.600                     0.627   50        1
1            1   85.000             66             29      NaN 26.600                     0.351   31        0
2            8  183.000             64              0      NaN 23.300                     0.672   32        1
3            1   89.000             66             23   94.000 28.100                     0.167   21        0
4            0  137.000             40             35  168.000 43.100                     2.288   33        1


In [3]:
#############################################
# Feature Extraction (Özellik Çıkarımı)
#############################################

#############################################
# Glucose column configured
#############################################

df["New_Is_Glucose_Missing"] = df["Glucose"].notnull().astype('int')
print(df.groupby("New_Is_Glucose_Missing").agg({"Outcome": "mean"}))
print(df.head())

test_stat, pvalue = proportions_ztest(count=[df.loc[df["New_Is_Glucose_Missing"] == 1, "Outcome"].sum(),
                                             df.loc[df["New_Is_Glucose_Missing"] == 0, "Outcome"].sum()], #Outcome olanların sayısı
                                      nobs=[df.loc[df["New_Is_Glucose_Missing"] == 1, "Outcome"].shape[0],
                                            df.loc[df["New_Is_Glucose_Missing"] == 0, "Outcome"].shape[0]])#Gözlem sayısı

                        Outcome
New_Is_Glucose_Missing         
0                         0.400
1                         0.349
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin    BMI  DiabetesPedigreeFunction  Age  Outcome  New_Is_Glucose_Missing
0            6  148.000             72             35      NaN 33.600                     0.627   50        1                       1
1            1   85.000             66             29      NaN 26.600                     0.351   31        0                       1
2            8  183.000             64              0      NaN 23.300                     0.672   32        1                       1
3            1   89.000             66             23   94.000 28.100                     0.167   21        0                       1
4            0  137.000             40             35  168.000 43.100                     2.288   33        1                       1


In [5]:
#############################################
# Aykırı Değer Problemini Çözme
#############################################

###################
# Baskılama Yöntemi
###################

def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 2.1 * interquantile_range
    low_limit = quartile1 - 2.1 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False


def grab_col_names(dataframe, cat_th=30, car_th=50):

    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

def convert_to_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit


cat_cols, num_cols, cat_but_car = grab_col_names(df)

columns = [col for col in num_cols]

for col in columns:
    convert_to_thresholds(df, col)

Observations: 768
Variables: 10
cat_cols: 3
num_cols: 7
cat_but_car: 0
num_but_cat: 3


In [7]:
#############################################
# Missing Values (Eksik Değerler)
# Eksik değerler medyan değerleri ile değiştirilir.
#############################################

null_cols = [col for col in df.columns if df[col].isnull().sum() > 0]

def replace_with_mean(dataframe, variable):
    variable_mean = dataframe[variable].mean()
    dataframe[variable] = dataframe[variable].replace(np.nan, variable_mean)


for col in null_cols:
    replace_with_mean(df, col)

In [8]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,New_Is_Glucose_Missing
0,6,148.0,72.0,35.0,150.068,33.6,0.627,50.0,1,1
1,1,85.0,66.0,29.0,150.068,26.6,0.351,31.0,0,1
2,8,183.0,64.0,0.0,150.068,23.3,0.672,32.0,1,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0,1
4,0,137.0,40.0,35.0,168.0,43.1,1.429,33.0,1,1
5,5,116.0,74.0,0.0,150.068,25.6,0.201,30.0,0,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1,1
7,10,115.0,24.2,0.0,150.068,35.3,0.134,29.0,0,1
8,2,197.0,70.0,45.0,428.875,30.5,0.158,53.0,1,1
9,8,125.0,96.0,0.0,150.068,7.77,0.232,54.0,1,1


In [18]:
ohe_cols = [col for col in df.columns if 20 >= df[col].nunique() > 2]
ohe_cols

['Pregnancies']

In [23]:
#############################################
# 3. Encoding Label Encoding
#############################################

def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

binary_cols = [col for col in df.columns if df[col].dtype not in [int, float]
               and df[col].nunique() == 2]

for col in binary_cols:
    label_encoder(df, col)


In [24]:
#############################################
# Feature Scaling (Özellik Ölçeklendirme)
# RobustScaler: Medyanı çıkar iqr'a böl.
#############################################

def robust_scaler(dataframe, variable):
  rs = RobustScaler()
  df[variable + '_robust_scaled'] = rs.fit_transform(dataframe[[variable]])
  df.describe().T

for col in num_cols:
    robust_scaler(df, col)


In [26]:
#############################################
# 8. Model
#############################################

y = df["Outcome"]
X = df.drop(["New_Is_Glucose_Missing", "Outcome"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

print(accuracy_score(y_pred, y_test))

0.7272727272727273


In [27]:
#############################################
# Özellik Mühendisliği yapmadan elde edilen skor
#############################################

dff = load()
dff.dropna(inplace=True)
y = dff["Outcome"]
X = dff.drop(["Outcome"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(accuracy_score(y_pred, y_test))

0.7627118644067796
