In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file found under /kaggle/input and print its full path.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the appointments CSV from the Kaggle input directory into a DataFrame.
appointments_csv_path = '/kaggle/input/noshowappointments/KaggleV2-May-2016.csv'
data = pd.read_csv(appointments_csv_path)

In [None]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

In [None]:
# Count the missing values in each column and print the per-column totals.
null_counts = data.isna().sum()
print(null_counts)

In [None]:
import matplotlib.pyplot as plt  # plotting library used for the charts

# Bar chart of how many rows carry each 'No-show' value.
no_show_counts = data['No-show'].value_counts()
ax = no_show_counts.plot(kind='bar')
ax.set_title('Randevuya Gelme Durumu İstatistikleri')
ax.set_xlabel('No-show')
ax.set_ylabel('Sayı')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the 'Age' column: 30 bins on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['Age'], bins=30, edgecolor='black', alpha=0.7)

# Title and axis labels
ax.set_title('Yaş Dağılımı')
ax.set_xlabel('Yaş')
ax.set_ylabel('Frekans')

# Render the figure
plt.show()

In [None]:
# Stacked bar chart of the Gender x No-show contingency table.
gender_by_no_show = pd.crosstab(data['Gender'], data['No-show'])
ax = gender_by_no_show.plot(kind='bar', stacked=True)
ax.set_title('Cinsiyet ve Randevuya Gelme Durumu (Crosstab)')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Data preprocessing: parse both timestamp columns into pandas datetimes.
for date_column in ('ScheduledDay', 'AppointmentDay'):
    data[date_column] = pd.to_datetime(data[date_column])

In [None]:
# Whole-day gap between booking and appointment, computed column-wise:
# vectorised timedelta arithmetic replaces the Python-level row-by-row apply
# and yields the same `.days` value for every row.
# NOTE(review): a negative value means AppointmentDay precedes ScheduledDay.
# If ScheduledDay carries a time of day while AppointmentDay is at midnight,
# same-day bookings come out as -1 — confirm against the data.
data['AppointmentDiff'] = (data['AppointmentDay'] - data['ScheduledDay']).dt.days

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical columns. drop='first' omits the first
# category of each feature; sparse_output=False returns a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_columns = ['Gender', 'Neighbourhood']

# Fit and transform the Gender and Neighbourhood columns
encoded_cols = encoder.fit_transform(data[categorical_columns])

# Build the encoded frame on the SAME index as `data`: pd.concat(axis=1)
# aligns on index labels, so a default RangeIndex here would misalign rows
# (and introduce NaNs) whenever `data` is not indexed 0..n-1.
encoded_df = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Swap the original categorical columns for their encoded counterparts
data = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0 (any other value becomes NaN).
no_show_codes = {'Yes': 1, 'No': 0}
data['No-show'] = data['No-show'].map(no_show_codes)

In [None]:
# Keep only rows with a non-negative Age. The explicit .copy() makes the
# result an independent frame, so adding columns to it later (e.g. 'Cluster')
# is a plain assignment rather than a write to a slice of the original
# (pandas' SettingWithCopyWarning case).
data = data.loc[data['Age'] >= 0].copy()

In [None]:
# Supervised learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Feature matrix: everything except the target, the raw timestamps and the
# identifier columns. drop() returns a new frame, so `data` itself is untouched.
non_feature_columns = ['No-show', 'ScheduledDay', 'AppointmentDay', 'AppointmentID', 'PatientId']
X = data.drop(columns=non_feature_columns)

# Target vector
y = data['No-show']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a seeded 100-tree random forest on the training split.
rf_settings = {'n_estimators': 100, 'random_state': 42}
random_forest_model = RandomForestClassifier(**rf_settings)
random_forest_model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows.
y_pred = random_forest_model.predict(X=X_test)

In [None]:
# Score the baseline model on the test split and print accuracy followed by
# the per-class classification report.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}", "\nClassification Report:\n", report_text, sep="\n")

In [None]:
# Unsupervised learning
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the feature matrix, then transform that same matrix.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Seeded three-cluster k-means on the scaled features.
kmeans = KMeans(random_state=42, n_clusters=3)
kmeans.fit(X_scaled)

In [None]:
# Attach the fitted cluster label of each row to the frame.
cluster_labels = kmeans.labels_
data['Cluster'] = cluster_labels

In [None]:
import matplotlib.pyplot as plt

# Age vs. AppointmentDiff scatter, one series per cluster label
# (labels are visited in their order of first appearance in the frame).
fig, ax = plt.subplots(figsize=(8, 6))
for cluster in data['Cluster'].unique():
    in_cluster = data['Cluster'] == cluster
    ax.scatter(
        data.loc[in_cluster, 'Age'],
        data.loc[in_cluster, 'AppointmentDiff'],
        label=f'Cluster {cluster}',
    )

ax.set_title('Yaş ve Randevu Süresi Verilerine Göre Kümeler')
ax.set_xlabel('Yaş')
ax.set_ylabel('Randevu Süresi')
ax.legend(title='Küme')
ax.grid(True)
plt.show()

In [None]:
# Next, search for the best hyper-parameters.
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameter grid for RandomForestClassifier. The keys must be
# the estimator's real constructor argument names (n_estimators, max_depth,
# min_samples_split); any other key is rejected as an invalid parameter when
# a search sets it on the estimator.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

In [None]:
# Compact hyper-parameter grid: 2 x 2 x 2 = 8 combinations.
param_grid = {
    'n_estimators': list(range(50, 101, 50)),   # yields 50 and 100
    'max_depth': list(range(10, 21, 10)),       # yields 10 and 20
    'min_samples_split': [2, 5],                # fixed list
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Build the estimator inside this cell. Relying on a `model` variable that is
# only assigned further down the notebook raises NameError when the notebook
# is run top to bottom on a fresh kernel. The seed keeps the search repeatable.
search_estimator = RandomForestClassifier(random_state=42)

# Randomised search over `param_grid` with 3-fold cross-validation, scored by accuracy.
random_search = RandomizedSearchCV(
    estimator=search_estimator,
    param_distributions=param_grid,
    cv=3,
    scoring='accuracy',
    n_iter=10,  # try at most 10 combinations
    random_state=42
)

random_search.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Parameter grid and model
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

# Seed the forest: an unseeded RandomForestClassifier gives different
# cross-validation scores on every run, so the selected best parameters
# (and the accuracy reported from them) would not be reproducible.
model = RandomForestClassifier(random_state=42)

# Exhaustive 5-fold search over the grid on all available cores. scoring is
# spelled out for readability; accuracy is also what a classifier's default
# score() reports.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Print the best parameter combination found
print(f"Best parameters: {grid_search.best_params_}")

In [None]:
from sklearn.metrics import accuracy_score

# Take the estimator that the grid search refit with its best parameters
best_model = grid_search.best_estimator_

# Predict on the held-out test split
y_pred_best = best_model.predict(X_test)

# Compute and print the accuracy of the tuned model
optimized_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Optimized Accuracy: {optimized_accuracy}")