In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

### import data

In [None]:
# Import data: read the bike-sharing CSV directly from its GitHub-hosted URL.
df=pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1/main/Datasets/bikes_sharing.csv')

### data frame check

In [None]:
# 2. Data frame checks: preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Column dtypes and non-null counts: use these to spot missing or incorrectly typed data.
df.info()

In [None]:
# Summary statistics: do the ranges make sense? Are there outliers?
# Also use this to decide which columns are categorical and which are numeric.
df.describe()

In [None]:
# List the distinct values of the 'ca' column to check for unexpected entries.
df['ca'].unique()

### data cleaning

In [None]:
# 3. Data cleaning
# Share of missing values per column, expressed as a percentage of all rows.
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
# Alternative view: raw count of missing values per column.
print(df.isnull().sum())

# Fill missing data: 'Age' gets the mean age of the row's 'Sex' group,
# and 'Embarked' (the port) gets the value 'S'.
mean_age_by_sex = df.groupby('Sex')['Age'].transform('mean')
df['Age'] = df['Age'].fillna(mean_age_by_sex)
df['Embarked'] = df['Embarked'].fillna('S')

# Drop the columns that are not useful for the analysis.
df.drop(
    columns=['alive', 'alone', 'embark_town', 'who', 'adult_male', 'deck', 'class'],
    inplace=True,
)

In [None]:
# drop_duplicates() returns a new frame rather than modifying `df`, so the
# result has to be assigned back for the duplicates to actually be removed.
df = df.drop_duplicates()
# Show the remaining (rows, columns) so the effect of the de-duplication is visible.
df.shape

### data exploration

In [None]:
# 4. Data exploration
# Display the distribution of the target column 'Survived' (one bar per class).
sns.countplot(x='Survived', data=df)
plt.show()

# More detailed exploration: pairwise plots of the columns, coloured by 'Survived'.
sns.pairplot(df, hue='Survived')
plt.show()

# Use a box plot to check a continuous variable against the target.
# NOTE(review): 'target value' looks like a placeholder for the real target
# column name - replace it before running; confirm against the dataset in use.
sns.boxplot(x='target value', y='age', data=df) 

In [None]:
import matplotlib.pyplot as plt

# Box plots of each continuous feature split by the DEATH_EVENT outcome,
# laid out on a 2x3 grid. The features are listed once and plotted in a loop
# instead of six copy-pasted calls; `axes.flat` walks the grid row by row, so
# the panel order is the same as before.
continuous_features = [
    'age', 'ejection_fraction', 'serum_sodium',
    'creatinine_phosphokinase', 'platelets', 'serum_creatinine',
]
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, feature in zip(axes.flat, continuous_features):
    sns.boxplot(x='DEATH_EVENT', y=feature, ax=ax, data=df)
# Render the figure explicitly so the cell does not echo a stray Axes repr.
plt.show()

In [None]:
# Drop the outliers: collect the index labels of rows with more than 30
# bedrooms, then remove those rows from the frame in place.
bedrooms_outliers = df.index[df['bedrooms'] > 30]
df.drop(index=bedrooms_outliers, inplace=True)

In [None]:
# Use groupby to check the categorical columns.
# Count rows per (DEATH_EVENT, anaemia) pair - 'sex' is only used as the column
# being counted - then unstack so each anaemia value becomes its own column.
df_anaemia = df.groupby(['DEATH_EVENT','anaemia'])[['sex']].count().unstack()
# Row total across the anaemia == 0 and anaemia == 1 columns.
df_anaemia['total'] = df_anaemia['sex'][0] + df_anaemia['sex'][1]
# Share of each anaemia value within the DEATH_EVENT row.
df_anaemia['percent_0'] = df_anaemia['sex'][0] / df_anaemia['total']
df_anaemia['percent_1'] = df_anaemia['sex'][1] / df_anaemia['total']
# NOTE(review): a bare expression is only displayed when it is the last
# statement of a cell, so this line shows nothing in its current position.
df_anaemia

# Same kind of check as a stacked bar chart: counts of 'ca' per (hd_cons, thal).
df.groupby(['hd_cons','thal'])['ca'].count().unstack().plot(kind='bar', stacked=True)
# or
# See how the values of the 'embarked' column are distributed across the other
# columns; this shows which 'embarked' value is the most frequent, and that
# value can then be used to fill in missing entries.
df.groupby('embarked').count()

### feature engineering

In [None]:
# 5. Feature engineering
# Generate new features:
# translate a continuous variable into groups.
# Plot the distribution of `casual` before choosing the group boundaries.
sns.histplot(df.casual)

# Summary statistics of `casual`.
# NOTE(review): not displayed here because it is not the last statement of the cell.
df.casual.describe()

def casual_category(x):
    """Bucket a `casual` value into an ordinal category.

    Returns:
        0 for 0 <= x <= 4, 1 for 4 < x <= 17, 2 for 17 < x <= 49,
        3 for x > 49, and None when no range matches (negative or NaN input).
    """
    # NOTE(review): the cut points 4 / 17 / 49 are presumably taken from the
    # describe() output of `casual` - confirm before reusing them elsewhere.
    if 0 <= x <= 4:
        return 0
    if 4 < x <= 17:
        return 1
    if 17 < x <= 49:
        return 2
    if x > 49:
        return 3
    return None

In [None]:
# Derive the ordinal `casual_cat` column by bucketing every `casual` value.
df['casual_cat'] = df['casual'].map(casual_category)

In [None]:
# Get dummies: one indicator column per distinct value.
# (The 'sex' and 'embarked' columns originally have dtype object.)
sex_dummies = pd.get_dummies(df['sex'])
embarked_dummies = pd.get_dummies(df['embarked'])

# When the generated column names need renaming, keep the result of rename():
# it returns a new frame, so calling it without assignment changes nothing.
Ever_Married = pd.get_dummies(df['Ever_Married'], dtype=int).rename(
    columns={'Yes': "Married", "No": 'Not_Married'}
)
Ever_Married.head()

In [None]:
# Append the dummy columns to the original frame, then drop the source columns
# they were derived from.
df = (
    pd.concat([df, sex_dummies, embarked_dummies], axis=1)
    .drop(columns=['sex', 'embarked'])
)
df.columns

In [None]:
# Define new metrics (e.g., multiply columns together or create custom
# categories based on multiple variables).
# Collapse the `hd` codes into a binary flag: 0 stays 0, codes 1-4 become 1.
mapping = {0:0, 1:1, 2:1, 3:1, 4:1}
df['hd_cons'] = df['hd'].map(mapping)
df.groupby('hd_cons').count()

# Turn the numeric season codes into readable labels.
season_mapping = {1:'winter', 2:'spring', 3:'summer', 4:'fall'}
# replace() leaves values that are not keys of the mapping untouched, so
# re-running this cell on already-labelled data is a no-op. The previous
# map() call turned every label into NaN on a second run because the column
# is overwritten in place.
df['season'] = df['season'].replace(season_mapping)

# apply function
def good_bad(temp,hum):
    """Label a day from its temperature and humidity.

    Returns:
        'too hot'   when temp > 25 and hum > 70,
        'so so day' when temp <= 25 and 50 <= hum <= 70,
        'good day'  in every other case.
    """
    if temp > 25 and hum > 70:
        return 'too hot'
    mild_temperature = temp <= 25
    moderate_humidity = 50 <= hum <= 70
    return 'so so day' if mild_temperature and moderate_humidity else 'good day'
    
# Label every row with its day type, computed from that row's 'temp' and 'humidity'.
df['day_type'] = df[['temp', 'humidity']].apply(
    lambda row: good_bad(row['temp'], row['humidity']),
    axis=1,
)

### data model

In [None]:
# 6. Data model
# Once the variables are chosen, build the baseline inputs for the models.
feature_columns = [
    'age', 'anaemia', 'ejection_fraction',
    'high_blood_pressure', 'serum_creatinine', 'serum_sodium',
]
x = df[feature_columns]   # the predictor columns
y = df['DEATH_EVENT']     # the target value


In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Apply the StandardScaler to the numerical columns.
# `num` was referenced without ever being defined (NameError on a fresh
# kernel). It lists the continuous features - the ones examined with box
# plots earlier in the notebook.
# NOTE(review): 'anaemia' and 'high_blood_pressure' are treated as flags and
# passed through unscaled - confirm this matches the dataset.
num = ['age', 'ejection_fraction', 'serum_creatinine', 'serum_sodium']

# Create a ColumnTransformer: scale the numerical columns and let the other
# columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num)
    ],
    remainder='passthrough')

# Fit the transformer on the training data only. Any held-out data must be
# converted with `preprocessor.transform(...)`, never `fit_transform`.
X_train_transformed = preprocessor.fit_transform(x_train)

In [None]:
# Baseline classifiers, all with default hyperparameters.
knn = KNeighborsClassifier()
log = LogisticRegression()
nb = GaussianNB()
svc = SVC()
# The tree shuffles candidate features at each split, so it is seeded (same
# seed as the train/test split) to keep its scores reproducible between runs.
dtc = DecisionTreeClassifier(random_state=1234)

In [None]:
# Fit every baseline model on the same training split.
for model in (knn, log, nb, svc, dtc):
    model.fit(x_train, y_train)

In [None]:
# Predict the test split with each fitted baseline (same model order as the names).
pred_knn, pred_log, pred_nb, pred_sv, pred_dtc = (
    model.predict(x_test) for model in (knn, log, nb, svc, dtc)
)

In [None]:
# Print the test-set F1 score of each baseline, in the order:
# KNN, logistic regression, naive Bayes, SVC, decision tree.
for predictions in (pred_knn, pred_log, pred_nb, pred_sv, pred_dtc):
    print(f1_score(y_test, predictions))


In [None]:
# Cross-validation: if the 10 fold scores differ from one another by more
# than about 10%, the model is not stable enough.
cv_settings = {'cv': 10, 'scoring': 'f1'}
cv_scores_knn = cross_val_score(knn, x_train, y_train, **cv_settings)
cv_scores_nb = cross_val_score(nb, x_train, y_train, **cv_settings)

### find the best parameters

In [None]:
# Hyperparameter tuning
# Candidate parameter grids.
params_nb = {'var_smoothing': np.logspace(0,-9, num=100)}
params_knn = {'n_neighbors': list(range(1,200)), 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}

# Tip: narrow the n_neighbors range to speed things up - this grid holds
# 199 * 2 * 2 = 796 parameter combinations.

# Define fresh, unfitted models for the searches.
nb = GaussianNB()
knn = KNeighborsClassifier()

# Define the searches. Pick the scoring metric (recall vs. precision)
# according to what the problem being solved cares about.
gridsearch_knn = GridSearchCV(knn, params_knn, cv=10, scoring='recall')
# Naive Bayes equivalent: GridSearchCV(nb, params_nb, cv=5)
# The random search is seeded so that the 50 sampled combinations - and
# therefore the reported best parameters - are the same on every run.
randomsearch_knn = RandomizedSearchCV(knn, params_knn, cv=10, n_iter = 50, scoring='recall', random_state=1234)

# Fit models
gridsearch_knn.fit(x_train, y_train)
randomsearch_knn.fit(x_train, y_train)

# Get the best parameters
print("Best parameters for KNN - gridsearch: ", gridsearch_knn.best_params_)
print("Best parameters for KNN - randomsearch: ", randomsearch_knn.best_params_)

In [None]:
# `gridsearch_nb` was read below without ever being created, which raised a
# NameError. Run the Naive Bayes grid search here so its best parameters exist.
gridsearch_nb = GridSearchCV(nb, params_nb, cv=5)
gridsearch_nb.fit(x_train, y_train)

# Apply the best parameters
nb_best = GaussianNB(**gridsearch_nb.best_params_)
knn_best = KNeighborsClassifier(**gridsearch_knn.best_params_)

# Perform cross-validation on the training split
cv_scores_nb = cross_val_score(nb_best, x_train, y_train, cv=5, scoring='roc_auc')
cv_scores_knn = cross_val_score(knn_best, x_train, y_train, cv=10, scoring='precision')

print("Cross-validation scores for Naive Bayes: ", cv_scores_nb)
print("Cross-validation scores for KNN: ", cv_scores_knn)

# Fit the models with the best parameters
nb_best.fit(x_train, y_train)
knn_best.fit(x_train, y_train)

# Evaluate on the test set. The predictions must come from x_test: predicting
# on x_train and scoring against y_test compares arrays of different lengths.
y_pred_nb = nb_best.predict(x_test)
y_pred_knn = knn_best.predict(x_test)

print("Test accuracy for Naive Bayes: ", accuracy_score(y_test, y_pred_nb))
print("Test accuracy for KNN: ", accuracy_score(y_test, y_pred_knn))

In [None]:
# Compute and print the same set of test metrics for each tuned model.
for heading, predictions in (
    ("Naive Bayes Metrics: ", y_pred_nb),
    ("\nK-Nearest Neighbors Metrics: ", y_pred_knn),
):
    print(heading)
    print("Confusion Matrix: ", confusion_matrix(y_test, predictions))
    print("Accuracy: ", accuracy_score(y_test, predictions))
    print("Precision: ", precision_score(y_test, predictions))
    print("Recall: ", recall_score(y_test, predictions))
    print("F1 Score: ", f1_score(y_test, predictions))

In [None]:
# ROC curves on the held-out test split.
# Every model here was fitted on `x_train`, so probabilities are predicted on
# the matching `x_test` features (not on a separately transformed matrix).

# KNN Predicted Probabilities
knn_best_pred_prob = knn_best.predict_proba(x_test)
knn_best_fpr, knn_best_tpr, knn_thresholds = roc_curve(y_test, knn_best_pred_prob[:, 1])

# Logistic Regression Predicted Probabilities
log_pred_prob = log.predict_proba(x_test)
log_fpr, log_tpr, log_thresholds = roc_curve(y_test, log_pred_prob[:, 1])

# Decision Tree Predicted Probabilities
dtc_pred_prob = dtc.predict_proba(x_test)
dtc_fpr, dtc_tpr, dtc_thresholds = roc_curve(y_test, dtc_pred_prob[:, 1])

# Naive Bayes Predicted Probabilities
# `nb` is re-created (unfitted) in the hyperparameter-tuning cell, so fit it
# again before asking for probabilities; refitting an already fitted model is harmless.
nb.fit(x_train, y_train)
nb_pred_prob = nb.predict_proba(x_test)
# Use the Naive Bayes probabilities here (previously the decision-tree
# probabilities were passed, which drew the tree's curve twice).
nb_fpr, nb_tpr, nb_thresholds = roc_curve(y_test, nb_pred_prob[:, 1])

# Plotting
plt.figure(figsize=(7, 7))

# Plot ROC for each model
plt.plot(knn_best_fpr, knn_best_tpr, label='KNN_best')
plt.plot(log_fpr, log_tpr, label='Logistic Regression')
plt.plot(dtc_fpr, dtc_tpr, label='Decision Tree')
plt.plot(nb_fpr, nb_tpr, label='Naive Bayes')
# [Add plots for other models here]

# Plot line for random classifier
plt.plot([0, 1], [0, 1], 'k--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Multiple Models')
plt.legend()
plt.show()

### build the pipeline

In [None]:
# Build the preprocessing pipeline.
cat_columns = ['Pclass','Sex','Embarked']
num_columns = ['Age','SibSp','Parch','Fare']
target = 'Survived'

# Alternative transformer definitions, kept for reference only. They used to
# be live assignments that were overwritten two lines later (dead code):
#   cat_transformer = Pipeline(steps=[('passthrough', 'passthrough')])  # leave the categorical columns untouched
#   num_transformer = Pipeline(steps=[('scaler', StandardScaler())])    # scale the numeric columns inside a Pipeline

# Transformers actually used: one-hot encode the categorical columns
# (categories unseen during fit are ignored) and standardise the numeric ones.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()
# Output column order follows this list: encoded categorical columns first,
# then the scaled numeric columns.
preprocessor = ColumnTransformer(transformers = [('cat',cat_transformer, cat_columns),
                                                 ('num', num_transformer, num_columns)])

In [None]:
# Split the selected feature columns and the target into train/test sets (80/20, fixed seed).
features = df[cat_columns + num_columns]
labels = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [None]:
# Learn the encoding and scaling parameters from the training split only...
x_train_transformed = preprocessor.fit_transform(X_train)
# ...and reuse them for the test split. Calling fit_transform on the test data
# would re-fit the encoder/scaler on it, leaking test statistics and possibly
# producing a different set of one-hot columns than the training matrix has.
x_test_transformed = preprocessor.transform(X_test)

In [None]:
# Names of the one-hot columns produced by the fitted 'cat' transformer.
encoded_columns = list(preprocessor.named_transformers_['cat'].get_feature_names_out(cat_columns))
# The ColumnTransformer lists 'cat' before 'num', so its output holds the
# encoded columns first and the numeric columns last. The labels must follow
# that order, otherwise every column is given the wrong name.
all_feat = encoded_columns + num_columns

X_train_transformed = pd.DataFrame(x_train_transformed, columns=all_feat)
X_test_transformed = pd.DataFrame(x_test_transformed, columns=all_feat)

In [None]:
# Fit a KNN classifier (40 neighbours) on the transformed training features
# and score it on the transformed test features.
knn = KNeighborsClassifier(n_neighbors = 40)
knn.fit(X_train_transformed, y_train)
y_pred = knn.predict(X_test_transformed)
# scikit-learn metrics take (y_true, y_pred). F1 and accuracy give the same
# value either way for this binary target, but the conventional order matches
# the rest of the notebook and stays correct if precision/recall are added.
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))