# About the competition

- Using binary classification to predict a patient's smoking status given information about various other health indicators. 

- Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.



In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
df = pd.read_csv("train.csv")
df.head().T

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
df.isnull().sum()

### No missing values 

In [None]:
# Flag every row that repeats an earlier row, then count the flagged rows.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

### No duplicated values 

# Data preprocessing 

In [None]:
df.columns = df.columns.str.lower().str.replace(" ","_")
df.head()

In [None]:
df.drop("id",axis = 1, inplace = True)

In [None]:
df['height(cm)'] = df['height(cm)'] / 100
df['imc'] = df['weight(kg)'] / (df['height(cm)'] * df['height(cm)'])
df.head()

# EDA

## univariate analysis 

In [None]:
sns.countplot(df, x = "smoking")

In [None]:
cont_var  = []
disct_var = []

for col in df.columns: 
    if len(df[col].unique())>10: 
        cont_var.append(col)
    else:
        disct_var.append(col)
print(f'there is {len(cont_var)} continues variables')
print(f'there is {len(disct_var)} discrete variables')

In [None]:
df[cont_var].hist(bins = 40, figsize = (20,20))

In [None]:
# One count plot per low-cardinality column on a 2x3 grid, filled row by row.
fig, axs = plt.subplots(nrows = 2, ncols = 3, figsize = (20,10))
panels = axs.ravel()

for position, var_name in enumerate(disct_var):
    panel = panels[position]
    sns.countplot(data = df, x = var_name, ax = panel)
    panel.set_title(f"Counts of {var_name}")
    

# Target vs. continuous variables

In [None]:
cont_var_for_corr = cont_var+["smoking"]
df_num = df[cont_var_for_corr]
df_num.head()

In [None]:
corr = df_num.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation matrix')
plt.show()

In [None]:
# Recompute the continuous/discrete split (more than 10 distinct values
# counts as continuous) on the current columns of df.
cardinality = df.nunique(dropna = False)
is_continuous = cardinality > 10
cont_var = cardinality[is_continuous].index.tolist()
disct_var = cardinality[~is_continuous].index.tolist()
print(f'there is {len(cont_var)} continues variables')
# One is subtracted from the discrete count before printing.
print(f'there is {len(disct_var)-1} discrete variables')

In [None]:
# Box plot of each continuous column grouped by smoking status, on a 4x5 grid.
fig, axs = plt.subplots(nrows = 4, ncols = 5, figsize = (20,20))

for position, var_name in enumerate(cont_var):
    grid_row, grid_col = divmod(position, 5)
    panel = axs[grid_row, grid_col]
    sns.boxplot(data = df, x = "smoking", y = var_name, ax = panel)
    panel.set_title(f"smooking vs {var_name}")

# Target vs. discrete variables

In [None]:
# Count of smokers / non-smokers split by each discrete column, on a 2x3 grid.
fig, axs = plt.subplots(nrows = 2, ncols = 3, figsize = (15,10))

for position, var_name in enumerate(disct_var):
    grid_row, grid_col = divmod(position, 3)
    panel = axs[grid_row, grid_col]
    sns.countplot(data = df, x = "smoking", hue = var_name, ax = panel)
    panel.set_title(f"smooking vs {var_name}")

# Feature engineering

In [None]:
df_num["total_lipid"] = df_num["cholesterol"]+df_num["triglyceride"]+df_num["hdl"]+df_num["ldl"]

In [None]:
df_num.head()

In [None]:
# Redraw the correlation heatmap for the current columns of df_num.
corr = df_num.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation matrix')
plt.show()

In [None]:
# Add the summed lipid feature to the main frame, then show summary statistics.
df["total_lipid"] = (
    df["cholesterol"]
    .add(df["triglyceride"])
    .add(df["hdl"])
    .add(df["ldl"])
)
df.describe().transpose()

# train-test split 

In [None]:
from sklearn.model_selection import train_test_split


# Separate the features from the target.
X = df.drop("smoking", axis =1 )
y = df["smoking"]

# Hold out 20% of the rows for evaluation.
# random_state makes the split (and every score computed from it) reproducible
# across runs; stratify keeps the smoker / non-smoker ratio identical in both
# splits.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, PowerTransformer

# Split the training features by cardinality: more than 10 distinct values
# counts as continuous, everything else as discrete.
cardinality = Xtrain.nunique(dropna = False)
cont_var = cardinality[cardinality > 10].index.tolist()
disct_var = cardinality[cardinality <= 10].index.tolist()

# Min-max scale the continuous columns; all other columns pass through unchanged.
cont_trans = ColumnTransformer(
    transformers = [("continu_transform", MinMaxScaler(), cont_var)],
    remainder = 'passthrough',
)

In [None]:
X_temp = cont_trans.fit_transform(Xtrain)

In [None]:
col = cont_var+disct_var
Xtrain_prepared = pd.DataFrame(X_temp, columns= col, index = Xtrain.index)
Xtrain_prepared.head()

# Build the model 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb
from xgboost import XGBClassifier

# Candidate classifiers compared in the evaluation cells below.
log_clf = LogisticRegression()
svm_clf = LinearSVC()
rndfrt_clf = RandomForestClassifier(
    n_estimators = 400,
    max_depth = 10,
    random_state = 0,
)
xgb_clf = XGBClassifier(n_estimators = 100)

In [None]:
# Apply the scaler that was fitted on the training split. Calling
# fit_transform here would refit the min/max on the hold-out data, so the
# train and test features would be scaled differently and the evaluation
# would no longer reflect how the model sees unseen data.
X_test_temp = cont_trans.transform(Xtest)
X_test_prepared = pd.DataFrame(X_test_temp,columns = col, index = Xtest.index)
X_test_prepared.head()

In [None]:
from sklearn.metrics import accuracy_score,  roc_auc_score, average_precision_score

models = [log_clf,rndfrt_clf,svm_clf,xgb_clf]

metrics = {
    'Accuracy': accuracy_score,
    'AUC-ROC': roc_auc_score,
    'AUC-PR': average_precision_score
}
# Ranking metrics need continuous scores: computed from hard 0/1 labels they
# collapse to a single operating point and understate the model. Only the
# accuracy is evaluated on predicted labels.
score_based_metrics = {'AUC-ROC', 'AUC-PR'}

for model in models:
    # Fit on the prepared training split, then predict the hold-out split.
    model.fit(Xtrain_prepared,ytrain)
    yhat = model.predict(X_test_prepared)

    # Positive-class probability when available, otherwise the signed
    # distance to the decision boundary (e.g. LinearSVC).
    if hasattr(model, "predict_proba"):
        yscore = model.predict_proba(X_test_prepared)[:, 1]
    else:
        yscore = model.decision_function(X_test_prepared)

    print(f"Evaluation des performances du modèle {type(model).__name__}:")
    for metric_name, metric_func in metrics.items():
        y_eval = yscore if metric_name in score_based_metrics else yhat
        score = metric_func(ytest, y_eval)
        print(f"{metric_name}: {score:.2f}")
    print()

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, auc


classifier_names = ["Logistic Regression", "Random Forest", "SVM", "XGBoost"]
classifiers = [log_clf, rndfrt_clf, svm_clf, xgb_clf]

# Draw one ROC curve per fitted classifier on the hold-out split.
for name, clf in zip(classifier_names, classifiers):
    # Use the positive-class probability when the model exposes it; fall back
    # to decision_function only for models without predict_proba. Without the
    # else branch decision_function ran unconditionally, which fails for
    # RandomForestClassifier and would reuse stale scores for LinearSVC.
    if hasattr(clf, "predict_proba"):
        y_scores = clf.predict_proba(X_test_prepared)[:, 1]
    else:
        y_scores = clf.decision_function(X_test_prepared)
    fpr, tpr, _ = roc_curve(ytest, y_scores)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('(FPR)')
plt.ylabel('(TPR)')
plt.title('  ROC curve ')
plt.legend(loc='lower right')
plt.show()

# Make submission 

In [None]:
test = pd.read_csv("test.csv")

In [None]:
test.columns = test.columns.str.lower().str.replace(" ","_")
test.drop("id",axis = 1, inplace = True)
test['height(cm)'] = test['height(cm)'] / 100
test['imc'] = test['weight(kg)'] / (test['height(cm)'] * test['height(cm)'])
test.head()

In [None]:
test["total_lipid"] = test["cholesterol"]+test["triglyceride"]+test["hdl"]+test["ldl"]

In [None]:
test_temp = cont_trans.fit_transform(test)

In [None]:
test.shape

In [None]:
y_score = rndfrt_clf.predict_proba(test_temp)

In [None]:
y_score

In [None]:
predictions = pd.DataFrame({'id':test["id"],'smoking':y_score[:,1]})

In [None]:
predictions.to_csv('submission.csv', index=False)