## 1. Load Data

In [None]:
import pandas as pd

# Load the room-occupancy CSV into a DataFrame and preview its first ten rows.
csv_path = '../input/room-occupancy/file.csv'
df = pd.read_csv(csv_path)
df.head(n=10)

`*` By checking the shape of the data, we can obtain the number of rows and columns.

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

`*` We also want to know whether there are any null values in our data, and the `Dtype` of each column.

In [None]:
# Print the per-column non-null counts and dtypes
df.info()

`*` Checking the summary statistics of the numerical columns is also useful:  
__Light__ and __CO2__: their `mean` and `std` (standard deviation) are relatively small, yet both have a large `max` (maximum) value, which indicates that these two columns contain _outliers_.

In [None]:
# Summary statistics of the numerical columns
df.describe()

## 2. Data Visualizations
Line Plots and Box Plots.

In [None]:
import matplotlib.pyplot as plt
import numpy as np


def occupancy_plot(df, cat):
    """Draw a line plot and a box plot of one column, split by occupancy.

    Parameters
    ----------
    df : DataFrame that has an ``Occupancy`` column (compared against 1 and 0)
        and the column named by ``cat``.
    cat : name of the column to plot; also used as the figure title.

    The left panel plots the column's values separately for occupied and
    vacant rows; the right panel shows box plots for the occupied rows, the
    vacant rows and the whole column.
    """
    values = df[cat]
    occupied_mask = df.Occupancy == 1
    vacant_mask = df.Occupancy == 0

    fig, axes = plt.subplots(1, 2, figsize=(14,4))
    line_ax, box_ax = axes
    fig.suptitle(cat)

    # Rows outside a mask become None, so each line is only drawn where its mask holds
    line_ax.plot(np.where(occupied_mask, values, None), label='Occupied')
    line_ax.plot(np.where(vacant_mask, values, None), label='Vacant', ls='--')
    line_ax.grid()
    line_ax.legend()

    box_groups = [values[occupied_mask], values[vacant_mask], values]
    box_ax.boxplot(box_groups)
    box_ax.set_xticklabels(['Occupancy', 'Vacant', 'Overall'])


# One figure per column at positions 1 to 5 of the loaded frame
for column_name in df.columns[1:6]:
    occupancy_plot(df, column_name)

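`*` Remove _outliers_ column by column: for __Temperature__, __Light__ and __CO2__ in turn, drop the rows whose value lies more than 3 standard deviations from the column mean.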

In [None]:
def _drop_outliers(frame, column, n_std=3):
    """Return the rows of `frame` whose `column` value lies within `n_std`
    standard deviations of that column's mean (both computed on `frame`)."""
    values = frame[column]
    return frame[np.abs(values - values.mean()) <= n_std * values.std()]


# Filter one column at a time; every step takes the mean AND the std from the
# frame it is filtering (the CO2 step previously mixed the mean of one frame
# with the std of an earlier one).
# The raw `df` is no longer overwritten, so this cell and the cells above it
# give the same result when re-run. The cleaned data ends up in `df2`.
df_temp_filtered = _drop_outliers(df, 'Temperature')
print("1. Removing the Outliers on 'Temperature' has reduced the data size from {} to {}.".format(len(df), len(df_temp_filtered)))
print("\n")
df_light_filtered = _drop_outliers(df_temp_filtered, 'Light')
print("2. Removing the Outliers on 'Light' has reduced the data size from {} to {}.".format(len(df_temp_filtered), len(df_light_filtered)))
print("\n")
df2 = _drop_outliers(df_light_filtered, 'CO2')
print("3. Removing the Outliers on 'CO2' has reduced the data size from {} to {}.".format(len(df_light_filtered), len(df2)))
print("\n")

`*` Let's review the data after the _outliers_ are removed.

In [None]:
# Summary statistics of the numerical columns after outlier removal
df2.describe()

In [None]:
# Same per-column figures as before, now drawn from the outlier-free frame
for column_name in df2.columns[1:6]:
    occupancy_plot(df2, column_name)

`*` Correlation Heatmap:  
__Humidity Ratio__ and __Humidity__ are strongly and positively correlated.  
__Light__ is a powerful indicator for __Occupancy__.

In [None]:
import seaborn as sns

# Compute the correlation matrix once and reuse it for both the mask and the plot.
# Only numeric/bool columns are correlated: DataFrame.corr() raises on pandas >= 2.0
# when the frame holds a non-numeric column. NOTE(review): the first column is
# skipped everywhere else in this notebook, so it is presumably a timestamp - confirm.
corr = df2.select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (diagonal included) so each pair of columns appears once
mask = np.triu(np.ones_like(corr))
plt.figure(figsize = (15,8))
sns.heatmap(corr, annot=True, fmt="1.2f", mask=mask, cmap="YlGnBu")
plt.yticks(rotation=0)
plt.show()

## 3. Predicting Occupancy Using Classification Algorithms
The most common classification algorithms include:  
__I. Logistic Regression__  
__II. K Nearest Neighbors (KNN)__  
__III. Support Vector Machine (SVM)__  
__IV. Decision Tree__  
__V. Random Forest__  
__VI. Naive Bayes__  
__VII. Gradient Boosting__

`*` Split the data into training and testing sets, fit each model on the training set, and evaluate it on the testing set.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Features: every column except the first and the last; target: the last column
X = df2.iloc[:, 1:-1]
Y = df2.iloc[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# Summary table: one row per classifier, holding its confusion-matrix counts and accuracy.
# Rows are filled positionally by the model cells, so only the displayed header changes
# with the spelling fix ('False Postive' -> 'False Positive').
result = pd.DataFrame(columns=['Classifier', 'True Negative', 'False Positive',
                               'False Negative', 'True Positive', 'Classifier Accuracy'])


def accuracy_vis(xtest, ytest, ypred, predit_proba, label='ROC curve'):
    """Plot a confusion-matrix heatmap and a ROC curve for one binary classifier.

    Parameters
    ----------
    xtest : unused; kept so that existing calls keep working.
    ytest : true labels of the test set.
    ypred : hard class predictions for the test set.
    predit_proba : 2-D array of class probabilities; column 1 is used as the
        score of the positive class.
    label : str, optional
        Legend text for the ROC curve (e.g. the classifier's name).

    Returns
    -------
    The flattened confusion matrix of ``ytest`` vs ``ypred``. The callers
    unpack it as ``tn, fp, fn, tp``, which assumes two classes with the
    negative class sorting first (0 = vacant, 1 = occupied here).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,4))

    # Confusion matrix: sklearn puts the ACTUAL class on the rows and the
    # PREDICTED class on the columns, classes in sorted order (negative first).
    cm = confusion_matrix(ytest, ypred)
    x_axis_labels = ['Predicted Negative', 'Predicted Positive']
    y_axis_labels = ['Actual Negative', 'Actual Positive']
    sns.heatmap(cm, fmt=".0f", annot=True, linewidths=.5, ax=ax1,
                cmap="YlGnBu", xticklabels=x_axis_labels)
    ax1.set_yticklabels(y_axis_labels, rotation=0, ha='right')

    # ROC curve: the area is computed from the same probability scores that
    # draw the curve, so the number in the legend matches what is plotted.
    positive_scores = predit_proba[:, 1]
    roc_auc = roc_auc_score(ytest, positive_scores)
    fpr, tpr, _ = roc_curve(ytest, positive_scores)
    ax2.plot(fpr, tpr, label='{} (area = {})'.format(label, round(roc_auc, 6)))
    ax2.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
    ax2.set_xlim([0.0, 1.0])
    ax2.set_ylim([0.0, 1.05])
    ax2.set_xlabel('False Positive Rate')
    ax2.set_ylabel('True Positive Rate')
    ax2.legend()
    plt.show()

    # Use the matrix built from the arguments, not the notebook globals
    return cm.ravel()

__I. Logistic Regression__

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the training split
lr = LogisticRegression().fit(X_train, Y_train)

# Score it on the held-out split: hard predictions, accuracy, class probabilities
Y_pred = lr.predict(X_test)
lr_score = lr.score(X_test, Y_test)
predit_proba = lr.predict_proba(X_test)
print('Accuracy of Logistic Regression Classifier on test set: {:.6f}%'.format(lr_score*100))

# Plot the confusion matrix and ROC curve, then record the counts in the summary table
tn, fp, fn, tp = accuracy_vis(X_test, Y_test, Y_pred, predit_proba)
result.loc['LR'] = ['Logistic Regression', tn, fp, fn, tp, round(lr_score*100, 6)]

__II. K Nearest Neighbors (KNN)__  

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit KNN with k set to the integer part of sqrt(number of training rows)
knn = KNeighborsClassifier(n_neighbors=int(np.sqrt(len(X_train)))).fit(X_train, Y_train)

# Score it on the held-out split: hard predictions, accuracy, class probabilities
Y_pred = knn.predict(X_test)
knn_score = knn.score(X_test, Y_test)
predit_proba = knn.predict_proba(X_test)
print('Accuracy of K Nearest Neighbors Classifier on test set: {:.6f}%'.format(knn_score*100))

# Plot the confusion matrix and ROC curve, then record the counts in the summary table
tn, fp, fn, tp = accuracy_vis(X_test, Y_test, Y_pred, predit_proba)
result.loc['KNN'] = ['K Nearest Neighbors', tn, fp, fn, tp, round(knn_score*100, 6)]

__III. Support Vector Machine (SVM)__ 

In [None]:
from sklearn.svm import SVC

# Fixed seed (same value as the train/test split): with probability=True, SVC
# calibrates its probabilities through an internally shuffled cross-validation,
# so predict_proba - and the ROC curve built from it - would otherwise change
# from run to run.
svm = SVC(probability=True, random_state=0)
svm.fit(X_train, Y_train)

# Score it on the held-out split: hard predictions, accuracy, class probabilities
Y_pred = svm.predict(X_test)
svm_score = svm.score(X_test, Y_test)
predit_proba = svm.predict_proba(X_test)
print('Accuracy of Support Vector Machine Classifier on test set: {:.6f}%'.format(svm_score*100))

# Plot the confusion matrix and ROC curve, then record the counts in the summary table
tn, fp, fn, tp = accuracy_vis(X_test, Y_test, Y_pred, predit_proba)
result.loc['SVM'] = ['Support Vector Machine', tn, fp, fn, tp, round(svm_score*100, 6)]

__IV. Decision Tree__ 

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed (same value as the train/test split) so that the fitted tree, and
# therefore the reported accuracy, is reproducible across runs.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, Y_train)

# Score it on the held-out split: hard predictions, accuracy, class probabilities
Y_pred = dt.predict(X_test)
dt_score = dt.score(X_test, Y_test)
predit_proba = dt.predict_proba(X_test)
print('Accuracy of Decision Tree Classifier on test set: {:.6f}%'.format(dt_score*100))

# Plot the confusion matrix and ROC curve, then record the counts in the summary table
tn, fp, fn, tp = accuracy_vis(X_test, Y_test, Y_pred, predit_proba)
result.loc['DT'] = ['Decision Tree', tn, fp, fn, tp, round(dt_score*100, 6)]

__V. Random Forest__  

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the train/test split): a random forest is built from
# random bootstrap samples and feature subsets, so without it the reported
# accuracy changes from run to run.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, Y_train)

# Score it on the held-out split: hard predictions, accuracy, class probabilities
Y_pred = rf.predict(X_test)
rf_score = rf.score(X_test, Y_test)
predit_proba = rf.predict_proba(X_test)
print('Accuracy of Random Forest Classifier on test set: {:.6f}%'.format(rf_score*100))

# Plot the confusion matrix and ROC curve, then record the counts in the summary table
tn, fp, fn, tp = accuracy_vis(X_test, Y_test, Y_pred, predit_proba)
result.loc['RF'] = ['Random Forest', tn, fp, fn, tp, round(rf_score*100, 6)]

__VI. Naive Bayes__

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes model on the training split
nb = GaussianNB().fit(X_train, Y_train)

# Score it on the held-out split: hard predictions, accuracy, class probabilities
Y_pred = nb.predict(X_test)
nb_score = nb.score(X_test, Y_test)
predit_proba = nb.predict_proba(X_test)
print('Accuracy of Naive Bayes Classifier on test set: {:.6f}%'.format(nb_score*100))

# Plot the confusion matrix and ROC curve, then record the counts in the summary table
tn, fp, fn, tp = accuracy_vis(X_test, Y_test, Y_pred, predit_proba)
result.loc['NB'] = ['Naive Bayes', tn, fp, fn, tp, round(nb_score*100, 6)]

__VII. Gradient Boosting__

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fixed seed (same value as the train/test split) so that the fitted ensemble,
# and therefore the reported accuracy, is reproducible across runs.
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, Y_train)

# Score it on the held-out split: hard predictions, accuracy, class probabilities
Y_pred = gb.predict(X_test)
gb_score = gb.score(X_test, Y_test)
predit_proba = gb.predict_proba(X_test)
# Spelling fixed in the printed message and the table entry ('Gradent' -> 'Gradient')
print('Accuracy of Gradient Boosting Classifier on test set: {:.6f}%'.format(gb_score*100))

# Plot the confusion matrix and ROC curve, then record the counts in the summary table
tn, fp, fn, tp = accuracy_vis(X_test, Y_test, Y_pred, predit_proba)
result.loc['GB'] = ['Gradient Boosting', tn, fp, fn, tp, round(gb_score*100, 6)]

## 4. Result

In [None]:
# Rank the classifiers from most to least accurate
result.sort_values(by='Classifier Accuracy', ascending=False)