# Q1 - Supervised Outlier Detection

## 1. Import Libraries

In [9]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

## 2. Read Data

In [10]:
train_df = pd.DataFrame()
folder_path = '../Data_Q1/train'

for i in range(20):  
    file_name = f'data_{i}.csv'
    file_path = os.path.join(folder_path, file_name)

    # 读取CSV文件并合并到all_data
    data = pd.read_csv(file_path)
    train_df = pd.concat([train_df, data])
    print('Successfully read train file: ', file_name)

test_df = pd.read_csv('../Data_Q1/test/test_set.csv')

Successfully read train file:  data_0.csv
Successfully read train file:  data_1.csv
Successfully read train file:  data_2.csv
Successfully read train file:  data_3.csv
Successfully read train file:  data_4.csv
Successfully read train file:  data_5.csv
Successfully read train file:  data_6.csv
Successfully read train file:  data_7.csv
Successfully read train file:  data_8.csv
Successfully read train file:  data_9.csv
Successfully read train file:  data_10.csv
Successfully read train file:  data_11.csv
Successfully read train file:  data_12.csv
Successfully read train file:  data_13.csv
Successfully read train file:  data_14.csv
Successfully read train file:  data_15.csv
Successfully read train file:  data_16.csv
Successfully read train file:  data_17.csv
Successfully read train file:  data_18.csv
Successfully read train file:  data_19.csv


In [11]:
print(train_df.shape)
train_df.head()

(134229, 8)


Unnamed: 0,x,y,z,a,b,c,d,Is_Falling
0,18.49586,13.766527,14.362624,0,0,0,1,0
1,18.501072,13.827225,14.270268,0,0,1,0,0
2,18.40595,13.868976,14.094804,1,0,0,0,0
3,18.444572,13.910701,14.116078,0,1,0,0,0
4,18.41847,13.933917,14.320566,0,0,0,1,0


In [12]:
print(test_df.shape)
test_df.head()

(6623, 9)


Unnamed: 0,ID,x,y,z,a,b,c,d,Is_Falling
0,1,6.912997,11.518698,15.471855,0,0,0,1,0
1,2,6.936432,11.574586,15.446939,0,0,1,0,0
2,3,6.935274,11.57179,15.437505,1,0,0,0,0
3,4,6.886688,11.561593,15.704019,0,0,0,1,0
4,5,6.921823,11.597728,15.634435,0,0,1,0,0


## 3. Data Preprocessing

In [13]:
# Dropping the target variable 'Is_Falling' to separate features (X) from the target (y)
X_train = train_df.drop('Is_Falling', axis=1)  
y_train = train_df['Is_Falling'] 

# Standardizing the features: it's a common practice to scale features 
# so that all of them can be uniformly evaluated
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  

# Using SMOTE (Synthetic Minority Over-sampling Technique) for handling class imbalance
# This helps to generate synthetic samples for the minority class to balance the dataset
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X_train_scaled, y_train)  

# Preparing the test data
# Dropping the 'ID' and 'Is_Falling' columns as we need to predict 'Is_Falling' and 'ID' is not a feature
X_test = test_df.drop(['ID', 'Is_Falling'], axis=1)
y_test = test_df['Is_Falling'] 

# Scaling the test data with the same scaler used for the training data
X_test_scaled = scaler.transform(X_test) 

## 4. Model Training and Evaluation

### 4.1 Random Forest Classifier

In [14]:
# Train Random Forest Classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_sm, y_sm)

# Predict and evaluate for Random Forest Classifier
rf_predictions = rf_model.predict(X_test_scaled)
print("Random Forest Classifier Evaluation")
print(classification_report(y_test, rf_predictions))

Random Forest Classifier Evaluation
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      6280
           1       0.25      0.26      0.25       343

    accuracy                           0.92      6623
   macro avg       0.61      0.61      0.61      6623
weighted avg       0.92      0.92      0.92      6623


### 4.2 LightGBM Classifier

In [16]:
# Train LightGBM Classifier
lgbm_model = lgb.LGBMClassifier()
lgbm_model.fit(X_sm, y_sm)

# Predict and evaluate for LightGBM Classifier
lgbm_predictions = lgbm_model.predict(X_test_scaled)
print("LightGBM Classifier Evaluation")
print(classification_report(y_test, lgbm_predictions))

[LightGBM] [Info] Number of positive: 127656, number of negative: 127656
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 777
[LightGBM] [Info] Number of data points in the train set: 255312, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Classifier Evaluation
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      6280
           1       0.25      0.52      0.33       343

    accuracy                           0.89      6623
   macro avg       0.61      0.72      0.64      6623
weighted avg       0.93      0.89      0.91      6623


## 5. Output

Since Higher Recall for the Minority Class: LightGBM showed a recall rate of 0.52 for the falling class, which is more than double that of the RandomForest's recall rate of 0.26. In applications where detecting every possible incident is critical, such as fall detection in cats, a higher recall rate is immensely valuable. This means LightGBM is more effective in identifying true falling events.

So, I will use LightGBM to predict the test data.

In [1]:
# Output the predictions to a CSV file
output_df = pd.DataFrame({'ID': test_df['ID'], 'Is_Falling': lgbm_predictions})
output_df.to_csv('../Q1_output.csv', index=False)