In [1]:
# import necessary modules 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix, classification_report 

# load the data set
# The CSV location is machine-specific, so it can be overridden through the
# HANDPD_MEANDER_CSV environment variable; the original path is the fallback.
import os
DATA_PATH = os.environ.get('HANDPD_MEANDER_CSV', r'C:\Users\aishk\Desktop\Major\Meander_HandPD.csv')
data = pd.read_csv(DATA_PATH)

# print info about columns in the dataframe
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 16 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   _ID_EXAM                                         368 non-null    int64  
 1   IMAGE_NAME                                       368 non-null    object 
 2   ID_PATIENT                                       368 non-null    int64  
 3   CLASS_TYPE                                       368 non-null    int64  
 4   GENDER                                           368 non-null    object 
 5   RIGH/LEFT-HANDED                                 368 non-null    object 
 6   AGE                                              368 non-null    int64  
 7   RMS                                              368 non-null    float64
 8   MAX_BETWEEN_ST_HT                                368 non-null    float64
 9   MIN_BETWEEN_ST_HT               

In [2]:
# Model inputs are every column except the target; the target is CLASS_TYPE.
features = data.drop(columns='CLASS_TYPE')
labels = data['CLASS_TYPE']

In [3]:
# Keep only the numeric predictors: the target, the exam/patient identifiers,
# the image file name and the two text columns are all excluded.
# The frame is rebuilt from `data` rather than from the previous `features`,
# so re-running this cell does not raise KeyError on already-dropped columns.
features = data.drop(
    columns=['CLASS_TYPE', '_ID_EXAM', 'ID_PATIENT', 'IMAGE_NAME', 'GENDER', 'RIGH/LEFT-HANDED']
)

In [4]:
from sklearn.model_selection import train_test_split 

# Hold out 30% of the rows for testing (70:30 ratio); the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)

# Report the shape of each partition produced by the split.
for caption, part in (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
):
    print(caption, part.shape)

Number transactions X_train dataset:  (257, 10)
Number transactions y_train dataset:  (257,)
Number transactions X_test dataset:  (111, 10)
Number transactions y_test dataset:  (111,)


In [5]:
# Baseline: logistic regression fitted on the training split as-is
# (before any resampling).
# NOTE(review): the features go in unscaled although StandardScaler is
# imported at the top of the notebook — confirm whether scaling was intended.
lr = LogisticRegression()

# train the model on train set
# scikit-learn accepts the label Series directly; Series.ravel() is deprecated
# in recent pandas releases, so no conversion is performed here.
lr.fit(X_train, y_train)

predictions = lr.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           1       0.63      0.50      0.56        24
           2       0.87      0.92      0.89        87

    accuracy                           0.83       111
   macro avg       0.75      0.71      0.73       111
weighted avg       0.82      0.83      0.82       111



In [6]:
# Class balance of the training labels before resampling.
# The vectorised .sum() replaces the built-in sum(), which iterated the
# boolean Series element by element.
print("Before Undersampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before Undersampling, counts of label '2': {} \n".format((y_train == 2).sum()))

# apply near miss
from imblearn.under_sampling import NearMiss
nr = NearMiss()

# fit_resample is the current imbalanced-learn API; the older fit_sample alias
# is no longer available in newer releases.
# Labels are handed over as a NumPy array (Series.ravel() is deprecated), so
# y_train_miss comes back as an array as well.
X_train_miss, y_train_miss = nr.fit_resample(X_train, y_train.to_numpy())

print('After Undersampling, the shape of train_X: {}'.format(X_train_miss.shape))
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_miss.shape))

# Class balance after undersampling.
print("After Undersampling, counts of label '1': {}".format((y_train_miss == 1).sum()))
print("After Undersampling, counts of label '2': {}".format((y_train_miss == 2).sum()))


Before Undersampling, counts of label '1': 48
Before Undersampling, counts of label '2': 209 

After Undersampling, the shape of train_X: (96, 10)
After Undersampling, the shape of train_y: (96,) 

After Undersampling, counts of label '1': 48
After Undersampling, counts of label '2': 48


In [7]:
# Refit logistic regression, this time on the NearMiss-undersampled training
# data (fit() returns the estimator itself, so construction and fitting chain).
lr2 = LogisticRegression().fit(X_train_miss, y_train_miss.ravel())

# Predict on the test split, which was not resampled.
predictions = lr2.predict(X_test)

# Per-class precision / recall / F1 for the undersampled model.
balanced_report = classification_report(y_test, predictions)
print(balanced_report)


              precision    recall  f1-score   support

           1       0.44      0.71      0.54        24
           2       0.90      0.75      0.82        87

    accuracy                           0.74       111
   macro avg       0.67      0.73      0.68       111
weighted avg       0.80      0.74      0.76       111



In [8]:
from xgboost import XGBClassifier
# Gradient-boosted trees with default hyper-parameters, trained on the same
# undersampled training set as lr2.
# NOTE(review): the labels here are 1/2; newer xgboost releases appear to
# require class labels starting at 0 — confirm against the installed version.
xbg_model = XGBClassifier()
xbg_model.fit(X=X_train_miss, y=y_train_miss.ravel())





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [9]:
# Predicted class labels from the XGBoost model for the held-out test features.
y_pred = xbg_model.predict(X_test)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the XGBoost predictions: by scikit-learn convention,
# rows are the true classes and columns the predicted classes.
xgb_confusion = confusion_matrix(y_test, y_pred)
print(xgb_confusion)

[[21  3]
 [18 69]]


In [11]:
# Overall accuracy of the XGBoost predictions, expressed as a percentage.
xgb_accuracy_pct = 100 * accuracy_score(y_test, y_pred)
print(xgb_accuracy_pct)

81.08108108108108
