# Mount Google Drive

In [None]:
# Colab-only: make Google Drive available under /content/drive so later cells
# can read files from it by path. Running this prompts for authorization.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as seabornInstance
from sklearn import metrics
from sklearn.linear_model import LogisticRegression  # import logistic regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from vega_datasets import data

%matplotlib inline

# Load dataset

In [None]:
# Read the CSV into a DataFrame.
# NOTE(review): 'FILE_PATH.csv' looks like a placeholder - point it at the
# real dataset location before running.
dataset = pd.read_csv(filepath_or_buffer='FILE_PATH.csv')

# Cell output: summary statistics of the loaded columns.
dataset.describe()

## Divide the data into attributes and labels

In [None]:
# Columns used as model inputs. Their order here is the column order of X.
feature_columns = [
    'head_x', 'head_y',
    'neck_x', 'neck_y',
    'body_x', 'body_y',
    'tail_x', 'tail_y',
    'leg_r_x', 'leg_r_y',
    'leg_l_x', 'leg_l_y',
    'width', 'length',
    'r_x', 'r_y',
]

# X: 2-D array with one row per sample and one column per feature.
X = dataset[feature_columns].values
# y: 1-D array with the value of the 'class' column for each sample.
y = dataset['class'].values

In [None]:
# Show how many samples belong to each class so any class imbalance is visible
# before training.
# countplot is used instead of the deprecated distplot: the labels are
# discrete, so one bar per class is the appropriate view (a histogram + KDE
# would smooth across values that do not exist).
class_fig, class_ax = plt.subplots(figsize=(10, 5))
seabornInstance.countplot(x=dataset['class'], ax=class_ax)
class_ax.set(title='Class distribution', xlabel='class', ylabel='count')
# tight_layout has to run after the artists exist; calling it on an empty
# figure has nothing to fit.
class_fig.tight_layout()

## Split the dataset into training and test sets

In [None]:
# Hold out 30% of the samples for testing and train on the remaining 70%.
# random_state is fixed so the partition is identical on every run.
# train_test_split comes from sklearn.model_selection (imported above).

print(f'X : {X.shape}, y : {y.shape}')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Report the shape of every partition: features first, then labels.
print(f'X_train: {X_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_train: {y_train.shape}')
print(f'y_test: {y_test.shape}')

In [None]:
# Standardize the features (zero mean, unit variance per column) and print the
# per-feature ranges before and after so the effect can be inspected.
print('Before Normalization: ')
print(f'Train: max {X_train.max(axis=0)}, min {X_train.min(axis=0)}')
print(f'Test: max {X_test.max(axis=0)}, min {X_test.min(axis=0)}')

# Per-feature statistics are taken from the training split only, so nothing
# about the test split influences the scaling.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)

# A feature that is constant over the training split has std == 0; dividing by
# it would produce NaN/inf. Divide by 1 instead, so such a feature is only
# centered.
train_std_safe = np.where(train_std == 0, 1.0, train_std)

# Apply the training statistics to both splits.
# The "nrom" spelling is kept on purpose: these names may be referenced by
# other cells.
X_train_nrom = (X_train - train_mean) / train_std_safe
X_test_nrom = (X_test - train_mean) / train_std_safe

print('*' * 20)
print('After Normalization: ')
print(f'Train: max {X_train_nrom.max(axis=0)}, min {X_train_nrom.min(axis=0)}')
print(f'Test: max {X_test_nrom.max(axis=0)}, min {X_test_nrom.min(axis=0)}')

# Model

In [None]:
# Build the classifier as a pipeline: standardize the features, then fit a
# logistic regression on them.
# Scaling lives inside the estimator so that fit() and every later predict()
# apply the same transformation: the scaler's mean/std are learned from
# X_train only, and callers keep passing the raw (unscaled) feature arrays.
# Without it the model would be fitted on unscaled features, which hurts both
# convergence and the effect of the regularization penalty.
regressor = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear', max_iter=1000),  # solver='liblinear' for one-versus-rest
)
regressor.fit(X_train, y_train)

# Training data accuracy

In [None]:
# Predict on the same samples the model was fitted on and report the fraction
# that is classified correctly.
y_pred = regressor.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, y_pred)
print("Accuracy:", train_accuracy)

# Put true and predicted labels side by side; the first 20 rows are the cell
# output.
df = pd.DataFrame(
    {
        'Actual': y_train,
        'Predicted': y_pred,
    }
)
df1 = df.head(20)
df1

# Test data accuracy

In [None]:
# Predict on the held-out samples and report the fraction that is classified
# correctly.
y_pred = regressor.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Put true and predicted labels side by side; the first 20 rows are the cell
# output. y_pred and df1 from this cell are reused by the cells below.
df = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
df1 = df.head(20)
df1

## Plot prediction versus actual 

In [None]:
# Grouped bar chart of the Actual and Predicted columns of df1, one group per
# row, as left by the preceding cell.
bar_ax = df1.plot.bar(figsize=(20, 8))

# Solid green lines on the major grid, dotted black lines on the minor grid.
bar_ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
bar_ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')

plt.show()

# Confusion matrix

In [None]:
class_names = [0, 1]  # name  of classes

# Pin the label order so the matrix is always 2x2 with row/column 0 = class 0
# and row/column 1 = class 1, even if one class is absent from y_test or
# y_pred.
cm = metrics.confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots()
# Draw on the explicit axes; heatmap sets its own tick positions, so no manual
# tick setup is needed beforehand.
seabornInstance.heatmap(pd.DataFrame(cm), annot=True, cmap='Blues', fmt='g', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
# NOTE(review): index 0 of the matrix is class 0, which is labelled
# 'Positive' here. If class 1 is the positive class in this dataset, the two
# labels should be swapped - confirm against the data definition.
ax.xaxis.set_ticklabels(['Positive', 'Negative'])
ax.yaxis.set_ticklabels(['Positive', 'Negative'])
# Run the layout last so it accounts for the title and axis labels.
fig.tight_layout()