# Importing libraries

In [None]:
import io
import sys
PATH = '/Data'
DIR_DATA = '../Data/'
sys.path.append(PATH) if PATH not in list(sys.path) else None
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, plot_confusion_matrix,confusion_matrix,cohen_kappa_score,mean_squared_error
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.preprocessing import label_binarize


import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

# Load dataset

In [None]:
# The file is read with header=None, so the column names are supplied here.
# Fields are ';'-separated and use ',' as the decimal mark.
# Status codes:
#   1 = intact children (control group)
#   2 = children with the spastic diplegia form of cerebral palsy
filename = f'{DIR_DATA}DATA FOR CEREBRAL PALSY.csv'
data = pd.read_csv(
    filename,
    names=[
        'Logitud de zancada (m)',
        'Cadencia (paso/min)',
        'Longitud de la pierna (m)',
        'Edad (años)',
        'Status',
    ],
    header=None,
    sep=';',
    decimal=',',
)
data

### Data Set Information:
The dataset contains cases from a study conducted in the Motion Analysis Laboratory at the University of Virginia. The subjects were eighty-eight children with the spastic diplegia form of cerebral palsy (ranging from 2 to 20 years, with a mean of 9.9 years) and a neurologically intact control group of 68 children (ranging from 2 to 13 years, with a mean of 7.1 years) with no history of motor pathology. Each child performed at least three walking trials at a freely selected and comfortable walking speed.

### Attribute Information:

1. Stride length (numerical)
2. Cadence (numerical)
3. Leg length (numerical)
4. Age (numerical)
5. Health status (class attribute)
    1 = Neurologically intact child, 
    2 = Child with the spastic diplegia form of cerebral palsy

# describe Dataset

In [None]:
# Summary statistics of the columns, followed by a separator line.
print(data.describe())
print('*' * 65)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
data.info()

In [None]:
# Number of rows for each distinct Status value.
status_column = data['Status']
status_column.value_counts()

In [None]:
# Per-Status non-null counts of every column; as_index=False keeps Status as a
# regular column so it can be used as the x values of the bar chart.
status_groups = data.groupby(by="Status", as_index=False)
cs = status_groups.count()
cs

In [None]:
# Bar chart of the per-group counts held in `cs` (x: Status value,
# y: non-null count of the stride-length column in that group).
plt.figure(figsize=(10, 5))
status_values = cs['Status'].values
group_counts = cs['Logitud de zancada (m)'].values
sns.barplot(x=status_values, y=group_counts, alpha=0.8)
plt.xlabel('Status', fontsize=10)
plt.ylabel('Frecuency', fontsize=10)
plt.title('Status Frequency')
plt.show()

In [None]:
# Per-age non-null counts of every column; as_index=False keeps the age as a
# regular column so it can be used as the x values of the bar chart.
age_groups = data.groupby(by="Edad (años)", as_index=False)
cs = age_groups.count()
cs

In [None]:
# Bar chart of the per-group counts held in `cs` (x: age,
# y: non-null count of the stride-length column in that group).
plt.figure(figsize=(10, 5))
age_values = cs['Edad (años)'].values
group_counts = cs['Logitud de zancada (m)'].values
sns.barplot(x=age_values, y=group_counts, alpha=0.8)
plt.xlabel('Ages', fontsize=10)
plt.ylabel('Frecuency', fontsize=10)
plt.title('Age Frequency')
plt.show()

# Distribution plot

In [None]:
# Distribution of stride length, one overlaid curve per Status value.
grid = sns.FacetGrid(data, hue='Status', height=5)
grid.map(sns.distplot, 'Logitud de zancada (m)')
# The trailing semicolon keeps the notebook from echoing the grid object.
grid.add_legend();

In [None]:
# Distribution of cadence, one overlaid curve per Status value.
grid = sns.FacetGrid(data, hue='Status', height=5)
grid.map(sns.distplot, 'Cadencia (paso/min)')
# The trailing semicolon keeps the notebook from echoing the grid object.
grid.add_legend();

In [None]:
# Distribution of leg length, one overlaid curve per Status value.
grid = sns.FacetGrid(data, hue='Status', height=5)
grid.map(sns.distplot, 'Longitud de la pierna (m)')
# The trailing semicolon keeps the notebook from echoing the grid object.
grid.add_legend();

In [None]:
# Distribution of age, one overlaid curve per Status value.
grid = sns.FacetGrid(data, hue='Status', height=5)
grid.map(sns.distplot, 'Edad (años)')
# The trailing semicolon keeps the notebook from echoing the grid object.
grid.add_legend();

In [None]:
# Empirical PDF and CDF of age, built from a 10-bin density histogram.
counts, bind_edges = np.histogram(data['Edad (años)'], bins=10, density=True)
total = sum(counts)
print(counts, total)

# Normalise the bin heights so they sum to 1, then accumulate them for the CDF.
pf = counts / total
print('pf=', pf)
print('bind_edges', bind_edges)
cdf = np.cumsum(pf)

# Both curves are drawn against the upper edge of each bin.
upper_edges = bind_edges[1:]
plt.xlabel('Edad (años)')
plt.plot(upper_edges, pf)
plt.plot(upper_edges, cdf)

# Bivariate analysis

In [None]:
# Age distribution per Status value.
sns.boxplot(data=data, x='Status', y='Edad (años)')
plt.show()

In [None]:
# Stride-length distribution per Status value.
sns.boxplot(data=data, x='Status', y='Logitud de zancada (m)')
plt.show()

In [None]:
# Cadence distribution per Status value.
# `size` is not one of violinplot's documented parameters: depending on the
# seaborn version the stray keyword is silently ignored or rejected, so it is
# not passed.
sns.violinplot(x='Status', y='Cadencia (paso/min)', data=data)
plt.show()

In [None]:
# Leg-length distribution per Status value.
# `size` is not one of violinplot's documented parameters: depending on the
# seaborn version the stray keyword is silently ignored or rejected, so it is
# not passed.
sns.violinplot(x='Status', y='Longitud de la pierna (m)', data=data)
plt.show()

In [None]:
# Age vs. stride length, coloured by Status.
sns.set_style('whitegrid')
grid = sns.FacetGrid(data, hue='Status', height=6)
grid.map(plt.scatter, 'Edad (años)', 'Logitud de zancada (m)')
grid.add_legend()
plt.show()

In [None]:
# Age vs. cadence, coloured by Status.
sns.set_style('whitegrid')
grid = sns.FacetGrid(data, hue='Status', height=6)
grid.map(plt.scatter, 'Edad (años)', 'Cadencia (paso/min)')
grid.add_legend()
plt.show()

In [None]:
# Age vs. leg length, coloured by Status.
sns.set_style('whitegrid')
grid = sns.FacetGrid(data, hue='Status', height=6)
grid.map(plt.scatter, 'Edad (años)', 'Longitud de la pierna (m)')
grid.add_legend()
plt.show()

# Multivariate Analysis

In [None]:
# Pairwise relationships between all columns, coloured by Status.
sns.pairplot(data=data, hue='Status', height=5)
plt.show()

In [None]:
# One box-and-whisker plot per column, each on its own independent axes
# arranged in a 2x5 grid.
data.plot.box(subplots=True, layout=(2, 5), sharex=False, sharey=False)
plt.show()

In [None]:
# Histograms of every column except the Status label.
features_only = data.drop(columns=['Status'])
features_only.hist()
plt.show()

## Multivariate analysis

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 8))
correlations = data.corr()
hm = sns.heatmap(
    correlations,
    annot=True,
    fmt='.2f',
    cmap="bwr",
    linewidths=.05,
    ax=ax,
)
# Leave room above the axes for the figure-level title.
fig.subplots_adjust(top=0.93)
fig.suptitle(
    'Combined undergone Cerebral palsy Attributes and their Correlation Heatmap',
    fontsize=14,
    fontweight='bold',
);

# Features (x) and target (y)

In [None]:
# Feature matrix: the first four columns of `data`, as a NumPy array.
x = data.iloc[:, 0:4].values
# Target vector: the fifth column (Status).
y = data.iloc[:, 4].values
# Disabled alternative, kept as a real comment: as a triple-quoted string it
# was the last expression of the cell, so the notebook echoed it as output.
# y = label_binarize(y, classes=[1, 2])  # binarize the output

# Principal component analysis

# Status code -> human-readable class description (codes start at 1).
label_dict = dict(enumerate(
    (
        'Neurologically intact child',
        'Child with the spastic diplegia form of cerebral palsy',
    ),
    start=1,
))

# Zero-based feature position -> column name.
feature_dict = dict(enumerate((
    'Logitud de zancada (m)',
    'Cadencia (paso/min)',
    'Longitud de la pierna (m)',
    'Edad (años)',
)))


In [None]:
# Standardise all features, project them onto the first two principal
# components and plot the projection coloured by class.
x_std = StandardScaler().fit_transform(x)

pca = PCA(n_components=2)
Y_sklearn = pca.fit_transform(x_std)

class_colors = {1: 'blue', 2: 'red'}
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(8, 6))
    for lab, col in class_colors.items():
        in_class = y == lab
        plt.scatter(Y_sklearn[in_class, 0], Y_sklearn[in_class, 1], label=lab, c=col)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
# Split the data into a training set and a test set (20% held out) with
# sklearn.model_selection.train_test_split; the seed is fixed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
# Print the sizes as: total, test, train -- first for x, then for y.
for full_set, test_part, train_part in ((x, x_test, x_train), (y, y_test, y_train)):
    print(len(full_set), len(test_part), len(train_part))

In [None]:
# Standardise the feature scales with sklearn.preprocessing.StandardScaler.
# The scaler is fitted on the training split only and reused for the test
# split; y is not transformed because only the independent variables
# (the features) are being scaled.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)


In [None]:
# Apply PCA: keep two principal components. PCA is fitted on the training
# split only and then applied to the test split.
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

# K-means as a feature transform: fit_transform re-expresses each training
# sample in cluster-distance space, and logistic regression is trained on that.
kmeans = KMeans(n_clusters=2)
log_reg = LogisticRegression()
new_X_train = kmeans.fit_transform(x_train)
log_reg.fit(new_X_train, y_train)

# The test split must pass through the same fitted k-means transform before
# prediction. Feeding `x_test` directly handed the model PCA coordinates while
# it was trained on cluster distances; both have two columns, so no error
# exposed the mismatch.
new_X_test = kmeans.transform(x_test)
y_pred = log_reg.predict(new_X_test)

f1 = f1_score(y_test, y_pred, average="macro")
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred, normalize=True)
print('F1: ',f1)
print('Precision: ', precision)
print('Recall: ', recall)
print('Accuracy: ', accuracy)

# Logistic Regression

In [None]:
# Apply logistic regression: train on the training split, predict the test split.
clasificador = LogisticRegression(random_state=42)
clasificador.fit(x_train, y_train)
y_pred = clasificador.predict(x_test)

# Macro-averaged classification metrics plus Cohen's kappa and the MSE of the
# predicted labels. The dropped keyword arguments were all left at their defaults.
f1 = f1_score(y_test, y_pred, average="macro")
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)
k = cohen_kappa_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

for metric_name, metric_value in (
    ('F1: ', f1),
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('Accuracy: ', accuracy),
    ('kappa: ', k),
    ('mse: ', mse),
):
    print(metric_name, metric_value)

In [None]:
# Confusion matrix of the logistic-regression classifier on the test split:
# `cm` holds the raw counts, the plot is drawn from the fitted estimator.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plot_confusion_matrix(estimator=clasificador, X=x_test, y_true=y_test)
plt.show()

In [None]:
# Decision regions of the logistic-regression classifier in the plane of the
# two model inputs, with the test samples overlaid.
X_set, y_set = x_test, y_test
class_cmap = ListedColormap(('red', 'green'))

# Dense grid covering the test samples with a margin of 1 on every side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)
# Classify every grid point and reshape the labels back onto the grid.
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
plt.contourf(X1, X2, clasificador.predict(grid_points).reshape(X1.shape),
             alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` is ambiguous
    # (matplotlib may read it as four values to colour-map) and triggers a warning.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=class_cmap(i), label=j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

# k-fold cross-validation

In [None]:
# Evaluate the logistic-regression model with shuffled 10-fold cross-validation.
# NOTE(review): this scores the model on `x` (the array taken straight from the
# DataFrame), not on the scaled / PCA-projected `x_train` -- confirm that is intended.
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Cross-validation procedure: 10 shuffled folds with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# Model under evaluation.
model = clasificador
# Accuracy of each fold; n_jobs=-1 runs the folds in parallel.
scores = cross_val_score(estimator=model, X=x, y=y, scoring='accuracy', cv=cv, n_jobs=-1)
# Report mean accuracy with its standard deviation in parentheses.
mean_accuracy = mean(scores)
std_accuracy = std(scores)
print('Accuracy: %.3f (%.3f)' % (mean_accuracy, std_accuracy))

# Random Forest


In [None]:
# Random forest: 200 trees, depth capped at 200, fixed seed.
rf = RandomForestClassifier(n_estimators=200, max_depth=200, random_state=42)
rf.fit(x_train, y_train)
prediction = rf.predict(x_test)

# Macro-averaged metrics on the test split.
f1 = f1_score(y_test, prediction, average="macro")
precision = precision_score(y_test, prediction, average="macro")
recall = recall_score(y_test, prediction, average="macro")
accuracy = accuracy_score(y_test, prediction)

for metric_name, metric_value in (
    ('F1: ', f1),
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('Accuracy: ', accuracy),
):
    print(metric_name, metric_value)

In [None]:
# Confusion matrix of the random forest on the test split.
# `prediction` holds the forest's test predictions; `y_pred` was produced by a
# different model in an earlier cell, so it must not be used here.
cm = confusion_matrix(y_test, prediction)
plot_confusion_matrix(rf, x_test, y_test)
plt.show()

In [None]:
# Decision regions of the random forest in the plane of the two model inputs,
# with the test samples overlaid.
X_set, y_set = x_test, y_test
class_cmap = ListedColormap(('red', 'green'))

# Dense grid covering the test samples with a margin of 1 on every side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)
# Classify every grid point and reshape the labels back onto the grid.
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
plt.contourf(X1, X2, rf.predict(grid_points).reshape(X1.shape),
             alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` is ambiguous
    # (matplotlib may read it as four values to colour-map) and triggers a warning.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=class_cmap(i), label=j)
# The regions are drawn from `rf`, so the title names the random forest.
plt.title('Random Forest (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

# Naive Bayes


In [None]:
# Gaussian naive Bayes trained on the existing train/test split.
# (The original cell carried an alternative DataFrame-based split and fit
# inside no-op string literals; they had no effect and are omitted.)
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)

# Mean accuracy on each split, as reported by the estimator's own score().
train_accuracy = gnb.score(x_train, y_train)
test_accuracy = gnb.score(x_test, y_test)
print('Accuracy en el set de Entrenamiento: {:.2f}'.format(train_accuracy))
print('Accuracy en el set de Test: {:.2f}'.format(test_accuracy))

# Macro-averaged metrics on the test split.
f1 = f1_score(y_test, y_pred, average="macro")
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

for metric_name, metric_value in (
    ('F1: ', f1),
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('Accuracy: ', accuracy),
):
    print(metric_name, metric_value)

In [None]:
# Confusion matrix of the naive Bayes classifier on the test split:
# `cm` holds the raw counts, the plot is drawn from the fitted estimator.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plot_confusion_matrix(estimator=gnb, X=x_test, y_true=y_test)
plt.show()

In [None]:
# Decision regions of the naive Bayes classifier in the plane of the two model
# inputs, with the test samples overlaid.
X_set, y_set = x_test, y_test
class_cmap = ListedColormap(('red', 'green'))

# Dense grid covering the test samples with a margin of 1 on every side.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)
# Classify every grid point and reshape the labels back onto the grid.
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
plt.contourf(X1, X2, gnb.predict(grid_points).reshape(X1.shape),
             alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` rather than `c=`: a bare RGBA tuple passed as `c` is ambiguous
    # (matplotlib may read it as four values to colour-map) and triggers a warning.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=class_cmap(i), label=j)
plt.title('Naive Bayes (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

# Support vector machine

In [None]:
# RBF-kernel support vector classifier (C=1, probability estimates enabled).
svc = svm.SVC(kernel='rbf', C=1, probability=True)
svc.fit(x_train, y_train)
prediction = svc.predict(x_test)

# Macro-averaged metrics on the test split.
f1 = f1_score(y_test, prediction, average="macro")
precision = precision_score(y_test, prediction, average="macro")
recall = recall_score(y_test, prediction, average="macro")
accuracy = accuracy_score(y_test, prediction)

print('Svm:')
for metric_name, metric_value in (
    ('F1: ', f1),
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('Accuracy: ', accuracy),
):
    print(metric_name, metric_value)

# MLP

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Multi-layer perceptron with a fixed seed and at most 300 training iterations.
clf = MLPClassifier(random_state=1, max_iter=300)
clf.fit(x_train, y_train)
prediction2 = clf.predict(x_test)

# Macro-averaged metrics on the test split.
f1 = f1_score(y_test, prediction2, average="macro")
precision = precision_score(y_test, prediction2, average="macro")
recall = recall_score(y_test, prediction2, average="macro")
accuracy = accuracy_score(y_test, prediction2)

print('MLP:')
for metric_name, metric_value in (
    ('F1: ', f1),
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('Accuracy: ', accuracy),
):
    print(metric_name, metric_value)