In [1]:
!rm -rf data
!mkdir data
!wget https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/dibrd/v0.1/train.csv -O data/train.csv
!wget https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/dibrd/v0.1/test.csv -O data/test.csv

--2020-05-20 18:22:58--  https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/dibrd/v0.1/train.csv
Resolving s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)... 130.117.252.12, 130.117.252.13, 130.117.252.10, ...
Connecting to s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)|130.117.252.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 116826 (114K) [text/csv]
Saving to: ‘data/train.csv’


2020-05-20 18:23:00 (162 KB/s) - ‘data/train.csv’ saved [116826/116826]

--2020-05-20 18:23:02--  https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/dibrd/v0.1/test.csv
Resolving s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)... 130.117.252.12, 130.117.252.13, 130.117.252.10, ...
Connecting to s3.eu-central-1.wasabisys.com (s3.eu-central-1.wasabisys.com)|130.117.252.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29188 (29K) [text/csv]
Saving to: ‘data/test.csv’




## Import packages

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score
from tqdm import tqdm

## Load Data
We use pandas library to load our data. Pandas loads them into dataframes which helps us analyze our data easily. Learn more about it [here](https://www.tutorialspoint.com/python_data_science/python_pandas.htm)

In [0]:
train_data_path = "data/train.csv" #path where data is stored

In [0]:
train_data = pd.read_csv(train_data_path,header=None) #load data in dataframe using pandas

## Visualise the Dataset

In [5]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1,1,75,63,60,55,48,35,13.195493,4.396967,0.10407,0.0,0.0,0.0,0.0,0.0,0.513092,0.123966,0,1
1,1,1,79,76,74,72,69,50,61.559348,28.959444,12.778104,2.045287,0.038016,0.0,0.0,0.0,0.527993,0.101884,0,1
2,1,1,41,41,40,40,38,35,6.090116,0.834492,0.02746,0.0,0.0,0.0,0.0,0.0,0.506881,0.091535,1,0
3,1,1,17,16,16,14,12,9,75.438535,20.3525,5.237412,0.206817,0.003884,0.000971,0.000971,0.000971,0.544614,0.089329,1,1
4,1,1,63,63,63,59,57,48,13.558211,5.366467,0.604079,0.051511,0.0,0.0,0.0,0.0,0.552941,0.112387,0,1


You can see the columns goes from 0 to 19, where columns from 0 to 18 represents features extracted from the image set and last column represents the type of patient i.e 1 if if signs of Diabetic Retinopathy is present else 0.

## Define the Classifier
Now we come to the juicy part. We have fixed our data and now we train a classifier. The classifier will learn the function by looking at the inputs and corresponding outputs. There are a ton of classifiers to choose from some being [Logistic Regression](https://towardsdatascience.com/logistic-regression-detailed-overview-46c4da4303bc), [SVM](https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47), [Random Forests](https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47), [Decision Trees](https://towardsdatascience.com/decision-trees-in-machine-learning-641b9c4e8052), etc.   
Tip: A good model doesnt depend solely on the classifier but on the features(columns) you choose. So make sure to play with your data and keep only whats important. 

In [0]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = SVC(kernel='poly', C = 100, gamma=0.0001, coef0=1.5, probability=True)
clf2 = SVC(kernel='poly', C = 10, gamma=0.0001, coef0=2.5, probability=True)
clf3 = SVC(kernel='poly', C = 1, gamma=0.0001, coef0=1.5, probability=True)
clf4 = LogisticRegression(max_iter=5000)

eclf = VotingClassifier(
    estimators=[('svc1', clf1), ('svc2', clf2), ('svc3', clf3), ('lr', clf4)],
    voting='soft', weights=[1,1,1,1])

## Train the classifier

In [7]:
X = np.array(train_data.iloc[:])
X_train, X_val= train_test_split(X, test_size=0.2, random_state=69)
X_train,y_train = X_train[:,:-1],X_train[:,-1]
X_val,y_val = X_val[:,:-1],X_val[:,-1]

idx = [0, 1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 15, 16, 17]
X_train = X_train[:, idx]
eclf.fit(X_train, y_train)

VotingClassifier(estimators=[('svc1',
                              SVC(C=100, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=1.5,
                                  decision_function_shape='ovr', degree=3,
                                  gamma=0.0001, kernel='poly', max_iter=-1,
                                  probability=True, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False)),
                             ('svc2',
                              SVC(C=10, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=2.5,
                                  decision_fun...
                                  shrinking=True, tol=0.001, verbose=False)),
                             ('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                           

## Predict on Validation
Now we predict our trained classifier on the validation set and evaluate our model# Predict on test set

In [0]:
X_val = X_val[:, idx]
yy = predsv = eclf.predict(X_val)

## Evaluate the Performance
We use the same metrics as that will be used for the test set.  
[F1 score](https://en.wikipedia.org/wiki/F1_score) are the metrics for this challenge

In [0]:
precision = precision_score(y_val,yy,average='micro')
recall = recall_score(y_val,yy,average='micro')
accuracy = accuracy_score(y_val,yy)
f1 = f1_score(y_val,yy,average='macro')

In [10]:
print("Accuracy of the model is :" ,accuracy)
print("Recall of the model is :" ,recall)
print("Precision of the model is :" ,precision)
print("F1 score of the model is :" ,f1)

Accuracy of the model is : 0.7771739130434783
Recall of the model is : 0.7771739130434783
Precision of the model is : 0.7771739130434783
F1 score of the model is : 0.7756831494751866


# Prediction on Evaluation Set

## Load Test Set
Load the test data now# Load the evaluation data

In [0]:
final_test_path = "data/test.csv"
final_test = pd.read_csv(final_test_path,header=None)
ft = np.array(final_test, dtype=np.float32)

## Predict Test Set
Time for the moment of truth! Predict on test set and time to make the submission.

In [0]:
ft = ft[:, idx]
submission = eclf.predict(ft)

## Save the prediction to csv

In [0]:
submission = pd.DataFrame(submission)
submission.to_csv('submission_dibrd.csv',header=['label'],index=False)