In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('full_df.csv', encoding='ISO-8859-1')

In [3]:
df

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,42,Male,2_left.jpg,2_right.jpg,laser spotï¼moderate non proliferative retino...,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,4686,63,Male,4686_left.jpg,4686_right.jpg,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4686_left.jpg
6388,4688,42,Male,4688_left.jpg,4688_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4688_left.jpg
6389,4689,54,Male,4689_left.jpg,4689_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4689_left.jpg
6390,4690,57,Male,4690_left.jpg,4690_right.jpg,mild nonproliferative retinopathy,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4690_left.jpg


In [4]:
df = df[['labels', 'filepath']]

In [5]:
df

Unnamed: 0,labels,filepath
0,['N'],../input/ocular-disease-recognition-odir5k/ODI...
1,['N'],../input/ocular-disease-recognition-odir5k/ODI...
2,['D'],../input/ocular-disease-recognition-odir5k/ODI...
3,['D'],../input/ocular-disease-recognition-odir5k/ODI...
4,['D'],../input/ocular-disease-recognition-odir5k/ODI...
...,...,...
6387,['D'],../input/ocular-disease-recognition-odir5k/ODI...
6388,['D'],../input/ocular-disease-recognition-odir5k/ODI...
6389,['D'],../input/ocular-disease-recognition-odir5k/ODI...
6390,['D'],../input/ocular-disease-recognition-odir5k/ODI...


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [8]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None, **fit_params):
        X_copy = X.copy()
        for i in range(len(X)):
            X_copy[i] = ' '.join([token.lower()
                                for token in word_tokenize(X_copy[i])])
            return X_copy

In [9]:
norm = TextNormalizer()

In [10]:
norm.fit_transform(['There is the house, where my family lives!'])

['there is the house , where my family lives !']

In [11]:
class WordExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words):
        self.stop_words = stop_words
    
    def fit(self, X, y=None, **fit_params):
        self.general_freq = FreqDist()
        for document in X:
            tokens = word_tokenize(document)
            freq = FreqDist(tokens)
            self.general_freq.update(freq)
        self.hapaxes = self.general_freq.hapaxes()
        return self
    
    def transform(self, X, y=None, **fit_params):
        X_copy = X.copy()
        for i in range(len(X)):
            X_copy[i] = ' '.join([token for token in word_tokenize(X[i])
                                 if token not in self.hapaxes and
                                 token not in self.stop_words])
        return X_copy

In [12]:
stop_words = stopwords.words('english')

In [13]:
word_extractor = WordExtractor(stop_words)

In [14]:
corpus = [
    'His name is Paul',
    'Paul likes to paint',
    'Paul likes to play sports',
    'A very good sport is running'
]

In [15]:
word_extractor.fit_transform(corpus)

['Paul', 'Paul likes', 'Paul likes', '']

In [16]:
class ApplyStemmer(BaseEstimator, TransformerMixin):
    def __init__(self, stemmer):
        self.stemmer = stemmer
    
    def fit(self, X, y=None, **fit_tranform):
        return self
    
    def transform(self, X, y=None, **fit_tranform):
        X_copy = X.copy()
        for i in range(len(X)):
            X_copy[i] = ' '.join([self.stemmer.stem(token) 
                                  for token in word_tokenize(X_copy[i])])
        return X_copy

In [17]:
porter_stemmer = PorterStemmer()

In [18]:
apply_stemmer = ApplyStemmer(porter_stemmer)

In [19]:
apply_stemmer.fit_transform(['There is the house, where my family lives!'])

['there is the hous , where my famili live !']

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
pipe = Pipeline([
    ('norm', TextNormalizer()),
    ('Extractor', WordExtractor(stop_words)),
    ('stemmer', ApplyStemmer(PorterStemmer())),
    ('vectorizer', CountVectorizer()),
    ('logic', LogisticRegression())
])

In [23]:
X = df['filepath'].values
y = df['labels'].values

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [25]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('norm', TextNormalizer()),
                ('Extractor',
                 WordExtractor(stop_words=['i', 'me', 'my', 'myself', 'we',
                                           'our', 'ours', 'ourselves', 'you',
                                           "you're", "you've", "you'll",
                                           "you'd", 'your', 'yours', 'yourself',
                                           'yourselves', 'he', 'him', 'his',
                                           'himself', 'she', "she's", 'her',
                                           'hers', 'herself', 'it', "it's",
                                           'its', 'itself', ...])),
                ('stemmer', ApplyStemmer(stemmer=<PorterStemmer>)),
                ('vectorizer', CountVectorizer()),
                ('logic', LogisticRegression())])

In [26]:
y_pred = pipe.predict(X_test)

In [27]:
accuracy_score(y_pred, y_test)

0.44806007509386736