In [3]:
import glob
import os

import numpy as np
import pandas as pd

In [54]:
def create_dataframe(folder_name):
    """Load every caption under ``./dataset/<folder_name>/sentences`` into a DataFrame.

    Each entry directly inside ``sentences`` is treated as one class: its name
    becomes the label of every non-empty line found in the ``*.txt`` files it
    contains.

    Parameters
    ----------
    folder_name : str
        Name of the dataset split directory, e.g. ``'train'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty line, with columns ``caption`` and ``label``.
        When nothing matches, the frame is empty but still has both columns.
    """
    records = []

    # glob returns entries in an unspecified order; sorting keeps the row
    # order reproducible across machines and runs.
    class_dirs = sorted(glob.glob('./dataset/{}/sentences/*'.format(folder_name)))
    for class_dir in class_dirs:
        # Take the last path component as the label. This works with either
        # path separator, unlike splitting on a literal backslash.
        label = os.path.basename(os.path.normpath(class_dir))
        for caption_path in sorted(glob.glob('{}/*.txt'.format(class_dir))):
            with open(caption_path, 'r') as file:
                for line in file.read().split('\n'):
                    if line:
                        records.append({'caption': line, 'label': label})

    # Build the frame once from the collected records: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    return pd.DataFrame(records, columns=['caption', 'label'])

In [55]:
# Build the training split from ./dataset/train/sentences and preview its first rows.
train = create_dataframe('train')
train.head()

Unnamed: 0,caption,label
0,Two gentleman talking in front of propeller pl...,aeroplane
1,Two men are conversing next to a small airplane.,aeroplane
2,Two men talking in front of a plane,aeroplane
3,Two men talking in front of a small plane.,aeroplane
4,Two men talk while standing next to a small pa...,aeroplane


In [56]:
# Build the held-out split from ./dataset/test/sentences and preview its first rows.
test = create_dataframe('test')
test.head()

Unnamed: 0,caption,label
0,An airplane facing the camera.,aeroplane
1,A plane is sitting on the cement at a small ai...,aeroplane
2,Front of a twin engine propeller airplane.,aeroplane
3,Front view of a propeller airplane parked on a...,aeroplane
4,The historic planes sits in its place.,aeroplane


In [59]:
import spacy

# Load the medium-sized English spaCy model once; the resulting `nlp` pipeline
# is the module-level object that spacy_semantic_similarity calls.
nlp = spacy.load('en_core_web_md')

In [None]:
def spacy_semantic_similarity(sentence1, sentence2):
    """Return the spaCy similarity score between two sentences.

    Both inputs are passed through the module-level ``nlp`` pipeline, and the
    first resulting document is compared against the second with its
    ``similarity`` method.
    """
    first_doc, second_doc = nlp(sentence1), nlp(sentence2)
    return first_doc.similarity(second_doc)


In [71]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Captions are the model input; the class labels are the prediction target.
X_train, y_train = train.caption, train.label
X_test, y_test = test.caption, test.label

# Earlier two-step formulation (count vectors followed by a TF-IDF transform),
# kept for reference:
# text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', KNeighborsClassifier())])

# TF-IDF features with English stop words removed, fed to a k-nearest-neighbours
# classifier left at its default settings.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', KNeighborsClassifier()),
    ]
)

# Fit on the training captions, then predict labels for the test captions.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Print the per-class report; the overall accuracy is the cell's displayed value.
print(metrics.classification_report(y_test, predicted))
accuracy_score(y_test, predicted)

              precision    recall  f1-score   support

   aeroplane       0.73      0.82      0.77       100
     bicycle       0.44      0.57      0.50       100
        bird       0.67      0.71      0.69       100
        boat       0.68      0.79      0.73       100
         bus       0.54      0.74      0.63       100
         car       0.46      0.41      0.43       100
         cat       0.53      0.70      0.61       100
       chair       0.23      0.17      0.19       101
         cow       0.67      0.70      0.69       100
 diningtable       0.51      0.56      0.54       100
         dog       0.44      0.39      0.41       100
       horse       0.78      0.72      0.75       100
   motorbike       0.69      0.61      0.65       100
      person       0.21      0.11      0.14       100
 pottedplant       0.56      0.22      0.32       100
       sheep       0.74      0.75      0.74       100
        sofa       0.50      0.43      0.46       100
       train       0.77    

0.5728563913729616