In [3]:
import glob
import os

import numpy as np
import pandas as pd

In [54]:
def create_dataframe(folder_name):
    """Collect every caption under ``./dataset/<folder_name>/sentences`` into a DataFrame.

    The expected layout is one sub-folder per class, each holding ``*.txt``
    files with one caption per line. Empty lines are skipped.

    Parameters
    ----------
    folder_name : str
        Name of the split folder inside ``./dataset`` (e.g. ``'train'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``caption`` (one non-empty line of a caption file) and
        ``label`` (name of the class folder the file was found in). The frame
        is empty, with the same two columns, when nothing matches.
    """
    records = []

    # sorted() makes the row order reproducible; glob order is filesystem-dependent.
    folders = sorted(glob.glob('./dataset/{}/sentences/*'.format(folder_name)))
    for folder in folders:
        # The class label is the last path component. basename/normpath copes
        # with both '/' and '\\' separators, unlike splitting on '\\' alone.
        label = os.path.basename(os.path.normpath(folder))
        for caption_path in sorted(glob.glob('{}/*.txt'.format(folder))):
            with open(caption_path, 'r') as file:
                inside_file = file.read()
            for line in inside_file.split('\n'):
                if line:
                    records.append({'caption': line, 'label': label})

    # Build the frame once; appending row by row copies the whole frame each
    # time and DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=['caption', 'label'])

In [55]:
# Load every caption found under ./dataset/train/sentences and preview the first rows.
train = create_dataframe('train')
train.head()

Unnamed: 0,caption,label
0,Two gentleman talking in front of propeller pl...,aeroplane
1,Two men are conversing next to a small airplane.,aeroplane
2,Two men talking in front of a plane,aeroplane
3,Two men talking in front of a small plane.,aeroplane
4,Two men talk while standing next to a small pa...,aeroplane


In [56]:
# Load every caption found under ./dataset/test/sentences and preview the first rows.
test = create_dataframe('test')
test.head()

Unnamed: 0,caption,label
0,An airplane facing the camera.,aeroplane
1,A plane is sitting on the cement at a small ai...,aeroplane
2,Front of a twin engine propeller airplane.,aeroplane
3,Front view of a propeller airplane parked on a...,aeroplane
4,The historic planes sits in its place.,aeroplane


In [92]:
import spacy

# Load the medium-sized English spaCy model ('en_core_web_md'); `nlp` is the
# shared pipeline object used by the similarity helper in this notebook.
nlp = spacy.load('en_core_web_md')

In [None]:
def spacy_semantic_similarity(sentence1, sentence2):
    """Return the spaCy similarity score between two sentences.

    Both inputs are run through the module-level ``nlp`` pipeline and the
    result of calling ``similarity`` on the first parsed object with the
    second is returned unchanged.
    """
    first_doc, second_doc = nlp(sentence1), nlp(sentence2)
    return first_doc.similarity(second_doc)


In [78]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import string


In [98]:
# Build the cleaned training corpus: one normalised string per caption in `train`.

# Loop-invariant resources, created once instead of once per caption.
# NLTK's English stop words, minus 'not' and 'very', which are deliberately kept.
stop_words = [word for word in stopwords.words('english') if word not in ('not', 'very')]
stop_word_set = frozenset(stop_words)  # constant-time membership checks
porter = PorterStemmer()
# Translation table that deletes every character listed in string.punctuation.
strip_punctuation = str.maketrans('', '', string.punctuation)

preprocessed_comments = []
for caption in train['caption']:
    # Lowercase, remove punctuation, then split into word tokens.
    tokens = word_tokenize(caption.lower().translate(strip_punctuation))

    # (A noun-only POS filter was tried at this point and left disabled.)

    # Drop stop words and reduce the remaining words to their Porter stems.
    stems = [porter.stem(word) for word in tokens if word not in stop_word_set]

    preprocessed_comments.append(' '.join(stems))

preprocessed_comments

['two gentleman talk front propel plane',
 'two men convers next small airplan',
 'two men talk front plane',
 'two men talk front small plane',
 'two men talk stand next small passeng plane airport',
 'derfw6 flight',
 'armi green plane fli sky',
 'old fighter plane fli german militari mark',
 'small green yellow plane sky',
 'wwii fighter plane land gear',
 'larger plane fli smaller plane',
 'black white scene two plane fli',
 'two airplan sky',
 'two fighter plane midflight',
 'two militari plane fli near',
 'blue ground fighter jet park grass front glass build',
 'blue jet stop lawn',
 'fighter jet sit display',
 'jet plane exhibit front modern build',
 'blue fighter jet park green grass',
 'airplan sit tarmac airport anoth plane background',
 'white blue airplan park airport near anoth small plane',
 'blue white airplan park',
 'two airplan wait tarmac',
 'two airplan park airport',
 'airplan approach runway',
 'swissair flight taken runway',
 'white swiss airplan approach runway'

In [99]:
# Build the cleaned test corpus: one normalised string per caption in `test`.
# The steps mirror the training preprocessing so both splits share one vocabulary.

# Loop-invariant resources, created once instead of once per caption.
# NLTK's English stop words, minus 'not' and 'very', which are deliberately kept.
stop_words = [word for word in stopwords.words('english') if word not in ('not', 'very')]
stop_word_set = frozenset(stop_words)  # constant-time membership checks
porter = PorterStemmer()
# Translation table that deletes every character listed in string.punctuation.
strip_punctuation = str.maketrans('', '', string.punctuation)

preprocessed_comments_test = []
for caption in test['caption']:
    # Lowercase, remove punctuation, then split into word tokens.
    tokens = word_tokenize(caption.lower().translate(strip_punctuation))

    # (A noun-only POS filter was tried at this point and left disabled.)

    # Drop stop words and reduce the remaining words to their Porter stems.
    stems = [porter.stem(word) for word in tokens if word not in stop_word_set]

    preprocessed_comments_test.append(' '.join(stems))

preprocessed_comments_test

['airplan face camera',
 'plane sit cement small airport',
 'front twin engin propel airplan',
 'front view propel airplan park runway',
 'histor plane sit place',
 'gray jet tarmac',
 'airplan inspect take',
 'airplan get servic check next flight',
 'sever peopl stand beneath larg gray plane',
 'sever men stand around plane',
 'airplan fli color sky sun rise',
 'plane fli near sunset',
 'silhouett plane take distanc sunset',
 'plane midflight sunset',
 'airplan fli high yellow color sky',
 'two biplan fli side side emit contrail',
 'two biplan fli close togeth amidst lot smoke',
 'two plane leav smoki trail sky',
 'two stunt airplan pilot stand wing',
 'two stunt biplan walker smoke plume',
 'blue orang airplan fli land gear',
 'blue red airplan flight',
 'red blue plane fli land gear',
 'blue red plane midair flight',
 'front half blue orang airplan land gear',
 'biplan perform aerobat',
 'blue yellow plane fli straight emit white smoke',
 'airplan dive posit',
 'trick plane make loo

In [128]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Features are the preprocessed caption strings built earlier; targets are the
# `label` columns of the corresponding DataFrames.
X_train = preprocessed_comments
X_test = preprocessed_comments_test

y_train = train.label
y_test = test.label

# Pipeline: token counts -> TF-IDF weighting -> k-nearest-neighbours classifier.
# NOTE(review): stop_words='english' applies scikit-learn's built-in stop list on
# top of the NLTK filtering done during preprocessing; it may drop the 'not' /
# 'very' tokens that preprocessing deliberately kept -- confirm this is intended.
# NOTE(review): n_neighbors=50 is hard-coded; no tuning is visible in this cell.
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=50))])



# Fit on the training captions, predict the test captions, and print the
# per-class precision / recall / f1 report.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

   aeroplane       0.85      0.91      0.88       100
     bicycle       0.66      0.84      0.74       100
        bird       0.90      0.78      0.83       100
        boat       0.83      0.85      0.84       100
         bus       0.68      0.88      0.77       100
         car       0.49      0.43      0.46       100
         cat       0.66      0.86      0.75       100
       chair       0.33      0.13      0.19       101
         cow       0.82      0.93      0.87       100
 diningtable       0.55      0.78      0.64       100
         dog       0.60      0.64      0.62       100
       horse       0.79      0.89      0.84       100
   motorbike       0.76      0.71      0.74       100
      person       0.22      0.12      0.15       100
 pottedplant       0.65      0.34      0.45       100
       sheep       0.93      0.90      0.91       100
        sofa       0.55      0.67      0.60       100
       train       0.83    