In [263]:
import numpy as np
import pandas as pd
import glob

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import string

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [54]:
def create_dataframe(folder_name):
    
    dataset = pd.DataFrame(columns=['caption', 'label'])

    folders = glob.glob('./dataset/{}/sentences/*'.format(folder_name))
    for folder in folders:
        classes = glob.glob('{}/*.txt'.format(folder))
        for caption in classes:
            with open(caption, 'r') as file:
                inside_file = file.read()
                label = str(folder).split('\\')[1]
                for line in inside_file.split('\n'):
                    if line:
                        dataset = dataset.append({'caption': line, 'label': label}, ignore_index=True)
    return dataset

In [55]:
train = create_dataframe('train')
train.head()

Unnamed: 0,caption,label
0,Two gentleman talking in front of propeller pl...,aeroplane
1,Two men are conversing next to a small airplane.,aeroplane
2,Two men talking in front of a plane,aeroplane
3,Two men talking in front of a small plane.,aeroplane
4,Two men talk while standing next to a small pa...,aeroplane


In [56]:
test = create_dataframe('test')
test.head()

Unnamed: 0,caption,label
0,An airplane facing the camera.,aeroplane
1,A plane is sitting on the cement at a small ai...,aeroplane
2,Front of a twin engine propeller airplane.,aeroplane
3,Front view of a propeller airplane parked on a...,aeroplane
4,The historic planes sits in its place.,aeroplane


In [142]:
# import spacy

# #I'm gonna use medium size of this model
# nlp = spacy.load('en_core_web_md')

In [143]:
# def spacy_semantic_similarity(sentence1, sentence2):
#     sentence1 = nlp(sentence1)
#     sentence2 = nlp(sentence2)
    
#     return sentence1.similarity(sentence2)

In [365]:
def nlprocess(dataset):
    
    preprocessed_comments = []
    for _, (caption, label) in dataset.iterrows():

        #Lowercase
        caption = caption.lower()

        # Removing Punctuation
        caption = "".join([char for char in caption if char not in string.punctuation])

        # Word Tokenization
        caption = word_tokenize(caption)

        # Stopword Filtering
        stop_words = stopwords.words('english')
        caption = [word for word in caption if word not in stop_words]

        # Filter 
        caption = pos_tag(caption)
        caption = [word for word, tag in caption if tag not in ('JJ', 'CD')]


        # Stemming
    #     porter = PorterStemmer()
    #     caption = [porter.stem(word) for word in caption]


        caption = ' '.join(caption)

        preprocessed_comments.append(caption)

    return preprocessed_comments

In [366]:
preprocessed_captions_train = nlprocess(train)
preprocessed_captions_test = nlprocess(test)

In [367]:
X_train = preprocessed_captions_train
X_test = preprocessed_captions_test

y_train = train.label
y_test = test.label

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=50))])



text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

   aeroplane       0.85      0.90      0.87       100
     bicycle       0.66      0.67      0.66       100
        bird       0.87      0.74      0.80       100
        boat       0.82      0.80      0.81       100
         bus       0.62      0.85      0.71       100
         car       0.48      0.43      0.45       100
         cat       0.59      0.79      0.68       100
       chair       0.34      0.18      0.23       101
         cow       0.70      0.89      0.78       100
 diningtable       0.50      0.69      0.58       100
         dog       0.47      0.58      0.52       100
       horse       0.61      0.91      0.73       100
   motorbike       0.81      0.69      0.75       100
      person       0.20      0.10      0.13       100
 pottedplant       0.63      0.27      0.38       100
       sheep       0.81      0.52      0.63       100
        sofa       0.49      0.56      0.52       100
       train       0.85    

In [368]:
# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# preprocessed_captions_train = label_encoder.fit_transform(preprocessed_captions_train)
# preprocessed_captions_test = label_encoder.fit_transform(preprocessed_captions_test)

In [369]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.66070489216202
              precision    recall  f1-score   support

   aeroplane       0.85      0.90      0.87       100
     bicycle       0.66      0.78      0.71       100
        bird       0.86      0.79      0.82       100
        boat       0.86      0.87      0.87       100
         bus       0.70      0.88      0.78       100
         car       0.55      0.48      0.51       100
         cat       0.65      0.79      0.71       100
       chair       0.31      0.18      0.23       101
         cow       0.71      0.91      0.80       100
 diningtable       0.55      0.70      0.62       100
         dog       0.48      0.47      0.48       100
       horse       0.80      0.91      0.85       100
   motorbike       0.67      0.70      0.68       100
      person       0.21      0.18      0.19       100
 pottedplant       0.62      0.34      0.44       100
       sheep       0.77      0.49      0.60       100
        sofa       0.53      0.62      0.57       100
 

In [370]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)


y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6870068385060495
              precision    recall  f1-score   support

   aeroplane       0.91      0.91      0.91       100
     bicycle       0.61      0.83      0.71       100
        bird       0.86      0.81      0.84       100
        boat       0.76      0.93      0.84       100
         bus       0.71      0.87      0.78       100
         car       0.60      0.52      0.56       100
         cat       0.80      0.89      0.84       100
       chair       0.42      0.20      0.27       101
         cow       0.76      0.94      0.84       100
 diningtable       0.58      0.69      0.63       100
         dog       0.46      0.63      0.53       100
       horse       0.82      0.95      0.88       100
   motorbike       0.76      0.79      0.77       100
      person       0.21      0.08      0.12       100
 pottedplant       0.67      0.33      0.44       100
       sheep       0.68      0.55      0.61       100
        sofa       0.52      0.55      0.53       100

In [371]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)


y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6212519726459758
              precision    recall  f1-score   support

   aeroplane       0.96      0.81      0.88       100
     bicycle       0.70      0.78      0.74       100
        bird       0.92      0.77      0.84       100
        boat       0.76      0.76      0.76       100
         bus       0.66      0.77      0.71       100
         car       0.36      0.38      0.37       100
         cat       0.83      0.80      0.82       100
       chair       0.21      0.23      0.22       101
         cow       0.78      0.86      0.82       100
 diningtable       0.44      0.43      0.43       100
         dog       0.40      0.29      0.34       100
       horse       0.89      0.91      0.90       100
   motorbike       0.81      0.71      0.76       100
      person       0.17      0.14      0.16       100
 pottedplant       0.35      0.37      0.36       100
       sheep       0.67      0.64      0.66       100
        sofa       0.48      0.51      0.49       100

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
