In [1]:
import glob
import os
import string

import numpy as np
import pandas as pd

import nltk
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [2]:
def create_dataframe(folder_name):
    """Load every caption file of one dataset split into a DataFrame.

    Files are looked up as ``./dataset/<folder_name>/sentences/<label>/*.txt``
    relative to the current working directory. Each ``.txt`` file becomes one
    row: its non-empty lines are concatenated into a single caption string
    (every line is prefixed with one space, so the caption starts with a
    space), and the name of the directory containing the file is the label.

    Parameters
    ----------
    folder_name : str
        Name of the split directory under ``./dataset`` (this notebook uses
        ``'train'`` and ``'test'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``caption`` and ``label`` with a 0..n-1 index. When nothing
        matches, the frame is empty but still has both columns.
    """
    records = []

    # glob returns paths in arbitrary, filesystem-dependent order; sorting both
    # levels makes the row order reproducible across machines.
    class_dirs = sorted(glob.glob('./dataset/{}/sentences/*'.format(folder_name)))
    for class_dir in class_dirs:
        # Take the last path component instead of splitting on '\\': the
        # backslash split only worked with Windows separators and raised
        # IndexError for POSIX-style paths.
        label = os.path.basename(os.path.normpath(class_dir))

        for caption_path in sorted(glob.glob('{}/*.txt'.format(class_dir))):
            with open(caption_path, 'r') as file:
                content = file.read()

            # Blank lines are dropped; the rest are joined with a leading space each.
            caption = ''.join(' ' + line for line in content.split('\n') if line)
            records.append({'caption': caption, 'label': label})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(records, columns=['caption', 'label'])

In [3]:
# Load the training split: one row per caption file, labelled by its class directory.
train = create_dataframe('train')
train.head()

Unnamed: 0,caption,label
0,Two gentleman talking in front of propeller p...,aeroplane
1,A D-ERFW-6 in flight. An army green plane fly...,aeroplane
2,a larger plane in flying above a smaller plan...,aeroplane
3,A blue grounded fighter jet is parked on gras...,aeroplane
4,An airplane sitting on the tarmac at an airpo...,aeroplane


In [4]:
# Load the held-out test split with the same loader used for the training data.
test = create_dataframe('test')
test.head()

Unnamed: 0,caption,label
0,An airplane facing the camera. A plane is sit...,aeroplane
1,A gray jet on a tarmac. An airplane being ins...,aeroplane
2,An airplane flies against a colorful sky with...,aeroplane
3,"Two bi-planes are flying side by side, emitti...",aeroplane
4,A blue and orange airplane flying with its la...,aeroplane


In [5]:
# Sanity check: row count, column dtypes and null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532 entries, 0 to 531
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   caption  532 non-null    object
 1   label    532 non-null    object
dtypes: object(2)
memory usage: 4.2+ KB


In [6]:
# Sanity check: row count, column dtypes and null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   caption  380 non-null    object
 1   label    380 non-null    object
dtypes: object(2)
memory usage: 3.0+ KB


In [7]:
def nlprocess(dataset):
    """Normalise the captions of a dataset into stemmed, space-joined tokens.

    Each caption is lowercased, stripped of ASCII punctuation
    (``string.punctuation``), tokenised with NLTK's ``word_tokenize``,
    filtered against NLTK's English stop-word list, stemmed with the Porter
    stemmer and joined back into one string.

    The NLTK tokenizer models and the ``stopwords`` corpus must already be
    installed (e.g. through ``nltk.download``); NLTK raises ``LookupError``
    when a required resource is missing.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``caption`` column of strings. Other columns are ignored.

    Returns
    -------
    list of str
        One preprocessed caption per input row, in row order.
    """
    # Built once rather than per row: the stop-word list and the stemmer do
    # not depend on the caption. A set also makes each membership test O(1).
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    # Translation table that deletes every character in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    preprocessed_comments = []
    # Read the caption column directly so the function no longer requires the
    # frame to have exactly two columns in (caption, label) order.
    for caption in dataset['caption']:
        # Lowercase, then remove punctuation before tokenising.
        cleaned = caption.lower().translate(strip_punctuation)

        tokens = word_tokenize(cleaned)

        # NOTE: a POS-tag filter that dropped 'JJ' and 'CD' tokens used to sit
        # here as commented-out code; it was never active.
        stems = [porter.stem(word) for word in tokens if word not in stop_words]

        preprocessed_comments.append(' '.join(stems))

    return preprocessed_comments

In [8]:
# Preprocess both splits once so every classifier below is trained and
# evaluated on the same cleaned text.
# Disabled experiment kept for reference: preprocessing only the 'person' class.
# preprocessed_captions_train = nlprocess(train[train.label == 'person'])
preprocessed_captions_train = nlprocess(train)
preprocessed_captions_test = nlprocess(test)

In [9]:
def machine_learning(algo):
    """Fit a bag-of-words / TF-IDF text classifier and print its test report.

    Reads the notebook-level variables ``preprocessed_captions_train``,
    ``preprocessed_captions_test``, ``train`` and ``test``; those cells must
    have been run first.

    Parameters
    ----------
    algo : estimator
        Classifier placed as the final ``'clf'`` step of the pipeline, after
        ``CountVectorizer`` and ``TfidfTransformer``.

    Returns
    -------
    None
        The classification report for the test split is printed, not returned.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', algo),
    ]
    pipeline = Pipeline(steps)

    # Train on the preprocessed training captions and their labels.
    pipeline.fit(preprocessed_captions_train, train.label)

    # Score the held-out captions against their true labels.
    predictions = pipeline.predict(preprocessed_captions_test)
    report = metrics.classification_report(test.label, predictions)
    print(report)

In [47]:
# k-nearest neighbours with k=50 on the TF-IDF features.
machine_learning(algo=KNeighborsClassifier(n_neighbors=50))

              precision    recall  f1-score   support

   aeroplane       0.95      0.95      0.95        20
     bicycle       0.66      0.95      0.78        20
        bird       1.00      0.80      0.89        20
        boat       0.90      0.95      0.93        20
         bus       0.73      0.95      0.83        20
         car       0.75      0.60      0.67        20
         cat       0.86      0.90      0.88        20
       chair       0.50      0.20      0.29        20
         cow       0.91      1.00      0.95        20
 diningtable       0.66      0.95      0.78        20
         dog       0.55      0.80      0.65        20
       horse       0.91      1.00      0.95        20
   motorbike       0.79      0.75      0.77        20
      person       0.40      0.20      0.27        20
 pottedplant       1.00      0.35      0.52        20
       sheep       1.00      1.00      1.00        20
        sofa       0.50      0.70      0.58        20
       train       0.91    

In [11]:
# Multinomial naive Bayes with scikit-learn's default settings.
machine_learning(algo=MultinomialNB())

              precision    recall  f1-score   support

   aeroplane       0.95      0.95      0.95        20
     bicycle       0.71      1.00      0.83        20
        bird       1.00      0.85      0.92        20
        boat       0.86      0.95      0.90        20
         bus       0.76      0.95      0.84        20
         car       0.85      0.55      0.67        20
         cat       0.86      0.90      0.88        20
       chair       0.50      0.20      0.29        20
         cow       0.91      1.00      0.95        20
 diningtable       0.66      0.95      0.78        20
         dog       0.59      0.80      0.68        20
       horse       0.87      1.00      0.93        20
   motorbike       0.89      0.85      0.87        20
      person       0.56      0.25      0.34        20
 pottedplant       0.89      0.40      0.55        20
       sheep       0.95      1.00      0.98        20
        sofa       0.64      0.80      0.71        20
       train       0.91    

In [48]:
# Linear model with hinge loss and L2 penalty, trained by SGD.
# tol=None disables the convergence check, so training runs for exactly
# max_iter=5 epochs; random_state=0 fixes the shuffling for repeatability.
machine_learning(algo=SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=0, max_iter=5, tol=None))

              precision    recall  f1-score   support

   aeroplane       1.00      1.00      1.00        20
     bicycle       0.70      0.95      0.81        20
        bird       1.00      0.85      0.92        20
        boat       0.90      0.95      0.93        20
         bus       0.79      0.95      0.86        20
         car       0.67      0.60      0.63        20
         cat       0.86      0.90      0.88        20
       chair       0.56      0.25      0.34        20
         cow       0.95      1.00      0.98        20
 diningtable       0.67      1.00      0.80        20
         dog       0.64      0.80      0.71        20
       horse       0.87      1.00      0.93        20
   motorbike       0.94      0.80      0.86        20
      person       0.45      0.25      0.32        20
 pottedplant       0.67      0.40      0.50        20
       sheep       1.00      1.00      1.00        20
        sofa       0.73      0.80      0.76        20
       train       0.91    

In [34]:
# Logistic regression; C is the inverse regularization strength, so C=1e5
# leaves the model almost unregularized.
machine_learning(algo=LogisticRegression(n_jobs=1, C=1e5))

              precision    recall  f1-score   support

   aeroplane       1.00      0.95      0.97        20
     bicycle       0.67      0.90      0.77        20
        bird       1.00      0.85      0.92        20
        boat       0.86      0.95      0.90        20
         bus       0.75      0.90      0.82        20
         car       0.58      0.55      0.56        20
         cat       0.86      0.95      0.90        20
       chair       0.32      0.30      0.31        20
         cow       0.91      1.00      0.95        20
 diningtable       0.56      0.70      0.62        20
         dog       0.65      0.65      0.65        20
       horse       0.87      1.00      0.93        20
   motorbike       1.00      0.65      0.79        20
      person       0.33      0.20      0.25        20
 pottedplant       0.70      0.35      0.47        20
       sheep       0.95      1.00      0.98        20
        sofa       0.71      0.85      0.77        20
       train       0.91    