In [31]:
from keras.layers import Dense,LSTM,Input,Concatenate,Embedding,add,Dropout
from keras.models import Model,Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import keras
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import array_to_img

In [32]:
data=pd.read_csv("captions.txt",sep=",") #1k unique image repeated times ith different caption
print(data.shape)
data.head()

(40455, 2)


Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [33]:
data["image"].unique().shape

(8091,)

In [34]:
data["caption"][0]

'A child in a pink dress is climbing up a set of stairs in an entry way .'

In [35]:
X_train,X_test=train_test_split(data,test_size=0.2)

In [36]:
X_train["image"].unique().shape,X_test["image"].unique().shape

((8090,), (5466,))

# Text preprocessing 

In [37]:
def check_words_of_length_one(df,st):
    count=0
    for i in X_train["caption"].str.split(" "):
        for j in i:
            if len(j)==1:
                count=count+1
    print("There are {} words whose length is one in {}".format(count,st))
check_words_of_length_one(X_train,"X_train")
check_words_of_length_one(X_test,"X_test")

There are 82570 words whose length is one in X_train
There are 82570 words whose length is one in X_test


In [38]:
def remove_words_length1(text): #here text is list of words
    for i in text:
        if len(i)==1:
            text.remove(i)
    return " ".join(text)

def remove_words_occurs_lessthan_5(df):
    tfidf=TfidfVectorizer(min_df=1) #min_df=5 means only considers words a index which occurs >= 5 and ignore <5
    tfidf=tfidf.fit(df["caption"])
    words_occurs_more_than_5=tfidf.get_feature_names_out() #it gives array of words with words more than occuring 5 times
    L=[]
    for i in df["caption"].str.split(" "): #this loop takes each row as list and check each word present in word_occurs_5 list and create the whole list of text
        r=[]
        for j in i:
            if j in words_occurs_more_than_5:
                r.append(j)
        L.append(" ".join(r))
    return L
    
def remove_punctuation(text):
    punctuation=string.punctuation
    for i in punctuation:
        text=text.replace(i,"")
    return text
def add_start_end(text):
    text="start"+" "+text
    text=text+" "+"end"
    return text
def preprocessing_text(df):
    df["caption"]=df["caption"].str.lower()  #converting to lower case
    df["caption"]=df["caption"].apply(remove_punctuation) #removing punctuations
    df["caption"]=df["caption"].str.replace("<.*>","",regex=True) #removing the html tags
    df["caption"]=df["caption"].str.replace("https?://\S+|www\.\S+","",regex=True) #removing the url
    df["caption"]=df["caption"].str.strip() #removing extra spaces from the start and from the end
    df["caption"]=df["caption"].str.split(" ").apply(remove_words_length1) #remove words of length 1
    df["caption"]=remove_words_occurs_lessthan_5(df)
    df["output"]=df["caption"].apply(add_start_end)
preprocessing_text(X_train)
preprocessing_text(X_test)

In [39]:
X_train.head(1)

Unnamed: 0,image,caption,output
18661,3025334206_76888792e5.jpg,two dogs are running on the grass in front of ...,start two dogs are running on the grass in fro...


In [40]:
X_train["caption"][18661],X_train["output"][18661]

('two dogs are running on the grass in front of the trees',
 'start two dogs are running on the grass in front of the trees end')

In [41]:
X_test.head(1)

Unnamed: 0,image,caption,output
12378,2629334536_11f2d49e05.jpg,young person wearing shorts and blue top is sw...,start young person wearing shorts and blue top...


In [42]:
X_test["caption"][12378],X_test["output"][12378]

('young person wearing shorts and blue top is swinging in child protector seat',
 'start young person wearing shorts and blue top is swinging in child protector seat end')

In [44]:
X_train.to_csv("Data\\train.csv",columns=["image","output"],index=False)
X_test.to_csv("Data\\test.csv",columns=["image","output"],index=False)

# lets create a word_to_idx dictionary and idx_to_word dictionary

In [45]:
def word_index(df):
    count=0
    word_to_idx={}
    for i in df["output"]:
        for j in i.split(" "):
            if j not in word_to_idx:
                word_to_idx[j]=count
                count=count+1
    idx_to_word={j:i for i,j in word_to_idx.items()}
    return idx_to_word,word_to_idx
idx_to_word,word_to_idx=word_index(X_train)
vocab_size=len(idx_to_word) #length of unique words in our corpus
len(idx_to_word),len(word_to_idx)

(8082, 8082)

In [46]:
with open('Data\\idx_to_word.json', 'w') as fp:
        json.dump(idx_to_word, fp)
with open('Data\\word_to_idx.json', 'w') as fp:
        json.dump(word_to_idx, fp)