### Microsoft Coco Process

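This notebook processes the Microsoft COCO dataset: it loads the VQA v2 question/answer files and the COCO 2017 caption/instance files, normalizes the text columns, and exports the results as CSV files.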

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import re
import glob
from pathlib import Path
import shutil
from os.path import isfile, join
import json

In [2]:
# Target language for the learner; its capitalized form is used as a folder name
# when building the result path in the next cell.
# Options listed by the author: Arabic, English, French, German, Turkish,
# Spanish, Portuguese, Dutch, Italian.
lang_folder = "Turkish"
# lang_pair = "English"  # native language (same option list); currently disabled

In [3]:
# Result folder for this notebook's exported CSV files, nested under the
# per-language project directory. It is created up front (parents included)
# and left alone if it already exists.
path = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/"
    "Lemma Stem POS/Result/4-0-Microsoft Coco Process"
)

Path(path).mkdir(exist_ok=True, parents=True)

In [4]:
def lower_strip_func(x):
    """Return ``str(x)`` lower-cased with surrounding whitespace removed.

    Parameters
    ----------
    x : Any
        Value to normalize; it is converted with ``str()`` first, so
        non-string inputs (numbers, ``None``) yield their string form.

    Returns
    -------
    Any
        The normalized string, or ``x`` unchanged if the conversion raises
        an ordinary exception.
    """
    try:
        var_out = str(x).lower().strip()
    except Exception:
        # Only ordinary errors fall back to the raw value; KeyboardInterrupt
        # and SystemExit must propagate so a long .apply() can be interrupted.
        var_out = x
    return var_out

In [5]:
# Matches runs of letters from the listed alphabet (a-z plus the dotless "ı"),
# case-insensitively. Digits, punctuation and whitespace are not matched.
en = re.compile(r"[abcdefghıijklmnopqrstxuvwyz]+", re.IGNORECASE | re.UNICODE)  # English


def clean_text(text):
    """Keep only the alphabetic runs of ``text``, joined by single spaces.

    Everything the ``en`` pattern does not match (digits, punctuation, extra
    whitespace) is dropped; letter case is left as-is.
    """
    return " ".join(en.findall(text))

#### Microsoft Coco Data

In [15]:
# Load the VQA v2 open-ended question file (val2014 split).
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding avoids depending on the platform's locale default.
with open('/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Coco/v2_OpenEnded_mscoco_val2014_questions.json', encoding="utf-8") as questions_file:
    coco_questions_data = json.load(questions_file)
# NOTE(review): echoing the whole payload produces a very large cell output;
# consider showing coco_questions_data.keys() instead.
coco_questions_data

{'info': {'description': 'This is v2.0 of the VQA dataset.',
  'url': 'http://visualqa.org',
  'version': '2.0',
  'year': 2017,
  'contributor': 'VQA Team',
  'date_created': '2017-04-26 17:00:44'},
 'task_type': 'Open-Ended',
 'data_type': 'mscoco',
 'license': {'url': 'http://creativecommons.org/licenses/by/4.0/',
  'name': 'Creative Commons Attribution 4.0 International License'},
 'data_subtype': 'val2014',
 'questions': [{'image_id': 262148,
   'question': 'Where is he looking?',
   'question_id': 262148000},
  {'image_id': 262148,
   'question': 'What are the people in the background doing?',
   'question_id': 262148001},
  {'image_id': 262148,
   'question': 'What is he on top of?',
   'question_id': 262148002},
  {'image_id': 393225,
   'question': 'What website copyrighted the picture?',
   'question_id': 393225000},
  {'image_id': 393225,
   'question': 'Is this a creamy soup?',
   'question_id': 393225001},
  {'image_id': 393225,
   'question': 'Is this rice noodle soup?',


In [16]:
# Tabulate the records stored under the "questions" key, one row per question.
df_coco_questions = pd.DataFrame(data=coco_questions_data["questions"])
df_coco_questions

Unnamed: 0,image_id,question,question_id
0,262148,Where is he looking?,262148000
1,262148,What are the people in the background doing?,262148001
2,262148,What is he on top of?,262148002
3,393225,What website copyrighted the picture?,393225000
4,393225,Is this a creamy soup?,393225001
...,...,...,...
214349,393212,What is the main color in the photo?,393212000
214350,393212,What is the meaning of this sign?,393212001
214351,393212,What is on the sign?,393212002
214352,393212,Does the arrow point left or right?,393212003


In [17]:
# Load the VQA v2 answer annotations (val2014 split).
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding avoids depending on the platform's locale default.
with open('/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Coco/v2_mscoco_val2014_annotations.json', encoding="utf-8") as annotations_file:
    coco_annotations_data = json.load(annotations_file)
# NOTE(review): echoing the whole payload produces a very large cell output;
# consider showing coco_annotations_data.keys() instead.
coco_annotations_data

{'info': {'description': 'This is v2.0 of the VQA dataset.',
  'url': 'http://visualqa.org',
  'version': '2.0',
  'year': 2017,
  'contributor': 'VQA Team',
  'date_created': '2017-04-26 17:00:44'},
 'license': {'url': 'http://creativecommons.org/licenses/by/4.0/',
  'name': 'Creative Commons Attribution 4.0 International License'},
 'data_subtype': 'val2014',
 'annotations': [{'question_type': 'none of the above',
   'multiple_choice_answer': 'down',
   'answers': [{'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 1},
    {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 2},
    {'answer': 'at table', 'answer_confidence': 'yes', 'answer_id': 3},
    {'answer': 'skateboard', 'answer_confidence': 'yes', 'answer_id': 4},
    {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 5},
    {'answer': 'table', 'answer_confidence': 'yes', 'answer_id': 6},
    {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 7},
    {'answer': 'down', 'answer_confidence': '

In [18]:
# Tabulate the annotation records and discard the columns not needed for the
# question/answer join, leaving multiple_choice_answer, image_id and question_id.
df_coco_annotations = pd.DataFrame(coco_annotations_data["annotations"]).drop(
    columns=["question_type", "answers", "answer_type"]
)
df_coco_annotations

Unnamed: 0,multiple_choice_answer,image_id,question_id
0,down,262148,262148000
1,watching,262148,262148001
2,picnic table,262148,262148002
3,foodiebakercom,393225,393225000
4,no,393225,393225001
...,...,...,...
214349,green,393212,393212000
214350,go left,393212,393212001
214351,arrow,393212,393212002
214352,left,393212,393212003


In [26]:
# Load the COCO 2017 training-split instances file.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding avoids depending on the platform's locale default.
with open('/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/ML Train Coco/instances_train2017.json', encoding="utf-8") as instances_train_file:
    coco_instances_train_data = json.load(instances_train_file)
# NOTE(review): echoing the whole payload produces a very large cell output;
# consider showing coco_instances_train_data.keys() instead.
coco_instances_train_data

{'info': {'description': 'COCO 2017 Dataset',
  'url': 'http://cocodataset.org',
  'version': '1.0',
  'year': 2017,
  'contributor': 'COCO Consortium',
  'date_created': '2017/09/01'},
 'licenses': [{'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
   'id': 1,
   'name': 'Attribution-NonCommercial-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nc/2.0/',
   'id': 2,
   'name': 'Attribution-NonCommercial License'},
  {'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/',
   'id': 3,
   'name': 'Attribution-NonCommercial-NoDerivs License'},
  {'url': 'http://creativecommons.org/licenses/by/2.0/',
   'id': 4,
   'name': 'Attribution License'},
  {'url': 'http://creativecommons.org/licenses/by-sa/2.0/',
   'id': 5,
   'name': 'Attribution-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nd/2.0/',
   'id': 6,
   'name': 'Attribution-NoDerivs License'},
  {'url': 'http://flickr.com/commons/usage/',
   'id': 7,
   'name': 'No kn

In [27]:
# Tabulate the per-image metadata stored under the "images" key (training split).
df_coco_instances_train = pd.DataFrame(data=coco_instances_train_data["images"])
df_coco_instances_train

Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,id
0,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,2013-11-14 11:18:45,http://farm9.staticflickr.com/8186/8119368305_...,391895
1,4,000000522418.jpg,http://images.cocodataset.org/train2017/000000...,480,640,2013-11-14 11:38:44,http://farm1.staticflickr.com/1/127244861_ab0c...,522418
2,3,000000184613.jpg,http://images.cocodataset.org/train2017/000000...,336,500,2013-11-14 12:36:29,http://farm3.staticflickr.com/2169/2118578392_...,184613
3,3,000000318219.jpg,http://images.cocodataset.org/train2017/000000...,640,556,2013-11-14 13:02:53,http://farm5.staticflickr.com/4125/5094763076_...,318219
4,3,000000554625.jpg,http://images.cocodataset.org/train2017/000000...,640,426,2013-11-14 16:03:19,http://farm5.staticflickr.com/4086/5094162993_...,554625
...,...,...,...,...,...,...,...,...
118282,1,000000444010.jpg,http://images.cocodataset.org/train2017/000000...,480,640,2013-11-25 14:46:11,http://farm4.staticflickr.com/3697/9303670993_...,444010
118283,3,000000565004.jpg,http://images.cocodataset.org/train2017/000000...,427,640,2013-11-25 19:59:30,http://farm2.staticflickr.com/1278/4677568591_...,565004
118284,3,000000516168.jpg,http://images.cocodataset.org/train2017/000000...,480,640,2013-11-25 21:03:34,http://farm3.staticflickr.com/2379/2293730995_...,516168
118285,4,000000547503.jpg,http://images.cocodataset.org/train2017/000000...,375,500,2013-11-25 21:20:21,http://farm1.staticflickr.com/178/423174638_1c...,547503


In [28]:
# Load the COCO 2017 validation-split instances file.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding avoids depending on the platform's locale default.
with open('/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/ML Train Coco/instances_val2017.json', encoding="utf-8") as instances_val_file:
    coco_instances_val_data = json.load(instances_val_file)
# NOTE(review): echoing the whole payload produces a very large cell output;
# consider showing coco_instances_val_data.keys() instead.
coco_instances_val_data

{'info': {'description': 'COCO 2017 Dataset',
  'url': 'http://cocodataset.org',
  'version': '1.0',
  'year': 2017,
  'contributor': 'COCO Consortium',
  'date_created': '2017/09/01'},
 'licenses': [{'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
   'id': 1,
   'name': 'Attribution-NonCommercial-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nc/2.0/',
   'id': 2,
   'name': 'Attribution-NonCommercial License'},
  {'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/',
   'id': 3,
   'name': 'Attribution-NonCommercial-NoDerivs License'},
  {'url': 'http://creativecommons.org/licenses/by/2.0/',
   'id': 4,
   'name': 'Attribution License'},
  {'url': 'http://creativecommons.org/licenses/by-sa/2.0/',
   'id': 5,
   'name': 'Attribution-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nd/2.0/',
   'id': 6,
   'name': 'Attribution-NoDerivs License'},
  {'url': 'http://flickr.com/commons/usage/',
   'id': 7,
   'name': 'No kn

In [29]:
# Tabulate the per-image metadata stored under the "images" key (validation split).
df_coco_instances_val = pd.DataFrame(data=coco_instances_val_data["images"])
df_coco_instances_val

Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,id
0,4,000000397133.jpg,http://images.cocodataset.org/val2017/00000039...,427,640,2013-11-14 17:02:52,http://farm7.staticflickr.com/6116/6255196340_...,397133
1,1,000000037777.jpg,http://images.cocodataset.org/val2017/00000003...,230,352,2013-11-14 20:55:31,http://farm9.staticflickr.com/8429/7839199426_...,37777
2,4,000000252219.jpg,http://images.cocodataset.org/val2017/00000025...,428,640,2013-11-14 22:32:02,http://farm4.staticflickr.com/3446/3232237447_...,252219
3,1,000000087038.jpg,http://images.cocodataset.org/val2017/00000008...,480,640,2013-11-14 23:11:37,http://farm8.staticflickr.com/7355/8825114508_...,87038
4,6,000000174482.jpg,http://images.cocodataset.org/val2017/00000017...,388,640,2013-11-14 23:16:55,http://farm8.staticflickr.com/7020/6478877255_...,174482
...,...,...,...,...,...,...,...,...
4995,3,000000512403.jpg,http://images.cocodataset.org/val2017/00000051...,640,529,2013-11-24 05:12:53,http://farm1.staticflickr.com/143/350452845_fa...,512403
4996,4,000000168974.jpg,http://images.cocodataset.org/val2017/00000016...,500,375,2013-11-24 07:19:48,http://farm3.staticflickr.com/2360/2063838083_...,168974
4997,1,000000552775.jpg,http://images.cocodataset.org/val2017/00000055...,500,375,2013-11-24 10:38:31,http://farm4.staticflickr.com/3136/3106037881_...,552775
4998,3,000000394940.jpg,http://images.cocodataset.org/val2017/00000039...,640,426,2013-11-24 13:47:05,http://farm9.staticflickr.com/8227/8566023505_...,394940


In [36]:
# Stack the train and val image tables, rename "id" to "image_id" so it lines
# up with the caption table's key, and drop the capture timestamp.
# The original row labels of each split are kept (the index is not reset).
df_coco_instances = (
    pd.concat([df_coco_instances_train, df_coco_instances_val], axis=0)
    .rename(columns={"id": "image_id"})
    .drop("date_captured", axis=1)
)
df_coco_instances

Unnamed: 0,license,file_name,coco_url,height,width,flickr_url,image_id
0,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,http://farm9.staticflickr.com/8186/8119368305_...,391895
1,4,000000522418.jpg,http://images.cocodataset.org/train2017/000000...,480,640,http://farm1.staticflickr.com/1/127244861_ab0c...,522418
2,3,000000184613.jpg,http://images.cocodataset.org/train2017/000000...,336,500,http://farm3.staticflickr.com/2169/2118578392_...,184613
3,3,000000318219.jpg,http://images.cocodataset.org/train2017/000000...,640,556,http://farm5.staticflickr.com/4125/5094763076_...,318219
4,3,000000554625.jpg,http://images.cocodataset.org/train2017/000000...,640,426,http://farm5.staticflickr.com/4086/5094162993_...,554625
...,...,...,...,...,...,...,...
4995,3,000000512403.jpg,http://images.cocodataset.org/val2017/00000051...,640,529,http://farm1.staticflickr.com/143/350452845_fa...,512403
4996,4,000000168974.jpg,http://images.cocodataset.org/val2017/00000016...,500,375,http://farm3.staticflickr.com/2360/2063838083_...,168974
4997,1,000000552775.jpg,http://images.cocodataset.org/val2017/00000055...,500,375,http://farm4.staticflickr.com/3136/3106037881_...,552775
4998,3,000000394940.jpg,http://images.cocodataset.org/val2017/00000039...,640,426,http://farm9.staticflickr.com/8227/8566023505_...,394940


In [31]:
# Load the COCO 2017 training-split captions file.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding avoids depending on the platform's locale default.
with open('/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/ML Train Coco/captions_train2017.json', encoding="utf-8") as captions_train_file:
    coco_captions_train_data = json.load(captions_train_file)
# NOTE(review): echoing the whole payload produces a very large cell output;
# consider showing coco_captions_train_data.keys() instead.
coco_captions_train_data

{'info': {'description': 'COCO 2017 Dataset',
  'url': 'http://cocodataset.org',
  'version': '1.0',
  'year': 2017,
  'contributor': 'COCO Consortium',
  'date_created': '2017/09/01'},
 'licenses': [{'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
   'id': 1,
   'name': 'Attribution-NonCommercial-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nc/2.0/',
   'id': 2,
   'name': 'Attribution-NonCommercial License'},
  {'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/',
   'id': 3,
   'name': 'Attribution-NonCommercial-NoDerivs License'},
  {'url': 'http://creativecommons.org/licenses/by/2.0/',
   'id': 4,
   'name': 'Attribution License'},
  {'url': 'http://creativecommons.org/licenses/by-sa/2.0/',
   'id': 5,
   'name': 'Attribution-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nd/2.0/',
   'id': 6,
   'name': 'Attribution-NoDerivs License'},
  {'url': 'http://flickr.com/commons/usage/',
   'id': 7,
   'name': 'No kn

In [32]:
# Tabulate the caption records stored under the "annotations" key (training split).
df_coco_captions_train = pd.DataFrame(data=coco_captions_train_data["annotations"])
df_coco_captions_train

Unnamed: 0,image_id,id,caption
0,203564,37,A bicycle replica with a clock as the front wh...
1,322141,49,A room with blue walls and a white sink and door.
2,16977,89,A car that seems to be parked illegally behind...
3,106140,98,A large passenger airplane flying through the ...
4,106140,101,There is a GOL plane taking off in a partly cl...
...,...,...,...
591748,133071,829655,a slice of bread is covered with a sour cream ...
591749,410182,829658,A long plate hold some fries with some sliders...
591750,180285,829665,Two women sit and pose with stuffed animals.
591751,133071,829693,White Plate with a lot of guacamole and an ext...


In [33]:
# Load the COCO 2017 validation-split captions file.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding avoids depending on the platform's locale default.
with open('/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/ML Train Coco/captions_val2017.json', encoding="utf-8") as captions_val_file:
    coco_captions_val_data = json.load(captions_val_file)
# NOTE(review): echoing the whole payload produces a very large cell output;
# consider showing coco_captions_val_data.keys() instead.
coco_captions_val_data

{'info': {'description': 'COCO 2017 Dataset',
  'url': 'http://cocodataset.org',
  'version': '1.0',
  'year': 2017,
  'contributor': 'COCO Consortium',
  'date_created': '2017/09/01'},
 'licenses': [{'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/',
   'id': 1,
   'name': 'Attribution-NonCommercial-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nc/2.0/',
   'id': 2,
   'name': 'Attribution-NonCommercial License'},
  {'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/',
   'id': 3,
   'name': 'Attribution-NonCommercial-NoDerivs License'},
  {'url': 'http://creativecommons.org/licenses/by/2.0/',
   'id': 4,
   'name': 'Attribution License'},
  {'url': 'http://creativecommons.org/licenses/by-sa/2.0/',
   'id': 5,
   'name': 'Attribution-ShareAlike License'},
  {'url': 'http://creativecommons.org/licenses/by-nd/2.0/',
   'id': 6,
   'name': 'Attribution-NoDerivs License'},
  {'url': 'http://flickr.com/commons/usage/',
   'id': 7,
   'name': 'No kn

In [34]:
# Tabulate the caption records stored under the "annotations" key (validation split).
df_coco_captions_val = pd.DataFrame(data=coco_captions_val_data["annotations"])
df_coco_captions_val

Unnamed: 0,image_id,id,caption
0,179765,38,A black Honda motorcycle parked in front of a ...
1,179765,182,A Honda motorcycle parked in a grass driveway
2,190236,401,An office cubicle with four different types of...
3,331352,441,A small closed toilet in a cramped space.
4,517069,447,Two women waiting at a bench next to a street.
...,...,...,...
25009,9590,821635,A group of men sipping drinks and talking at a...
25010,84664,822557,"A plate of food with some eggs, potatoes, brea..."
25011,331569,824852,The strawberries was sitting beside the tall g...
25012,231237,825902,A bunch of small red flowers in a barnacle enc...


In [39]:
# Stack the train and val caption tables and drop the caption's own "id",
# leaving image_id and caption. Row labels of each split are kept as-is.
df_coco_captions = (
    pd.concat([df_coco_captions_train, df_coco_captions_val], axis=0)
    .drop("id", axis=1)
)
df_coco_captions

Unnamed: 0,image_id,caption
0,203564,A bicycle replica with a clock as the front wh...
1,322141,A room with blue walls and a white sink and door.
2,16977,A car that seems to be parked illegally behind...
3,106140,A large passenger airplane flying through the ...
4,106140,There is a GOL plane taking off in a partly cl...
...,...,...
25009,9590,A group of men sipping drinks and talking at a...
25010,84664,"A plate of food with some eggs, potatoes, brea..."
25011,331569,The strawberries was sitting beside the tall g...
25012,231237,A bunch of small red flowers in a barnacle enc...


##### Question Answers Data Analysis

In [20]:
# Pair every question with its annotation on (image_id, question_id) and
# discard fully duplicated rows.
df_coco_question_answers = df_coco_questions.merge(
    df_coco_annotations,
    how="inner",
    on=["image_id", "question_id"],
).drop_duplicates()
df_coco_question_answers

Unnamed: 0,image_id,question,question_id,multiple_choice_answer
0,262148,Where is he looking?,262148000,down
1,262148,What are the people in the background doing?,262148001,watching
2,262148,What is he on top of?,262148002,picnic table
3,393225,What website copyrighted the picture?,393225000,foodiebakercom
4,393225,Is this a creamy soup?,393225001,no
...,...,...,...,...
214349,393212,What is the main color in the photo?,393212000,green
214350,393212,What is the meaning of this sign?,393212001,go left
214351,393212,What is on the sign?,393212002,arrow
214352,393212,Does the arrow point left or right?,393212003,left


In [21]:
# Lower-case and strip both text columns.
for text_column in ("question", "multiple_choice_answer"):
    df_coco_question_answers[text_column] = df_coco_question_answers[text_column].apply(lower_strip_func)
df_coco_question_answers

Unnamed: 0,image_id,question,question_id,multiple_choice_answer
0,262148,where is he looking?,262148000,down
1,262148,what are the people in the background doing?,262148001,watching
2,262148,what is he on top of?,262148002,picnic table
3,393225,what website copyrighted the picture?,393225000,foodiebakercom
4,393225,is this a creamy soup?,393225001,no
...,...,...,...,...
214349,393212,what is the main color in the photo?,393212000,green
214350,393212,what is the meaning of this sign?,393212001,go left
214351,393212,what is on the sign?,393212002,arrow
214352,393212,does the arrow point left or right?,393212003,left


In [22]:
# Reduce both text columns to their alphabetic words (punctuation and digits removed).
for text_column in ("question", "multiple_choice_answer"):
    df_coco_question_answers[text_column] = df_coco_question_answers[text_column].apply(clean_text)
df_coco_question_answers

Unnamed: 0,image_id,question,question_id,multiple_choice_answer
0,262148,where is he looking,262148000,down
1,262148,what are the people in the background doing,262148001,watching
2,262148,what is he on top of,262148002,picnic table
3,393225,what website copyrighted the picture,393225000,foodiebakercom
4,393225,is this a creamy soup,393225001,no
...,...,...,...,...
214349,393212,what is the main color in the photo,393212000,green
214350,393212,what is the meaning of this sign,393212001,go left
214351,393212,what is on the sign,393212002,arrow
214352,393212,does the arrow point left or right,393212003,left


In [23]:
# Write the cleaned question/answer table to the working directory, without the index column.
df_coco_question_answers.to_csv(
    path_or_buf="Microsoft_Coco_Question_Answers_Analysis.csv",
    index=False,
)

##### Captions Instances Data Analysis

In [40]:
# Re-display the combined train+val caption table before it is joined with the image metadata.
df_coco_captions

Unnamed: 0,image_id,caption
0,203564,A bicycle replica with a clock as the front wh...
1,322141,A room with blue walls and a white sink and door.
2,16977,A car that seems to be parked illegally behind...
3,106140,A large passenger airplane flying through the ...
4,106140,There is a GOL plane taking off in a partly cl...
...,...,...
25009,9590,A group of men sipping drinks and talking at a...
25010,84664,"A plate of food with some eggs, potatoes, brea..."
25011,331569,The strawberries was sitting beside the tall g...
25012,231237,A bunch of small red flowers in a barnacle enc...


In [41]:
# Re-display the combined train+val image metadata table that the captions are joined against.
df_coco_instances

Unnamed: 0,license,file_name,coco_url,height,width,flickr_url,image_id
0,3,000000391895.jpg,http://images.cocodataset.org/train2017/000000...,360,640,http://farm9.staticflickr.com/8186/8119368305_...,391895
1,4,000000522418.jpg,http://images.cocodataset.org/train2017/000000...,480,640,http://farm1.staticflickr.com/1/127244861_ab0c...,522418
2,3,000000184613.jpg,http://images.cocodataset.org/train2017/000000...,336,500,http://farm3.staticflickr.com/2169/2118578392_...,184613
3,3,000000318219.jpg,http://images.cocodataset.org/train2017/000000...,640,556,http://farm5.staticflickr.com/4125/5094763076_...,318219
4,3,000000554625.jpg,http://images.cocodataset.org/train2017/000000...,640,426,http://farm5.staticflickr.com/4086/5094162993_...,554625
...,...,...,...,...,...,...,...
4995,3,000000512403.jpg,http://images.cocodataset.org/val2017/00000051...,640,529,http://farm1.staticflickr.com/143/350452845_fa...,512403
4996,4,000000168974.jpg,http://images.cocodataset.org/val2017/00000016...,500,375,http://farm3.staticflickr.com/2360/2063838083_...,168974
4997,1,000000552775.jpg,http://images.cocodataset.org/val2017/00000055...,500,375,http://farm4.staticflickr.com/3136/3106037881_...,552775
4998,3,000000394940.jpg,http://images.cocodataset.org/val2017/00000039...,640,426,http://farm9.staticflickr.com/8227/8566023505_...,394940


In [42]:
# Attach image metadata to every caption via image_id and discard fully
# duplicated rows.
df_coco_captions_instances = df_coco_captions.merge(
    df_coco_instances,
    how="inner",
    on="image_id",
).drop_duplicates()
df_coco_captions_instances

Unnamed: 0,image_id,caption,license,file_name,coco_url,height,width,flickr_url
0,203564,A bicycle replica with a clock as the front wh...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
1,203564,The bike has a clock as a tire.,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
2,203564,A black metal bicycle with a clock inside the ...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
3,203564,A bicycle figurine in which the front wheel is...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
4,203564,A clock with the appearance of the wheel of a ...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
...,...,...,...,...,...,...,...,...
616762,537802,Three teddy bears sit on a fake sled in fake s...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616763,537802,a picture of stuffed animals on a sled in a fa...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616764,537802,Three teddy bears sit in a sled in fake snow.,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616765,537802,Three stuffed bears wearing clothes riding on ...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...


In [43]:
# Lower-case and strip the caption text.
caption_lowered = df_coco_captions_instances["caption"].apply(lower_strip_func)
df_coco_captions_instances["caption"] = caption_lowered
df_coco_captions_instances

Unnamed: 0,image_id,caption,license,file_name,coco_url,height,width,flickr_url
0,203564,a bicycle replica with a clock as the front wh...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
1,203564,the bike has a clock as a tire.,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
2,203564,a black metal bicycle with a clock inside the ...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
3,203564,a bicycle figurine in which the front wheel is...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
4,203564,a clock with the appearance of the wheel of a ...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
...,...,...,...,...,...,...,...,...
616762,537802,three teddy bears sit on a fake sled in fake s...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616763,537802,a picture of stuffed animals on a sled in a fa...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616764,537802,three teddy bears sit in a sled in fake snow.,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616765,537802,three stuffed bears wearing clothes riding on ...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...


In [44]:
# Reduce the caption text to its alphabetic words (punctuation and digits removed).
caption_cleaned = df_coco_captions_instances["caption"].apply(clean_text)
df_coco_captions_instances["caption"] = caption_cleaned
df_coco_captions_instances

Unnamed: 0,image_id,caption,license,file_name,coco_url,height,width,flickr_url
0,203564,a bicycle replica with a clock as the front wheel,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
1,203564,the bike has a clock as a tire,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
2,203564,a black metal bicycle with a clock inside the ...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
3,203564,a bicycle figurine in which the front wheel is...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
4,203564,a clock with the appearance of the wheel of a ...,4,000000203564.jpg,http://images.cocodataset.org/train2017/000000...,400,400,http://farm8.staticflickr.com/7366/9643253026_...
...,...,...,...,...,...,...,...,...
616762,537802,three teddy bears sit on a fake sled in fake snow,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616763,537802,a picture of stuffed animals on a sled in a fa...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616764,537802,three teddy bears sit in a sled in fake snow,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...
616765,537802,three stuffed bears wearing clothes riding on ...,1,000000537802.jpg,http://images.cocodataset.org/val2017/00000053...,480,640,http://farm6.staticflickr.com/5216/5385022107_...


In [45]:
# Write the cleaned caption/image table to the working directory, without the index column.
df_coco_captions_instances.to_csv(
    path_or_buf="Microsoft_Coco_Captions_Instances_Analysis.csv",
    index=False,
)

#### Copy Move And Delete

In [46]:
# Collect the CSV exports written to the working directory by the cells above.
output_file = glob.glob("Microsoft_Coco_*_Analysis.csv")
output_file

['Microsoft_Coco_Question_Answers_Analysis.csv',
 'Microsoft_Coco_Captions_Instances_Analysis.csv']

In [47]:
# Copy each exported CSV into the result folder; copy2 also attempts to
# preserve file metadata.
for csv_name in output_file:
    shutil.copy2(csv_name, path)

In [48]:
# Delete the working-directory copies of the exported CSVs.
# Deletion stays best-effort (a failure does not stop the loop), but only
# filesystem errors are tolerated and each one is reported instead of being
# silently discarded.
for j in output_file:
    try:
        os.remove(j)
    except OSError as err:
        print(f"Could not remove {j}: {err}")