# Image Captioning Generator

In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!python3 -m pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable


In [3]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor,AutoTokenizer
import torch
from PIL import Image

2023-05-30 11:23:20.907997: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the pretrained image-captioning encoder-decoder model by its checkpoint name.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [5]:
# Load the image preprocessor for the same checkpoint; predict_step uses it to
# turn PIL images into the pixel tensors passed to the model.
# NOTE(review): newer transformers releases reportedly deprecate ViTFeatureExtractor
# in favor of ViTImageProcessor — confirm against the installed version.
feature_extractor=ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")



In [6]:
# Load the tokenizer for the same checkpoint; predict_step uses it to decode
# generated token ids back into caption text.
tokenizer=AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [7]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Move the model's weights onto the chosen device (the cell displays the model).
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, 

In [8]:
# Generation settings forwarded to model.generate() by predict_step.
max_length = 256  # upper bound on the generated sequence length
num_beams = 4     # beam-search width
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

In [9]:
def predict_step(image_paths):
    """Generate one caption per image file.

    Args:
        image_paths: An iterable of paths to image files, or a single path
            given as a string.

    Returns:
        A list of caption strings, stripped of surrounding whitespace, in the
        same order as ``image_paths``. An empty input yields an empty list.

    Side effects:
        Opens the last image of the batch in an image viewer via ``show()``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(image_paths, str):
        image_paths = [image_paths]
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to caption; skip preprocessing and generation entirely.
        return []

    images = []
    for image_path in image_paths:
        # The context manager closes the underlying file handle promptly;
        # convert()/copy() hand back a fully loaded, independent image.
        with Image.open(image_path) as opened:
            if opened.mode != "RGB":
                images.append(opened.convert(mode="RGB"))
            else:
                images.append(opened.copy())

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]

    # Preview the last image of the batch.
    images[-1].show()
    return preds

In [10]:
# Caption a single local image as a quick check of the pipeline.
predict_step(['dog.jpg'])

['a brown and white dog laying in the grass']

In [11]:
ls

 [0m[01;35mdog.jpg[0m   [01;35mImage1.png[0m   [01;35mImage2.png[0m   [01;35mImage3.png[0m  'Img Caption Generator.ipynb'


In [12]:

from transformers import pipeline

# Same checkpoint, driven through the high-level pipeline API on a remote image.
image_to_text = pipeline(task="image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
image_to_text("https://ankur3107.github.io/assets/images/image-captioning-example.png")



Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


[{'generated_text': 'a soccer game with a player jumping to catch the ball '}]

# Hashtag Generator

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK 'book' data collection.
# NOTE(review): only the stop-word list and the tokenizer are used below, so a
# narrower download may be enough — confirm which resources are required.
nltk.download('book')

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/divum/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to /home/divum/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /home/divum/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/divum/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /home/divum/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /home/divum/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_treebank to
[nltk_data]    |

True

In [24]:
# Caption the sample image and keep the first (only) caption string.
l = predict_step(['dog.jpg'])[0]


In [26]:
# Turn the caption `l` into hashtags: tokenize it, drop English stop words and
# punctuation-only tokens, then print every remaining word prefixed with '#'.
stop_words=set(stopwords.words('english'))
word_tokens=word_tokenize(l)
# Compare in lowercase so capitalized stop words ("A", "The") are also dropped,
# and keep only tokens containing a letter or digit so that ".", ",", "!" and
# similar tokens never become hashtags (not just the first ".").
filtered_sentence=[
    w for w in word_tokens
    if w.lower() not in stop_words and any(ch.isalnum() for ch in w)
]
x=len(filtered_sentence)  # number of hashtags produced
for i in filtered_sentence:
    print("#"+i,end=" ")

#brown #white #dog #laying #grass 

# Question Answering using Hugging Face

In [27]:
from transformers import pipeline
import pandas as pd

In [None]:
# Build the table question-answering pipeline. The final cell calls `tqa`,
# so this must run for the notebook to work on a fresh kernel.
tqa = pipeline(task="table-question-answering",model="google/tapas-base-finetuned-wtq")

In [29]:
# Read the meat-consumption dataset from a CSV file in the working directory.
table=pd.read_csv("meat_consumption.csv")

In [30]:
# Cast every cell to str.
# NOTE(review): presumably because the table-question-answering pipeline
# expects string cells — confirm against the pipeline documentation.
table = table.astype(str)

In [31]:
# Display the (string-typed) table to inspect its columns and a sample of rows.
table

Unnamed: 0,location,indicator,subject,measure,frequency,time,value
0,AUS,MEATCONSUMP,BEEF,KG_CAP,A,1990,4.1076362659e-06
1,AUS,MEATCONSUMP,BEEF,KG_CAP,A,1991,27.8084010802765
2,AUS,MEATCONSUMP,BEEF,KG_CAP,A,1992,26.2781655161047
3,AUS,MEATCONSUMP,BEEF,KG_CAP,A,1993,26.2444784528083
4,AUS,MEATCONSUMP,BEEF,KG_CAP,A,1994,25.5412444303129
...,...,...,...,...,...,...,...
12135,EU27,MEATCONSUMP,SHEEP,THND_TONNE,A,2024,732.149997986808
12136,EU27,MEATCONSUMP,SHEEP,THND_TONNE,A,2025,737.008238160185
12137,EU27,MEATCONSUMP,SHEEP,THND_TONNE,A,2026,741.104095445183
12138,EU27,MEATCONSUMP,SHEEP,THND_TONNE,A,2027,743.489548800371


In [35]:
# Ask the table question-answering pipeline a natural-language question and
# print only the answer text from its result.
query = "Which year has consumed highest meat ?"
print(
    tqa(query=query, table=table)["answer"]
)