In [1]:
#necessary packages
import pandas as pd
import numpy as np

import os
import warnings

# Record and show the directory the kernel is currently running in.
working_directory = os.getcwd()
print(working_directory)

# Install a global "ignore" filter so warnings are not shown in cell outputs.
warnings.filterwarnings('ignore')

/content


In [2]:
# Colab helper used to attach Google Drive to the runtime's filesystem.
from google.colab import drive

# Mount Drive at /content/drive; the project files are read from under this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Point working_directory at the project folder under the Drive mount point,
# replacing the os.getcwd() value assigned in the first cell.
working_directory = '/content/drive/My Drive/project'

**Multimodal data**

The dataset originally has 59000 records, and extracting the images for that many records is computationally expensive. The dataset size is therefore reduced while loading the dataset, as shown below.

The "multimodal_test_public" file consists of various columns that describe each post, including the attached image and the title of the post.

In [54]:
# Full path of the TSV file inside the project folder.
path_train = f"{working_directory}/multimodal_test_public.tsv"

# Read only the first 3250 rows of the tab-separated file.
df_train = pd.read_csv(
    path_train,
    sep='\t',
    nrows=3250,
)

In [55]:
# Show the loaded frame as this cell's output.
df_train

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label
0,trustbytrust,stargazer,1.425139e+09,,True,cozywbv,http://i.imgur.com/BruWKDi.jpg,2xct9d,,3,psbattle_artwork,stargazer,,0,2,4
1,,yeah,1.438173e+09,,True,ctk61yw,http://i.imgur.com/JRZT727.jpg,3f0h7o,,2,psbattle_artwork,yeah,,0,2,4
2,chaseoes,pd phoenix car thief gets instructions from yo...,1.560492e+09,abc15.com,True,c0gl7r,https://external-preview.redd.it/1A2_4VwgS8Qd2...,,2.0,16,nottheonion,PD: Phoenix car thief gets instructions from Y...,0.89,1,0,0
3,SFepicure,as trump accuses iran he has one problem his o...,1.560606e+09,nytimes.com,True,c0xdqy,https://external-preview.redd.it/9BKRcgvaobpTo...,,4.0,45,neutralnews,"As Trump Accuses Iran, He Has One Problem: His...",0.78,1,0,0
4,fragments_from_Work,believers hezbollah,1.515139e+09,i.imgur.com,True,7o9rmx,https://external-preview.redd.it/rbwXHncnjVh51...,,40.0,285,propagandaposters,"""Believers"" - Hezbollah 2011",0.95,0,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3245,phil5or30d,toronto sunset just before a storm,1.388938e+09,flickr.com,True,1ugqlf,https://external-preview.redd.it/XBMRTI7gE-pKO...,,1.0,14,pic,Toronto sunset just before a storm,0.79,1,0,0
3246,Winkie1,sing it,1.373651e+09,,True,cb1aome,http://i.imgur.com/5vkuQ8q.jpg,1i5917,,19,psbattle_artwork,Sing it,,0,2,4
3247,Dawgmeat9,time to give someone else a turn,1.421533e+09,,True,cns9pd3,http://i.imgur.com/Ts1Z1MO.jpg,2srqvt,,3,psbattle_artwork,Time to give someone else a turn,,0,2,4
3248,Germomics,this sleeping snowy owl,1.559673e+09,i.redd.it,True,bws81n,https://preview.redd.it/fmt73fgrvd231.jpg?widt...,,9.0,79,photoshopbattles,PsBattle: this sleeping snowy owl,0.93,1,0,0


**Data Cleaning**

Initially, the dataset has unnecessary information and features that do not help improve the performance of the model. Such features and noise are removed, while also checking for duplicates and rows with "nan" values.

In [56]:
def clean_data(data):
    """Return a cleaned copy of the posts frame without modifying the input.

    Steps, in order:
      1. Fill missing ``clean_title`` values with the corresponding ``title``
         (``clean_title`` is a normalised version of ``title``).
      2. Drop fully duplicated rows; prints "No duplicates found" when there
         are none.
      3. Drop rows that have no ``image_url`` and renumber the index from 0.
      4. Keep only the columns used by the later cells.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``id``, ``image_url``,
        ``clean_title``, ``title``, ``2_way_label`` and ``hasImage``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``id``, ``image_url``, ``clean_title``,
        ``2_way_label`` and ``hasImage``.
    """
    keep_columns = ['id', 'image_url', 'clean_title', '2_way_label', 'hasImage']

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    filled_title = data['title'].where(data['clean_title'].isna(), data['clean_title'])
    cleaned = data.assign(clean_title=filled_title)

    # Duplicates are checked after the fill, on all columns.
    if cleaned.duplicated().any():
        cleaned = cleaned.drop_duplicates()
    else:
        print("No duplicates found")

    # Rows without an image URL cannot be downloaded; reset the index after dropping them.
    cleaned = cleaned.dropna(subset=['image_url']).reset_index(drop=True)

    # Return an independent copy so later assignments on the result do not
    # raise SettingWithCopyWarning.
    return cleaned[keep_columns].copy()

In [57]:
# Replace the raw frame with the output of clean_data (defined in the previous cell).
df_train = clean_data(df_train)

No duplicates found


In [58]:
# Switch into the project folder so the relative paths used later
# ("images", "multimodal_text") resolve inside it. Reuses working_directory
# instead of repeating the hard-coded path, and avoids relying on automagic.
os.chdir(working_directory)
print(os.getcwd())

/content/drive/My Drive/project


**Image downloading**

The multimodal dataset has an "image_url" feature that contains the URL from which each image can be accessed. The urllib library is used to download the images as follows.

1. Imports urllib.request.
2. Checks whether a folder with the suggested name exists in the current directory; otherwise creates a new folder with the given name.
3. Iterates over each row in the dataframe to extract the URL of that row.
4. From there, assigns the URL to a variable.
5. If "hasImage" is true and "image_url" is not null, stores the image with "id".jpg as its name.




In [60]:
import pandas as pd
import os
from tqdm import tqdm as tqdm
import urllib.request
import numpy as np
import sys

# Blank out missing values so the URL check below can compare against "".
df_train = df_train.fillna('')

# exist_ok makes the cell safe to re-run when the folder is already there.
os.makedirs("images", exist_ok=True)

# Ids whose download raised an error, kept so failures can be inspected afterwards.
failed_ids = []

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(total=len(df_train)) as pbar:
    for _, row in df_train.iterrows():
        # Explicit comparison with True: the cell may hold '' instead of a bool after fillna.
        if row["hasImage"] == True and row["image_url"] not in ("", "nan"):
            image_url = row["image_url"]
            target_path = os.path.join("images", row["id"] + ".jpg")
            try:
                urllib.request.urlretrieve(image_url, target_path)
            except Exception:
                # Best effort: a dead or invalid link must not stop the run, but
                # catching Exception (not a bare except) keeps Ctrl-C / kernel
                # interrupt working during the long download.
                failed_ids.append(row["id"])
        pbar.update(1)

if failed_ids:
    print(f"{len(failed_ids)} downloads failed")
print("done")

100%|██████████| 50/50 [02:02<00:00,  2.44s/it]
100%|█████████▉| 3240/3242 [04:12<00:00,  9.07it/s]

done


In [62]:
# Keep only the post id, the cleaned title and the binary label, then show the result.
df_train = df_train.loc[:, ['id', 'clean_title', '2_way_label']]
df_train

Unnamed: 0,id,clean_title,2_way_label
0,cozywbv,stargazer,0
1,ctk61yw,yeah,0
2,c0gl7r,pd phoenix car thief gets instructions from yo...,1
3,c0xdqy,as trump accuses iran he has one problem his o...,1
4,7o9rmx,believers hezbollah,0
...,...,...,...
3237,1ugqlf,toronto sunset just before a storm,1
3238,cb1aome,sing it,0
3239,cns9pd3,time to give someone else a turn,0
3240,bws81n,this sleeping snowy owl,1


In [63]:
# Count how many rows carry each value of the 2-way label.
df_train['2_way_label'].value_counts()

0    1939
1    1303
Name: 2_way_label, dtype: int64

In [64]:
import pickle

# Persist the id / clean_title / label frame; the relative file name resolves
# against the current working directory.
df_train.to_pickle("multimodal_text")