# **1. Imports & drive mount**

In [None]:
import requests 
import shutil 
import pandas as pd
import io
import numpy as np
import cv2
import os
from PIL import Image

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **2. Load data to dataframe**

In [None]:
# Read the processed SuperRare artwork sheet from Drive; header=0 takes column names from the first row.
loaded_df = pd.read_excel("/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/SuperRareArtwork_processed.xlsx", header=0)

In [None]:
# Sanity check: (rows, columns) of the loaded sheet.
loaded_df.shape

(1424, 7)

In [None]:
# List the column names that the image / metadata / target frames are carved from below.
loaded_df.columns

Index(['sr_no', 'Name', 'List_Price', 'Artist', 'URL', 'Tags', 'Collected by'], dtype='object')

In [None]:
# Index rows by serial number; the download step later names each image file after this index value.
loaded_df.index = loaded_df['sr_no']

In [None]:
# Carve the sheet into three frames that all keep the sr_no index:
#   image_df  -> what remains after removing everything except the URL column
#   meta_df   -> what remains after removing everything except Artist and Tags
#   target_df -> what remains after removing everything except List_Price
image_df = loaded_df.drop(
    ['sr_no', 'Name', 'List_Price', 'Artist', 'Tags', 'Collected by'], axis=1
)
meta_df = loaded_df.drop(
    ['sr_no', 'Name', 'List_Price', 'URL', 'Collected by'], axis=1
)
target_df = loaded_df.drop(
    ['sr_no', 'Name', 'Artist', 'URL', 'Tags', 'Collected by'], axis=1
)

In [None]:
# Preview the first rows of the image, metadata and target frames, one after another.
print(image_df.head(), meta_df.head(), target_df.head(), sep='\n')

                                                     URL
sr_no                                                   
1      https://ipfs.pixura.io/ipfs/QmUGKFiPmEYpAvGrfM...
2      https://ipfs.pixura.io/ipfs/QmYHcL8iBJLpZGQjMy...
3      https://ipfs.pixura.io/ipfs/QmaoPaYM7cZrarwZzW...
4      https://ipfs.pixura.io/ipfs/QmZkqFSgXdRcH5Pc7P...
5      https://ipfs.pixura.io/ipfs/Qmf5wvkS1HbsMy4esj...
                 Artist                                               Tags
sr_no                                                                     
1          @placeofmany  #decades #80s #bones #anatomy #outrun #newwave...
2                @roses                       ##roses#roseart#flowers#rose
3              @chicago  #cryptoart #digitalart #iphonedrawing #iphonea...
4      @ornamentalhermi              ##NeuralStyle#AI#Painting#AdaLovelace
5           @videodrome  #Landscape #AI #Painting #Portrait #Generative...
       List_Price
sr_no            
1            15.0
2           200.0
3   

# **3. Download images from URL**

In [None]:
# Download every artwork image to Drive, naming each file after the row's sr_no index.
DOWNLOAD_TIMEOUT_S = 30  # seconds; without a timeout one stalled server would block the whole loop
IMAGE_PATH_TEMPLATE = '/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/SuperRare_Original_Processed/{}.png'

for i, image_url in image_df['URL'].items():
    filename = IMAGE_PATH_TEMPLATE.format(i)
    try:
        r = requests.get(image_url, stream=True, timeout=DOWNLOAD_TIMEOUT_S)
    except requests.RequestException as exc:
        # Skip this row. Falling through would either raise NameError (first row)
        # or reuse the previous row's response and save the wrong image under this sr_no.
        print('Image[{}]: request failed ({})'.format(i, exc))
        continue

    # The context manager releases the streamed connection once the body has been copied.
    with r:
        if r.status_code == 200:
            # Let urllib3 undo gzip/deflate transfer encoding so the raw bytes are the image itself.
            r.raw.decode_content = True
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        else:
            print('Image[{}]: Couldn\'t be retreived'.format(i))


# **4. Save images as Numpy arrays**

In [None]:
# Directory the download step wrote the artwork images into (one '<sr_no>.png' per row).
train_dir = '/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/SuperRare_Original_Processed/'

# Edge length in pixels; every image is resized to SIZE x SIZE.
SIZE = 224

# Accumulates one pixel array per successfully loaded image.
train_images = []

In [None]:
def _sr_no_sort_key(file_name):
    """Sort key that orders files by the integer in their stem ('12.png' -> 12).

    Names whose stem is not a plain integer are placed after all numeric ones,
    ordered alphabetically, so they never interleave with sr_no-named images.
    """
    stem = os.path.splitext(file_name)[0]
    if stem.isdecimal():
        return (0, int(stem), file_name)
    return (1, 0, file_name)


# os.listdir() returns entries in arbitrary order. Sort by sr_no so that row k of the
# image array lines up with row k of the price labels, which are built in
# target_df.index (sr_no) order.
images_path = sorted(os.listdir(train_dir), key=_sr_no_sort_key)

# Start from an empty list so re-running this cell does not append duplicates.
train_images = []
for image_name in images_path:
    try:
        # `with` closes the underlying file handle as soon as the pixels are copied out.
        with Image.open(os.path.join(train_dir, image_name)) as image:
            resized = image.resize((SIZE, SIZE)).convert("RGB")
            train_images.append(np.array(resized))
    except Exception as exc:
        # Best effort: report and skip unreadable files.
        # NOTE: every skipped image shifts later rows relative to the labels, so any
        # name printed here must be resolved before the arrays are paired.
        print(image_name, '-', exc)

In [None]:
# Stack the per-image arrays into one ndarray. Every entry was resized to (SIZE, SIZE)
# and converted to RGB above, so they share a shape. Note the name is rebound from list to array.
train_images = np.array(train_images)

In [None]:
# Confirm the stacked array's shape; the first axis is the number of images that loaded.
print(train_images.shape)

(1424, 224, 224, 3)


In [None]:
# Divide by 255.0 to map 8-bit RGB values into [0, 1]; true division yields a float array
# (a new array, so memory use grows well beyond the integer input). Then persist it to Drive.
train_images_norm = train_images / 255.0
np.save('/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/Numpy_224px_11122021/SuperRare_224px_norm_train.npy', train_images_norm)

# **5. Save prices as Numpy arrays**

In [None]:
# The target frame should have one row per artwork and a single List_Price column.
target_df.shape

(1424, 1)

In [None]:
# Peek at the first few prices, indexed by sr_no.
target_df.head()

Unnamed: 0_level_0,List_Price
sr_no,Unnamed: 1_level_1
1,15.0
2,200.0
3,3.8
4,0.1
5,100.0


In [None]:
# Summary statistics of List_Price (count, mean, std, quartiles, extremes).
target_df.describe()

Unnamed: 0,List_Price
count,1424.0
mean,43.206613
std,328.591801
min,0.01
25%,0.5
50%,1.75
75%,5.0
max,8888.888


In [None]:
# Count missing prices per column; the labels below assume there are none.
target_df.isna().sum()

List_Price    0
dtype: int64

In [None]:
# Mean list price at full precision.
target_df['List_Price'].mean()

43.20661314374995

In [None]:
# Map each sr_no to its list price rounded to 4 decimals, in target_df.index order.
train_labels = {
    sr_no: round(target_df['List_Price'][sr_no], 4)
    for sr_no in target_df.index
}

In [None]:
# Number of labelled rows; a count below the row count would mean duplicate sr_no keys collapsed.
len(train_labels)

1424

In [None]:
# Flatten the label dict's values (insertion order, i.e. target_df.index order) into a 1-D array.
train_labels_np = np.array( tuple(train_labels.values()) )

In [None]:
# Persist the full label vector next to the normalized image array.
np.save('/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/Numpy_224px_11122021/SuperRare_target_train.npy', train_labels_np)

In [None]:
# Images and labels must agree on their first dimension before splitting.
print(train_images_norm.shape, train_labels_np.shape, sep='\n')

(1424, 224, 224, 3)
(1424,)


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is identical on every run; without it the arrays saved below
# change each time the notebook is executed. Matches the seed used for the metadata split.
SPLIT_SEED = 42

# 80% train, then the remaining 20% is halved into validation and test (10% each).
X_train, X_rem, y_train, y_rem = train_test_split(
    train_images_norm, train_labels_np, train_size=0.8, random_state=SPLIT_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, train_size=0.5, random_state=SPLIT_SEED
)

In [None]:
# Shapes of each split, in X/y pairs: train, validation, test.
print(
    X_train.shape, y_train.shape,
    X_val.shape, y_val.shape,
    X_test.shape, y_test.shape,
    sep='\n',
)

(1139, 224, 224, 3)
(1139,)
(142, 224, 224, 3)
(142,)
(143, 224, 224, 3)
(143,)


In [None]:
# Write each split to Drive as '<name>_224.npy' inside the shared NumPy output folder.
numpy_out_dir = '/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/Numpy_224px_11122021/'
splits_to_save = (
    ('X_train_224.npy', X_train),
    ('y_train_224.npy', y_train),
    ('X_val_224.npy', X_val),
    ('y_val_224.npy', y_val),
    ('X_test_224.npy', X_test),
    ('y_test_224.npy', y_test),
)
for npy_name, split_array in splits_to_save:
    np.save(numpy_out_dir + npy_name, split_array)

# **6. Meta Data Processing**

In [None]:
# Rebuild meta_df, this time keeping List_Price alongside Artist and Tags
# (the earlier meta_df dropped the price), then print its schema and non-null counts.
meta_df = loaded_df.drop(columns= ['sr_no', 'Name', 'URL', 'Collected by'])
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1424 entries, 1 to 1424
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   List_Price  1424 non-null   float64
 1   Artist      1248 non-null   object 
 2   Tags        826 non-null    object 
dtypes: float64(1), object(2)
memory usage: 84.5+ KB


In [None]:
# Missing values per metadata column.
meta_df.isna().sum()

List_Price      0
Artist        176
Tags          598
dtype: int64

In [None]:
# Export the metadata to CSV; index=False leaves the sr_no index out of the file.
meta_df.to_csv('/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/MetaData_11122021/meta_df.csv',index=False)

In [None]:
# Load the processed metadata CSV.
# NOTE(review): 'meta_df_processed.csv' is not written anywhere in this notebook — presumably it is
# produced from meta_df.csv by an external/manual step; confirm and document that step.
# This also rebinds loaded_df, replacing the Excel-derived frame used in the sections above.
loaded_df = pd.read_csv("/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/MetaData_11122021/meta_df_processed.csv", header=0)

In [None]:
# Shape and column names of the processed metadata frame.
print(loaded_df.shape, loaded_df.columns, sep='\n')

(826, 3)
Index(['List_Price', 'Artist', 'Tags'], dtype='object')


In [None]:
# Features are what remains after removing the price; the target is what remains
# after removing the two feature columns.
X_meta = loaded_df.drop(['List_Price'], axis=1)
y_meta = loaded_df.drop(['Artist', 'Tags'], axis=1)

In [None]:
def convert(data):
    """Label-encode the 'Artist' and 'Tags' columns of ``data`` as integer codes.

    Each column is encoded independently: its distinct values are sorted and
    replaced by their position in that sorted order (0, 1, 2, ...), which is the
    same numbering a label encoder fitted on that column produces. Missing
    values are encoded as -1.

    Only these categorical feature columns need encoding; a numeric target such
    as List_Price is already usable as-is.

    Args:
        data: DataFrame containing 'Artist' and 'Tags' columns.

    Returns:
        The same DataFrame object, modified in place, with both columns replaced
        by integer codes.
    """
    for column in ('Artist', 'Tags'):
        # sort=True makes the codes follow sorted value order rather than first-seen order.
        codes, _ = pd.factorize(data[column], sort=True)
        data[column] = codes
    return data

# Encode the feature frame. convert() modifies its argument in place and returns it,
# so train_meta and X_meta refer to the same (now encoded) DataFrame.
train_meta=convert(X_meta)

In [None]:
# 80/20 train/test split of the encoded metadata; random_state pins the shuffle so the split is repeatable.
X_meta_train, X_meta_test, y_meta_train, y_meta_test = train_test_split(train_meta, y_meta, test_size=0.2, random_state=42)

In [None]:
# Write the four metadata splits to Drive as CSV (the row index is kept in each file).
meta_out_dir = '/content/drive/MyDrive/AI_Artathon/Phase_2/Models/Prediction_Model/Dataset/MetaData_11122021/'
meta_splits_to_save = (
    ('X_train.csv', X_meta_train),
    ('X_test.csv', X_meta_test),
    ('y_train.csv', y_meta_train),
    ('y_test.csv', y_meta_test),
)
for csv_name, split_frame in meta_splits_to_save:
    split_frame.to_csv(meta_out_dir + csv_name)