## Imports

**requests**          -> Used to receive content from a webpage.
<br>
**json**              -> Used to turn json files into dictionaries.
<br>
**PIL**               -> Used to turn file into image that can be resized.
<br>
**numpy**             -> Many uses, including turning a PIL Image into an array.
<br>
**pandas**            -> Used for dataframes and dataframe manipulation.
<br>
**matplotlib.pyplot** -> For plotting graphs.
<br>
**sklearn**           -> Used to split the data into train/test sets and to standardize the features.
<br>
**keras**             -> Used for the actual model.
<br>
**scripts**           -> Used to grab max likes on all photos from all of the users.

In [1]:
import requests
import json
from time import sleep

from PIL import Image

import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D

from scripts.get_users import GetUsers

%matplotlib inline

Using TensorFlow backend.


## Data Gathering

Using Scrapy's crawler, the data was gathered as XHR responses from multiple sources and users. All observations that came along with it were kept in case they are needed later. In addition, the maximum like count across all photos of each user was gathered in order to scale the original likes on the photos.

#### Grabs X and y from text

In [None]:
# Build one [username, small-image URL, like count] row per photo record
# found in the JSON dumps ./data/photos-1.txt ... ./data/photos-177.txt.
photos_info = []

for page in range(1, 178):
    with open(f'./data/photos-{page}.txt', 'r') as fh:
        records = json.loads(fh.read())

    photos_info.extend(
        [rec['user']['username'], rec['urls']['small'], rec['likes']]
        for rec in records
    )

# Persist the table so later cells can reload it without re-parsing the dumps
photos_df = pd.DataFrame(photos_info, columns=['user_url', 'small_url', 'likes'])
photos_df.to_csv('./datasets/user_url_likes.csv', index=False)
photos_df.head()

In [None]:
# Exploratory check: page through one user's photo listing and report the
# highest like count seen on any of that user's photos.
base_url = 'https://unsplash.com/napi/users/{}/photos?page={}&per_page=10&order_by=latest'

i = 1
likes = set()
while True:
    url = base_url.format('gallivantinglife', i)
    # Timeout so a stalled connection cannot hang the cell forever
    response = requests.get(url, timeout=30)
    # `return` is a SyntaxError outside a function; stop paging on 404 instead
    if response.status_code == 404:
        break

    json_file = json.loads(response.text)
    if len(json_file) == 0:
        # An empty page means the listing is exhausted
        print(f'broke at: {i}')
        break

    for item in json_file:
        print(item)
        print(item['likes'])
        print(i)
        likes.add(item['likes'])
    i += 1
    # Pause between requests to avoid hammering the endpoint
    sleep(1)

# No likes collected (404 or no photos) reports 0, without a bare `except`
print(max(likes, default=0))

In [None]:
# Preview the first column (usernames) from row position 1081 onward
photos_df.iloc[1081:].iloc[:, 0]

In [None]:
# Uses Requests (through the project-local GetUsers helper)
photos_df = pd.read_csv('./datasets/user_url_likes.csv')

# Start where left off: usernames from row position 1081 to the end
remaining_users = photos_df.iloc[1081:, 0]
users_list = remaining_users.tolist()

# NOTE(review): the three directory arguments are passed through as-is;
# their exact roles are defined in scripts/get_users.py -- confirm there.
get_users = GetUsers(users_list, 'user_errors/', 'user_files/', 'users_info_3/')
get_users.run()

## Get Mask Delete

In [None]:
# Find the photos whose owner has a non-positive value in the scraped user
# dictionaries, then (after confirmation) drop those rows from photos_df.
photos_df = pd.read_csv('./datasets/user_url_likes.csv')

# First json dictionary
file = './users_info/user_info.json'
with open(file, 'r') as f:
    stuff = json.load(f)

# Second json dictionary
file_2 = './users_info_2/user_info.json'
with open(file_2, 'r') as f:
    stuff_2 = json.load(f)

# Third json dictionary
file_3 = './users_info_3/user_info.json'
with open(file_3, 'r') as f:
    stuff_3 = json.load(f)

# Combine the dictionaries; on duplicate keys `stuff` overrides `stuff_2`,
# which in turn overrides `stuff_3`.
stuff_2.update(stuff)
stuff_3.update(stuff_2)

transformed_likes = stuff_3.copy()

# Get setup to delete any observation where profile no longer exists, i.e. value <= 0.
name_delete = {key for key, value in transformed_likes.items() if value <= 0}

# Declare the mask for user(s) with value 0
mask = photos_df['user_url'].isin(name_delete)

# Declare the index of mask
mask_delete = photos_df[mask].index

# Check and Drop
print('Sum Before:', mask.sum())
check = photos_df.drop(mask_delete)
print('Sum After:', check['user_url'].isin(name_delete).sum())
proceed = input("Do you want to continue?\n>>> ")
# Compare against whole answers: a substring test such as `answer in 'yes'`
# also accepts '', 'e' or 's', so just pressing Enter would confirm the drop.
if proceed.strip().lower() in ('y', 'yes'):
    photos_df = photos_df.drop(mask_delete)
else:
    print('Goodbye.')

In [None]:
# Persist the row labels selected for deletion so later cells can reload them
mask_delete_df = pd.DataFrame({'mask_delete': np.asarray(mask_delete)})
mask_delete_df.to_csv('datasets/mask_delete.csv', index=False)

In [None]:
# Look up each photo owner's value in the `transformed_likes` dictionary
# (presumably that user's max like count -- confirm against GetUsers output).
per_user_scale = photos_df['user_url'].map(transformed_likes).values

# Store the scaled target in its own column so the raw 'likes' stay available
photos_df['transformed_likes'] = photos_df['likes'] / per_user_scale

# Save as official csv for CNN
photos_df.to_csv('datasets/train.csv', index=False)

## Photos' Url

In [None]:
# Flat list of every photo's "small" image URL, in dump-file order
photos_info = []

for page in range(1, 178):
    with open(f'./data/photos-{page}.txt', 'r') as fh:
        records = json.loads(fh.read())

    photos_info.extend(rec['urls']['small'] for rec in records)

## Download Photos

In [None]:
# Uses Requests to download photos; file photos-{i}.jpg holds the i-th URL
# of `photos_info`.
for i, url in enumerate(photos_info):
    img_filename = f'./img_small/photos-{i}.jpg'
    # Timeout so a stalled connection cannot hang the whole download loop
    response = requests.get(url, timeout=30)

    if response.ok:
        with open(img_filename, 'wb') as f:
            f.write(response.content)
    else:
        # Do not save an HTTP error body under a .jpg name
        print(f'Skipped photo {i}: HTTP {response.status_code}')

    # Pause between requests to avoid hammering the server
    sleep(1)

## Img to Array

In [None]:
# Load in mask_delete: the row labels that were stored in mask_delete.csv
mask_delete = pd.read_csv('./datasets/mask_delete.csv')['mask_delete']

# `i in <Series>` tests the Series' index labels, not its values, so the
# membership test must run against an explicit set of the stored values.
skip_rows = set(mask_delete.tolist())

# Need to grab base size in order to resize later
img = Image.open('./img_small/photos-0.jpg')
size = img.size

# Holder will store numpy arrays of pixels
holder = []

# 2000 is only an upper bound on the image count; the loop stops at the
# first file that is missing or unreadable.
for i in range(2000):
    if i in skip_rows:
        continue

    # Grab each file
    file = f'./img_small/photos-{i}.jpg'
    try:
        # Open img and proceed to resize it
        img = Image.open(file)
        img = img.resize(size)

        # Temp variable to store numpy array of pixels
        temp = np.array(img).ravel()
    except OSError:
        # Missing/corrupt file; a narrow except lets Ctrl-C and real bugs surface
        print(f"Broke at {i}")
        break
    holder.append(temp)

In [None]:
# Stack the flattened pixel arrays collected in `holder` into one array
X = np.array(holder)

# Target is log10 of the scaled likes stored in train.csv
# NOTE(review): a transformed_likes value of 0 would become -inf here -- confirm
# the data contains none.
scaled_likes = pd.read_csv('./datasets/train.csv')['transformed_likes']
y = np.log10(scaled_likes.values)

# Train/test split (default proportions, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Cast to float before scaling
X_train, X_test = X_train.astype('float64'), X_test.astype('float64')

# Fit the scaler on the training split only, then reuse it on the test split
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

## Keras Train/Test Split

In [None]:
# Train a small fully-connected regressor on the scaled training split and
# plot its loss / MAE curves against the held-out split.

# Instantiate model
model = Sequential()

# Add layers
model.add(Dense(
    128,
    activation='relu',
    input_dim=X_train.shape[1]
))

model.add(Dense(
    32,
    activation='relu'
))

# Add output layer
model.add(Dense(
    1,
))

# Compile
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mae']
)

# Fit model
# Only need 50 epochs
history = model.fit(
    X_train_ss,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_ss, y_test),
)

# Get predictions
test_pred = model.predict(X_test_ss)
train_pred = model.predict(X_train_ss)

# Plot mse loss
plt.figure(figsize=(12,8))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Testing Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend();

# The history key for metrics=['mae'] depends on the Keras version:
# older releases report 'mean_absolute_error', newer ones report 'mae'.
mae_key = 'mean_absolute_error' if 'mean_absolute_error' in history.history else 'mae'

# Plot mae
plt.figure(figsize=(12,8))
plt.plot(history.history[mae_key], label='Training MAE')
plt.plot(history.history[f'val_{mae_key}'], label='Testing MAE')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.legend();

In [None]:
# Load in mask_delete: the row labels that were stored in mask_delete.csv
mask_delete = pd.read_csv('./datasets/mask_delete.csv')['mask_delete']

# `i in <Series>` tests the Series' index labels, not its values, so the
# membership test must run against an explicit set of the stored values.
skip_rows = set(mask_delete.tolist())

# Need to grab base size in order to resize later
img = Image.open('./img_small/photos-0.jpg')
size = img.size

# Holder will store numpy arrays of pixels
holder = []

# 2000 is only an upper bound on the image count; the loop stops at the
# first file that is missing or unreadable.
for i in range(2000):
    if i in skip_rows:
        continue

    # Grab each file
    file = f'./img_small/photos-{i}.jpg'
    try:
        # Open img and proceed to resize it
        img = Image.open(file)
        img = img.resize(size)

        # Temp variable to store numpy array of pixels
        temp = np.array(img).ravel()
    except OSError:
        # Missing/corrupt file; a narrow except lets Ctrl-C and real bugs surface
        print(f"Broke at {i}")
        break
    holder.append(temp)

In [None]:
# Stack the flattened pixel arrays collected in `holder` into one array
X = np.array(holder)

# Target is log10 of the scaled likes stored in train.csv
# NOTE(review): a transformed_likes value of 0 would become -inf here -- confirm
# the data contains none.
scaled_likes = pd.read_csv('./datasets/train.csv')['transformed_likes']
y = np.log10(scaled_likes.values)

# Train/test split (default proportions, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Cast to float before scaling
X_train, X_test = X_train.astype('float64'), X_test.astype('float64')

# Fit the scaler on the training split only, then reuse it on the test split
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

# FINAL MODEL

In [2]:
# Load in mask_delete: the row labels that were stored in mask_delete.csv
mask_delete = pd.read_csv('./datasets/mask_delete.csv')['mask_delete']

# `i in <Series>` tests the Series' index labels, not its values, so the
# membership test must run against an explicit set of the stored values.
skip_rows = set(mask_delete.tolist())

# Need to grab base size in order to resize later
img = Image.open('./img_small/photos-0.jpg')
size = img.size

# Holder will store numpy arrays of pixels
holder = []

# 2000 is only an upper bound on the image count; the loop stops at the
# first file that is missing or unreadable.
for i in range(2000):
    if i in skip_rows:
        continue

    # Grab each file
    file = f'./img_small/photos-{i}.jpg'
    try:
        # Open img and proceed to resize it
        img = Image.open(file)
        img = img.resize(size)

        # Temp variable to store numpy array of pixels
        temp = np.array(img).ravel()
    except OSError:
        # Missing/corrupt file; a narrow except lets Ctrl-C and real bugs surface
        print(f"Broke at {i}")
        break
    holder.append(temp)

# Holder is arrays of pixels, stack to set up for CNN
X = np.array(holder)

# Only need transformed likes for y
y = pd.read_csv('./datasets/train.csv')['transformed_likes'].values
y = np.log10(y)

# Final model uses every sample, so the scaler is fitted on the full X
ss = StandardScaler()

X_train_ss = ss.fit_transform(X)

Broke at 1770




In [7]:
# Dense 128 -> 32 -> 1 regressor; the input width is the number of columns
# in X_train_ss.
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train_ss.shape[1]),
    Dense(32, activation='relu'),
    Dense(1),  # output layer, no activation specified
])

# Optimise mean squared error and also track mean absolute error
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# Fit model
# Only need 12 epochs
history = model.fit(X_train_ss, y, epochs=12, batch_size=32)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [8]:
# Serialise the trained model to disk with pickle.
# NOTE(review): pickling a Keras model may not round-trip on every Keras
# version; Keras' own model.save() is the usual alternative -- confirm.
filename = 'kera_v_0.sav'
# Context manager guarantees the file is flushed and closed after the dump
with open(filename, 'wb') as f:
    pickle.dump(model, f)