In [38]:
import pandas as pd

import numpy as np

from PIL import Image

import concurrent.futures as cf

from tqdm.notebook import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Read metadata

In [39]:
instagram_df = pd.read_csv('instagram_data.csv')
instagram_df['image_path'] = instagram_df['image_path'].str.replace('../Data/', '')
instagram_df["class"] = instagram_df["likes"].apply(lambda x: min(len(str(x)), 6)) - 4
instagram_df

Unnamed: 0,likes,no_of_comments,t,follower_count_at_t,image_path,class
0,154552,0,1594174009,40934474,insta_data/0.jpg,2
1,97386,0,1593571666,40934474,insta_data/2.jpg,1
2,145632,0,1593136341,40934474,insta_data/4.jpg,2
3,76461,0,1592981047,40934474,insta_data/6.jpg,1
4,174620,0,1592703461,40934474,insta_data/8.jpg,2
...,...,...,...,...,...,...
3780,341895,6029,1555164674,4229627,insta_data/9435.jpg,2
3781,217095,2591,1554706282,4229627,insta_data/9436.jpg,2
3782,17902,253,1553945996,4229627,insta_data/9437.jpg,1
3783,141694,1175,1552645088,4229627,insta_data/9442.jpg,2


In [40]:
# likes_upper_bound = instagram_df['likes'].quantile(0.9)
# comments_upper_bound = instagram_df['no_of_comments'].quantile(0.9)
# t_low = instagram_df['likes'].quantile(0.1)
# instagram_df = instagram_df[(instagram_df['likes'] < likes_upper_bound) & (instagram_df['no_of_comments'] < comments_upper_bound) & (instagram_df['t'] > t_low)]
# print(instagram_df['class'].value_counts())

### Read images (multi-threaded)

In [41]:
def open_image(image_path):
    img = Image.open(image_path)
    img = img.resize((224, 224))
    x = np.array(img)
    shape = x.shape
    x = np.expand_dims(x, axis=0)
    return image_path, x, shape

image_path_to_array = {}
shapes = []
with cf.ThreadPoolExecutor() as executor:
    futures = [executor.submit(open_image, image_path) for image_path in instagram_df['image_path']]
    for future in tqdm(cf.as_completed(futures), total=len(futures), desc="Reading Images"):
        image_path, img_np, shape = future.result()
        shapes.append(shape)
        image_path_to_array[image_path] = img_np

Reading Images:   0%|          | 0/3785 [00:00<?, ?it/s]

### Make Numpy Arrays of Dataset

In [42]:
def get_X_Y(instagram_df, image_path_to_array, pca = None):
    X = []
    Y = []
    Y_class = []
    for _, row in instagram_df.iterrows():
        new_row = [row['no_of_comments'], row['t'], row['follower_count_at_t']]
        image_path = row['image_path']
        if pca is not None:
            new_row.extend(pca.transform(image_path_to_array[image_path].mean(axis = 2).flatten().reshape(1, -1))[0])
        else:
            new_row.extend(image_path_to_array[image_path].mean(axis = 2).flatten())
        X.append(new_row)
        Y.append([row['likes']])
        Y_class.append([row['class']])
    return np.array(X), np.array(Y), np.array(Y_class)

pca = PCA(n_components=5)
pca.fit([img.mean(axis = 2).flatten() for img in image_path_to_array.values()])

X, Y, Y_class = get_X_Y(instagram_df, image_path_to_array, pca)
indexes = np.array(range(len(X)))
standard_scaler = StandardScaler()
standard_scaler.fit(X)
X = standard_scaler.transform(X)

X_train, X_test, train_indexes, test_indexes = train_test_split(X, indexes, test_size=0.2, random_state=42)
Y_train = Y[train_indexes]
Y_test = Y[test_indexes]
Y_class_train = Y_class[train_indexes]
Y_class_test = Y_class[test_indexes]

## Regression

In [43]:
LR = LinearRegression()
LR.fit(X_train, Y_train)
print(f"Linear Regression Score: {LR.score(X_test, Y_test)}")

Linear Regression Score: 0.03235664507602065


In [44]:
KNN = KNeighborsRegressor(n_neighbors=5)
KNN.fit(X_train, Y_train)
print(f"KNN Score: {KNN.score(X_test, Y_test)}")

KNN Score: 0.02914342327140329


In [45]:
MLP = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42, validation_fraction=0.1, early_stopping=True)
MLP.fit(X_train, Y_train.flatten())
print(f"MLP Score: {MLP.score(X_test, Y_test)}")

MLP Score: 0.010700241014972356


In [46]:
RF = RandomForestRegressor(n_estimators=100, random_state=42)
RF.fit(X_train, Y_train.flatten())
print(f"Random Forest Score: {RF.score(X_test, Y_test)}")

Random Forest Score: 0.7758338892880675


In [47]:
ET = ExtraTreesRegressor(n_estimators=100, random_state=42)
ET.fit(X_train, Y_train.flatten())
print(f"Extra Trees Score: {ET.score(X_test, Y_test)}")

Extra Trees Score: 0.8247956815981461


## Classification

In [48]:
MLPC = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42, validation_fraction=0.1)
MLPC.fit(X_train, Y_class_train.flatten())
y_pred = MLPC.predict(X_test)
print(f"MLP Accuracy: {accuracy_score(Y_class_test, y_pred)}")
print(f"MLP Precision: {precision_score(Y_class_test, y_pred, average='weighted')}")
print(f"MLP Recall: {recall_score(Y_class_test, y_pred, average='weighted')}")
print(f"MLP F1: {f1_score(Y_class_test, y_pred, average='weighted')}")

MLP Accuracy: 0.7424042272126816
MLP Precision: 0.7416303711775856
MLP Recall: 0.7424042272126816
MLP F1: 0.7392871570079449


In [49]:
KNNC = KNeighborsClassifier(n_neighbors=5)
KNNC.fit(X_train, Y_class_train.flatten())
y_pred = KNNC.predict(X_test)
print(f"KNN Accuracy: {accuracy_score(Y_class_test, y_pred)}")
print(f"KNN Precision: {precision_score(Y_class_test, y_pred, average='weighted')}")
print(f"KNN Recall: {recall_score(Y_class_test, y_pred, average='weighted')}")
print(f"KNN F1: {f1_score(Y_class_test, y_pred, average='weighted')}")

KNN Accuracy: 0.5891677675033025
KNN Precision: 0.5875116002308227
KNN Recall: 0.5891677675033025
KNN F1: 0.5866646128371538


In [50]:
RFC = RandomForestClassifier(n_estimators=100, random_state=42)
RFC.fit(X_train, Y_class_train.flatten())
y_pred = RFC.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(Y_class_test, y_pred)}")
print(f"Random Forest Precision: {precision_score(Y_class_test, y_pred, average='weighted')}")
print(f"Random Forest Recall: {recall_score(Y_class_test, y_pred, average='weighted')}")
print(f"Random Forest F1: {f1_score(Y_class_test, y_pred, average='weighted')}")

Random Forest Accuracy: 0.8441215323645971
Random Forest Precision: 0.8486768182629967
Random Forest Recall: 0.8441215323645971
Random Forest F1: 0.8400276865499082


In [51]:
ETC = ExtraTreesClassifier(n_estimators=100, random_state=42)
ETC.fit(X_train, Y_class_train.flatten())
y_pred = ETC.predict(X_test)
print(f"Extra Trees Accuracy: {accuracy_score(Y_class_test, y_pred)}")
print(f"Extra Trees Precision: {precision_score(Y_class_test, y_pred, average='weighted')}")
print(f"Extra Trees Recall: {recall_score(Y_class_test, y_pred, average='weighted')}")
print(f"Extra Trees F1: {f1_score(Y_class_test, y_pred, average='weighted')}")

Extra Trees Accuracy: 0.8282694848084544
Extra Trees Precision: 0.8311476721120191
Extra Trees Recall: 0.8282694848084544
Extra Trees F1: 0.8230973873570815
