## Image to vector: pre-trained CNN

In [15]:
import os
import pandas as pd
from PIL import Image
import numpy as np
from tqdm import tqdm
from ast import literal_eval

import torch
from torchvision import models, transforms

# Paths: the raw metadata CSV, the cached CSV that also stores one CNN output
# vector per image, and the directory that `image_path` values are joined onto.
data_dir = '/content/drive/MyDrive/Colab Notebooks/Data'
csv_path = '/content/drive/MyDrive/Colab Notebooks/Data/instagram_data.csv'
updated_csv_path = '/content/drive/MyDrive/Colab Notebooks/Data/instagram_data_with_CNN.csv'

device = "cuda" if torch.cuda.is_available() else "cpu"


def _expand_img_vec(frame):
    """Replace the list-valued `img_vec` column with one numeric column per component.

    Rows without a vector (the image file was not found) are dropped first:
    a `None` entry cannot be expanded into columns, and the models trained
    later in the notebook cannot consume missing features.

    Args:
        frame: DataFrame with an `img_vec` column holding lists of floats or None/NaN.

    Returns:
        A new DataFrame without `img_vec`, with integer-labelled vector columns appended.
    """
    missing = frame['img_vec'].isna()
    if missing.any():
        print(f"Dropping {int(missing.sum())} rows without an image vector")
        frame = frame[~missing]
    vec_columns = pd.DataFrame(frame['img_vec'].tolist(), index=frame.index)
    return pd.concat([frame, vec_columns], axis=1).drop(columns=['img_vec'])


if os.path.exists(updated_csv_path):
    # Cached run: vectors were written as their string repr, so parse them back.
    # Rows saved without a vector are read as NaN (not a string) and must not be
    # passed to literal_eval, which would raise on them.
    df = pd.read_csv(updated_csv_path)
    df['img_vec'] = df['img_vec'].map(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else None
    )
else:
    df = pd.read_csv(csv_path)

    # Load a pre-trained CNN (e.g., ResNet)
    # NOTE(review): the full network is applied, so each vector is the 1000-value
    # classifier output rather than the pooled backbone features — confirm intended.
    cnn_model = models.resnet50(weights='DEFAULT')  # Use the latest method for weights
    cnn_model.eval()
    cnn_model.to(device)

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Process images in row order, collecting one vector (or None) per row.
    img_vectors = []
    for image_path in tqdm(df['image_path'], total=df.shape[0], desc='Processing Images'):
        full_image_path = os.path.join(data_dir, image_path)

        if not os.path.exists(full_image_path):
            print(f"Image not found at {full_image_path}")
            img_vectors.append(None)
            continue

        # The context manager closes the file handle once the RGB copy is made.
        with Image.open(full_image_path) as image_file:
            raw_image = image_file.convert('RGB')
        img_tensor = transform(raw_image).unsqueeze(0).to(device)

        with torch.no_grad():
            cnn_features = cnn_model(img_tensor).cpu().numpy()
        img_vectors.append(cnn_features.flatten().tolist())

    # An explicit object Series keeps each list as a single cell value.
    df['img_vec'] = pd.Series(img_vectors, index=df.index, dtype=object)

    # Persist before expanding so the cache keeps the compact one-column layout.
    df.to_csv(updated_csv_path, index=False)

df = _expand_img_vec(df)


In [17]:
# Preview the first rows: the metadata columns followed by the expanded image-vector columns.
df.head()

Unnamed: 0,likes,no_of_comments,t,follower_count_at_t,image_path,0,1,2,3,4,...,990,991,992,993,994,995,996,997,998,999
0,154552,0,1594174009,40934474,../Data/insta_data/0.jpg,-0.967442,1.104774,-0.388086,1.617857,0.861507,...,-0.140474,0.295819,-0.366622,-0.147057,-0.280668,0.225625,-0.815258,-0.415552,-0.669577,0.437415
1,97386,0,1593571666,40934474,../Data/insta_data/2.jpg,-0.031442,-0.079294,-0.31051,0.143409,0.176592,...,-0.348174,-0.750318,-0.399996,0.04699,-0.526279,-0.316225,-0.270202,0.079031,-1.003817,0.286315
2,145632,0,1593136341,40934474,../Data/insta_data/4.jpg,-0.539304,-0.172383,-0.557214,-0.04905,-0.365062,...,-0.273688,-0.603226,0.19705,-0.303838,0.100485,-0.444856,0.156611,-0.447702,-0.845766,2.051532
3,76461,0,1592981047,40934474,../Data/insta_data/6.jpg,-0.792022,-0.485644,-0.099522,0.404537,-0.006167,...,0.175973,-0.354174,-0.24399,-0.575918,-0.234847,0.095493,-0.175676,-0.722137,0.075589,-0.81308
4,174620,0,1592703461,40934474,../Data/insta_data/8.jpg,-0.889818,-0.275837,0.976324,-0.41411,1.543747,...,-0.272002,-0.637751,0.026815,-0.12567,-0.168254,-0.364859,-1.165386,-0.303809,-0.41031,0.61136


In [18]:
# Replace the raw count columns with their log(1 + x) transforms.
# np.log1p is the library routine for log(1 + x), and building a new frame
# (instead of drop(..., inplace=True)) avoids mutating the frame that the
# previous cell displayed.
df = (
    df.assign(
        log_likes=np.log1p(df['likes']),
        log_comments=np.log1p(df['no_of_comments']),
        log_followers=np.log1p(df['follower_count_at_t']),
    )
    # Drop the timestamp and image path, plus the raw counts now covered by their logs.
    .drop(columns=['t', 'image_path', 'likes', 'no_of_comments', 'follower_count_at_t'])
)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,993,994,995,996,997,998,999,log_likes,log_comments,log_followers
0,-0.967442,1.104774,-0.388086,1.617857,0.861507,0.99024,1.319291,0.017189,-0.645274,-0.086096,...,-0.147057,-0.280668,0.225625,-0.815258,-0.415552,-0.669577,0.437415,11.948292,0.0,17.527483
1,-0.031442,-0.079294,-0.31051,0.143409,0.176592,-0.418672,-0.070648,-0.137374,-0.30312,-0.0879,...,0.04699,-0.526279,-0.316225,-0.270202,0.079031,-1.003817,0.286315,11.486448,0.0,17.527483
2,-0.539304,-0.172383,-0.557214,-0.04905,-0.365062,-0.089457,-0.507202,-0.112224,0.010705,-0.528833,...,-0.303838,0.100485,-0.444856,0.156611,-0.447702,-0.845766,2.051532,11.888845,0.0,17.527483
3,-0.792022,-0.485644,-0.099522,0.404537,-0.006167,-0.250397,-0.301057,-0.849119,-0.418845,-0.251193,...,-0.575918,-0.234847,0.095493,-0.175676,-0.722137,0.075589,-0.81308,11.244549,0.0,17.527483
4,-0.889818,-0.275837,0.976324,-0.41411,1.543747,0.258953,0.267523,-0.036151,-0.747129,0.177904,...,-0.12567,-0.168254,-0.364859,-1.165386,-0.303809,-0.41031,0.61136,12.070373,0.0,17.527483


### Normalizing

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the regression target from the feature columns.
target_column = 'log_likes'
y = df[target_column]
X = df.drop(columns=[target_column])
# The image-vector columns carry non-string labels; make every label a string to avoid an error.
X.columns = X.columns.map(str)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Training

In [20]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch.nn as nn
import torch.optim as optim


# Baseline regressors, all evaluated on the same held-out split.
# The tree-based estimators are randomized, so they are seeded (same seed as the
# train/test split) to make the results table reproducible across runs.
# NOTE(review): this dict rebinds `models`, shadowing the `torchvision.models`
# import used by the feature-extraction cell; the name is kept in case other
# cells rely on it.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regression": SVR()
}

# One metrics record per model; the DNN cell appends its own record later.
results = []
for model_name, model_instance in models.items():
    model_instance.fit(X_train, y_train)
    y_pred = model_instance.predict(X_test)

    # Compute MSE once and derive RMSE from it.
    mse = mean_squared_error(y_test, y_pred)
    results.append({
        "Model": model_name,
        "MSE": mse,
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": mse ** 0.5,
        "R-squared": r2_score(y_test, y_pred)
    })

In [21]:
# Build float32 tensors on `device` from the scaled feature arrays.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32, device=device)

# Targets get a trailing singleton axis, (N,) -> (N, 1), to line up with the network output.
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32, device=device).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32, device=device).reshape(-1, 1)

class RegressionNN(nn.Module):
    """Fully connected regressor with three hidden stages and a single-value output.

    Each hidden stage applies Linear -> BatchNorm1d -> ReLU -> Dropout(0.3);
    the stage widths are 512, 256 and 128. The final layer maps 128 -> 1.

    Args:
        input_dim: Number of features in each input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Register fc{n}/bn{n}/dropout{n} for n = 1..3, in that order, so the
        # submodule names and their ordering match the hand-written layout.
        widths = (input_dim, 512, 256, 128)
        for stage, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(fan_in, fan_out))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(fan_out))
            setattr(self, f"dropout{stage}", nn.Dropout(0.3))

        # Output head: one regression value per row.
        self.fc4 = nn.Linear(128, 1)

    def forward(self, x):
        """Run `x` through the three hidden stages and return the output layer's result."""
        hidden_stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
        )
        for linear, batch_norm, dropout in hidden_stages:
            x = dropout(torch.relu(batch_norm(linear(x))))
        return self.fc4(x)

# Seed torch before the model is built so weight initialisation and dropout
# masks are repeatable (same seed as the train/test split).
torch.manual_seed(42)

input_dim = X_train.shape[1]  # Number of input features
dnn_model = RegressionNN(input_dim).to(device)
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(dnn_model.parameters(), lr=0.001)

# Full-batch training: every epoch is a single optimizer step on the whole training set.
epochs = 1000
dnn_model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    predictions = dnn_model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

# Evaluate with dropout disabled and batch-norm using its running statistics.
dnn_model.eval()
with torch.no_grad():
    # Flatten (N, 1) -> (N,) so the predictions line up with y_test.
    predictions_test = dnn_model(X_test_tensor).cpu().numpy().flatten()

dnn_mse = mean_squared_error(y_test, predictions_test)

# Drop any DNN record left by a previous run of this cell, so re-running it
# replaces the row instead of appending a duplicate.
results = [entry for entry in results if entry["Model"] != "DNN"]
results.append({
    "Model": "DNN",
    "MSE": dnn_mse,
    "MAE": mean_absolute_error(y_test, predictions_test),
    "RMSE": dnn_mse ** 0.5,
    "R-squared": r2_score(y_test, predictions_test)
})
results_df = pd.DataFrame(results)
print(results_df)

                       Model       MSE       MAE      RMSE  R-squared
0          Linear Regression  0.826499  0.681169  0.909120   0.379627
1           Ridge Regression  0.821462  0.678891  0.906346   0.383408
2           Lasso Regression  1.332298  0.941401  1.154252  -0.000027
3              Decision Tree  0.728817  0.623636  0.853708   0.452948
4              Random Forest  0.387304  0.461662  0.622338   0.709288
5          Gradient Boosting  0.343729  0.442663  0.586284   0.741996
6  Support Vector Regression  0.690629  0.654223  0.831041   0.481612
7                        DNN  1.078008  0.802246  1.038272   0.190844
