In [2]:
import cv2
import numpy as np
import pandas as pd
import os

# Function to extract SIFT features
def extract_sift_features(image_path, max_features=500):
    """Return a fixed-length SIFT feature vector for one image.

    The image is read in grayscale, SIFT descriptors are computed, flattened
    into a single 1-D array and then zero-padded or truncated so that the
    result always has exactly ``max_features`` elements.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    max_features : int, optional
        Length of the returned vector (default 500, the value that was
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``max_features``. An all-zero vector is returned
        when the image cannot be loaded, when no descriptors are found, or
        when any exception is raised while processing (the error is printed,
        not re-raised, so a single bad file does not abort a batch run).

    Notes
    -----
    Truncation operates on the *flattened* descriptor matrix, so only the
    leading values of the first few descriptors survive. OpenCV SIFT
    descriptors are 128 values each, so the default keeps fewer than four
    descriptors per image -- NOTE(review): consider pooling descriptors
    (e.g. mean or bag-of-visual-words) if more of the image should count.
    """
    try:
        # Read the image in grayscale; imread signals failure with None
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            print(f"Failed to load image {image_path}")
            return np.zeros(max_features)

        # Detect keypoints and compute descriptors (keypoints are not needed)
        sift = cv2.SIFT_create()
        _, descriptors = sift.detectAndCompute(image, None)
        if descriptors is None:
            # No descriptors were found in this image
            return np.zeros(max_features)

        # Flatten the descriptor matrix into a 1-D array
        sift_features = descriptors.flatten()

        missing = max_features - sift_features.size
        if missing > 0:
            # Too short: pad the tail with zeros
            return np.pad(sift_features, (0, missing), 'constant')
        # Long enough: keep only the leading max_features values
        return sift_features[:max_features]
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to zeros
        print(f"Error processing {image_path}: {e}")
        return np.zeros(max_features)

# Imports used by the training / evaluation pipeline below, grouped in one
# place so the cell's dependencies are visible at a glance.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Length of the per-image SIFT vector. It must match the length returned by
# extract_sift_features so placeholder rows for missing files line up with
# real feature rows.
SIFT_VECTOR_LENGTH = 500

# Load the dataset and drop incomplete rows (new frame, fresh 0..n-1 index)
data = pd.read_csv('../data/instagram_data.csv').dropna().reset_index(drop=True)

# Convert 'likes' into 3 ordered classes.
# include_lowest=True keeps posts with exactly 0 likes in 'Low'; without it
# pd.cut leaves them as NaN because the first interval is open on the left.
bins = [0, 100000, 200000, np.inf]
labels = ['Low', 'Medium', 'High']
data['likes_class'] = pd.cut(
    data['likes'], bins=bins, labels=labels, include_lowest=True)

# Rows whose 'likes' fall outside the bins (e.g. negative values) have no
# class and cannot be used as training targets.
data = data.dropna(subset=['likes_class']).reset_index(drop=True)

le = LabelEncoder()
data['likes_class_encoded'] = le.fit_transform(data['likes_class'])

# LabelEncoder assigns integer codes in *sorted* class order (High, Low,
# Medium), not in the order of `labels`. Resolve each display name to its
# actual encoded id so reports and plots are labelled correctly, and skip
# classes that never occur in the data.
seen_classes = set(le.classes_)
class_names = [name for name in labels if name in seen_classes]
class_ids = le.transform(class_names)

# Extract SIFT features for all images
sift_features_list = []
image_paths = []

for img_path in data['image_path']:
    image_paths.append(img_path)
    if os.path.exists(img_path):
        sift_features_list.append(extract_sift_features(img_path))
    else:
        # Keep row alignment with `data` by inserting a zero placeholder
        print(f"File not found: {img_path}")
        sift_features_list.append(np.zeros(SIFT_VECTOR_LENGTH))

# Create DataFrame of features (one row per image, same order as `data`)
sift_features_df = pd.DataFrame(sift_features_list)
assert len(sift_features_df) == len(data), "feature rows must match data rows"

# Combine features with original data; both frames share a 0..n-1 index
combined_data = pd.concat([data, sift_features_df], axis=1)

# Prepare features and target
exclude_columns = ['likes', 'no_of_comments', 't', 'follower_count_at_t', 'image_path', 'likes_class', 'likes_class_encoded']
feature_columns = [col for col in combined_data.columns if col not in exclude_columns]
# Work on an explicit copy so the edits below never touch `combined_data`
# and do not trigger pandas' SettingWithCopy behaviour.
X = combined_data[feature_columns].copy()
X.columns = X.columns.map(str)  # scikit-learn requires uniform (string) column names
X = X.fillna(0)
y = combined_data['likes_class_encoded']

# Split dataset (stratified so each class keeps its share in both splits)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model; pass explicit label ids so every name matches its class
y_pred = model.predict(X_test)
print(classification_report(
    y_test, y_pred, labels=class_ids, target_names=class_names))

# Confusion Matrix, rows/columns in Low -> Medium -> High order
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()


Low: 0 to 68295.68000000001
Medium: 68295.68000000001 to 189775.28
High: 189775.28 and above


KeyboardInterrupt: 