In [None]:
import instaloader
import pandas as pd
import json
import os
from datetime import datetime
from dotenv import load_dotenv

In [None]:
# Read the scraper account name and the target profile name from the .env file
load_dotenv()
IG_USERNAME, IG_TARGET = (os.getenv(key) for key in ("IG_USERNAME", "IG_TARGET"))

# Both values are mandatory; stop early with a clear message if either is absent
if not (IG_USERNAME and IG_TARGET):
    raise ValueError("Please set IG_USERNAME and IG_TARGET in your .env file.")

# Build the Instaloader client, load the stored session for IG_USERNAME,
# then resolve the profile that will be scraped
L = instaloader.Instaloader()
L.load_session_from_file(IG_USERNAME)
profile = instaloader.Profile.from_username(L.context, IG_TARGET)


In [None]:

# Walk the profile and keep only posts that carry at least one still image
# (single-image posts and carousels); every other post type is skipped.
posts_data = []
print("Fetching image posts...")

for post in profile.get_posts():
    kind = post.typename

    if kind == 'GraphImage':
        image_urls = [post.url]
    elif kind == 'GraphSidecar':
        # Carousels may mix photos and clips; keep the photo nodes only
        image_urls = [node.display_url for node in post.get_sidecar_nodes() if not node.is_video]
    else:
        continue

    if not image_urls:
        continue

    # 'image_urls' and 'download_urls' intentionally refer to the same list
    record = {
        'shortcode': post.shortcode,
        'url': post.url,
        'caption': post.caption or "",
        'date': post.date,
        'likes': post.likes,
        'is_sidecar': kind == 'GraphSidecar',
        'image_urls': image_urls,
        'download_urls': image_urls,
    }
    posts_data.append(record)

print(f"✅ Collected {len(posts_data)} image-based posts.")


In [None]:
# Convert to DataFrame and summarize
df_posts = pd.DataFrame(posts_data)

# An empty collection produces a frame without columns, so the 'date' lookup
# below would fail with an unhelpful KeyError. Fail with an explicit message instead.
if df_posts.empty:
    raise ValueError(
        f"No image posts were collected for '{IG_TARGET}'; nothing to summarize or save."
    )

print(f"📊 Total posts: {len(df_posts)} | Date Range: {df_posts['date'].min()} to {df_posts['date'].max()}")

# Create output directory
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)
# One timestamp is shared by every file/directory written in this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save full data as JSON (default=str serializes the datetime values)
json_path = os.path.join(data_dir, f'{IG_TARGET}_posts_{timestamp}.json')
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(posts_data, f, ensure_ascii=False, indent=2, default=str)
print(f"📁 Saved full data to {json_path}")

In [None]:
# Build the ranked view: keep the download-relevant columns and order by likes, highest first
ranked_columns = ['shortcode', 'url', 'caption', 'likes', 'download_urls']
processed_df = (
    df_posts[ranked_columns]
    .copy()
    .sort_values(by='likes', ascending=False)
    .reset_index(drop=True)
)
processed_data = processed_df.to_dict('records')

# Write the ranked records next to the full export, using the same run timestamp
processed_path = os.path.join(data_dir, f'{IG_TARGET}_processed_{timestamp}.json')
with open(processed_path, 'w', encoding='utf-8') as out_file:
    json.dump(processed_data, out_file, ensure_ascii=False, indent=2, default=str)
print(f"📁 Saved processed data to {processed_path}")

In [None]:
# Headline numbers for the ranked list (first entry = most liked, last = least liked)
best_post, worst_post = processed_data[0], processed_data[-1]
print("\n✅ Processed data saved.")
print(f"🏆 Top post: {best_post['likes']:,} likes")
print(f"📉 Bottom post: {worst_post['likes']:,} likes")
print(f"🔢 Total ranked posts: {len(processed_data)}")

# Preview the three most-liked posts
print("\n📋 Top 3 posts:")
for rank, entry in enumerate(processed_data[:3], start=1):
    print(f"{rank}. Shortcode: {entry['shortcode']} | Likes: {entry['likes']:,}")


In [None]:
# Download top posts
top_n = 10
download_dir = os.path.join(data_dir, f'top_posts_{timestamp}')
os.makedirs(download_dir, exist_ok=True)

# The ranked list may hold fewer than top_n entries
top_posts = processed_data[:top_n]
print(f"📥 Downloading top {top_n} posts...")

for i, post in enumerate(top_posts, 1):
    shortcode = post['shortcode']
    caption = post['caption']
    urls = post['download_urls']

    # One folder per post, prefixed with its rank so folders sort by likes
    post_dir = os.path.join(download_dir, f"{i:02d}_{shortcode}")
    os.makedirs(post_dir, exist_ok=True)

    # Save images
    for j, url in enumerate(urls, 1):
        # download_pic appends the file extension on its own, so pass the path
        # WITHOUT ".jpg" — otherwise the files end up named "img1.jpg.jpg".
        image_stem = os.path.join(post_dir, f"img{j}")
        L.download_pic(image_stem, url, datetime.now())

    # Save caption
    caption_path = os.path.join(post_dir, "caption.txt")
    with open(caption_path, "w", encoding="utf-8") as f:
        f.write(caption.strip())

    print(f"✅ Saved post {i}: {shortcode} with {len(urls)} image(s)")

# Report the number of posts actually processed, which can be below top_n
print(f"\n📁 All top {len(top_posts)} posts downloaded to: {download_dir}")

# BONUS:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning category for the rest of the kernel
# session, not only plotting noise — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set modern plotting style.
# The matplotlib style sheet is applied first and the seaborn palette second;
# keep this order (presumably so the "husl" palette is not replaced by the
# style sheet's own colour cycle — TODO confirm).
plt.style.use('dark_background')
sns.set_palette("husl")

class InstagramAnalytics:
    """Text summaries and a matplotlib/seaborn dashboard for a frame of scraped posts.

    The input frame must provide the columns ``date``, ``likes``, ``caption`` and
    ``is_sidecar``. ``image_urls`` (one list per post) is used to derive
    ``image_count`` when that column is not already present. The frame is copied on
    construction, so the caller's DataFrame is never modified.
    """

    # Ordered labels of the likes-based performance tiers.
    PERFORMANCE_LABELS = ['Low', 'Medium', 'High']
    # Number of newest posts compared against the overall average.
    RECENT_WINDOW = 10

    def __init__(self, df_posts):
        """Copy ``df_posts`` and enrich the copy with the derived analysis columns."""
        self.df = df_posts.copy()
        self._prepare_data()

    def _prepare_data(self):
        """Add derived columns (image count, date parts, engagement, caption stats, tiers)."""
        # Derive image_count from the URL list; a missing or empty list counts as one image.
        if 'image_count' not in self.df.columns:
            self.df['image_count'] = self.df.apply(
                lambda row: len(row['image_urls']) if 'image_urls' in row and row['image_urls'] else 1,
                axis=1
            )

        # Fill missing image_count values
        self.df['image_count'] = self.df['image_count'].fillna(1)

        # Accept dates that arrive as strings (e.g. a frame re-loaded from the JSON
        # export); the .dt accessors below require a datetime column.
        self.df['date'] = pd.to_datetime(self.df['date'])

        # Datetime features
        self.df['year_month'] = self.df['date'].dt.to_period('M')
        self.df['year'] = self.df['date'].dt.year
        self.df['month'] = self.df['date'].dt.month
        self.df['weekday'] = self.df['date'].dt.day_name()
        self.df['hour'] = self.df['date'].dt.hour

        # Engagement metrics
        self.df['likes_per_image'] = self.df['likes'] / self.df['image_count']
        self.df['engagement_score'] = np.log1p(self.df['likes'])  # Log-scaled engagement

        # Caption analysis
        captions = self.df['caption'].fillna('')
        self.df['caption_length'] = captions.str.len()
        self.df['has_hashtags'] = captions.str.contains('#')
        self.df['hashtag_count'] = captions.str.count('#')

        # Performance tiers split at the 33% / 66% like quantiles.
        # Explicit comparisons are used instead of pd.cut with bins starting at 0:
        # that left posts with 0 likes unlabelled (left edge is exclusive) and
        # raised "Bin edges must be unique" whenever the two quantiles coincided
        # (or equalled 0). Boundaries stay right-inclusive, as before.
        likes = self.df['likes']
        low_cut, high_cut = likes.quantile([0.33, 0.66])
        tiers = np.where(likes <= low_cut, 'Low', np.where(likes <= high_cut, 'Medium', 'High'))
        tiers = pd.Series(tiers, index=self.df.index).where(likes.notna())
        self.df['performance'] = pd.Categorical(
            tiers, categories=self.PERFORMANCE_LABELS, ordered=True
        )

    def print_enhanced_summary(self):
        """Print dataset, engagement, content and best-period statistics to stdout."""
        print("=" * 60)
        print("🚀 INSTAGRAM ANALYTICS DASHBOARD")
        print("=" * 60)

        # Basic metrics
        total_posts = len(self.df)
        first_date, last_date = self.df['date'].min(), self.df['date'].max()
        date_range = f"{first_date.strftime('%Y-%m-%d')} → {last_date.strftime('%Y-%m-%d')}"
        total_likes = self.df['likes'].sum()
        avg_images = self.df['image_count'].mean()

        print("📊 Dataset Overview:")
        print(f"   ├─ Total Posts: {total_posts:,}")
        print(f"   ├─ Date Range: {date_range}")
        print(f"   ├─ Days Active: {(last_date - first_date).days}")
        # Averaged over months that contain at least one post, not calendar months
        print(f"   └─ Average Posts/Month: {total_posts / max(1, len(self.df['year_month'].unique())):.1f}")

        print("\n❤️ Engagement Metrics:")
        print(f"   ├─ Total Likes: {total_likes:,}")
        print(f"   ├─ Average Likes: {self.df['likes'].mean():.0f}")
        print(f"   ├─ Median Likes: {self.df['likes'].median():.0f}")
        print(f"   ├─ Like Rate Std: {self.df['likes'].std():.0f}")
        print(f"   └─ Best Performance: {self.df['likes'].max():,} likes")

        print("\n📸 Content Analysis:")
        print(f"   ├─ Avg Images/Post: {avg_images:.1f}")
        print(f"   ├─ Single Image Posts: {(~self.df['is_sidecar']).sum()} ({(~self.df['is_sidecar']).mean()*100:.1f}%)")
        print(f"   ├─ Multi-Image Posts: {self.df['is_sidecar'].sum()} ({self.df['is_sidecar'].mean()*100:.1f}%)")
        print(f"   └─ Avg Caption Length: {self.df['caption_length'].mean():.0f} chars")

        # Performance insights
        best_month = self.df.groupby('year_month')['likes'].mean().idxmax()
        best_weekday = self.df.groupby('weekday')['likes'].mean().idxmax()

        print("\n🏆 Performance Insights:")
        print(f"   ├─ Best Month: {best_month}")
        print(f"   ├─ Best Day: {best_weekday}")
        print(f"   ├─ Posts with Hashtags: {self.df['has_hashtags'].sum()} ({self.df['has_hashtags'].mean()*100:.1f}%)")
        print(f"   └─ Avg Hashtags/Post: {self.df['hashtag_count'].mean():.1f}")

    def create_advanced_visualizations(self):
        """Render the seven-panel dashboard figure and display it with ``plt.show()``."""
        # Local import: Patch is only needed for the legend of the last panel
        from matplotlib.patches import Patch

        # Create figure with custom layout
        fig = plt.figure(figsize=(20, 16))
        gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)

        # Color palette
        colors = sns.color_palette("husl", 8)

        # 1. Engagement Timeline (spans 2 columns)
        ax1 = fig.add_subplot(gs[0, :2])
        monthly_data = self.df.groupby('year_month').agg({
            'likes': ['mean', 'count'],
            'engagement_score': 'mean'
        }).round(2)
        monthly_data.columns = ['avg_likes', 'post_count', 'engagement_score']
        month_labels = monthly_data.index.astype(str)

        # Dual y-axis plot: average likes as a line, post count as bars
        ax1_twin = ax1.twinx()
        ax1.plot(month_labels, monthly_data['avg_likes'],
                 marker='o', linewidth=3, markersize=8, color=colors[0], label='Avg Likes')
        ax1_twin.bar(month_labels, monthly_data['post_count'],
                     alpha=0.3, color=colors[1], label='Post Count')

        ax1.set_title('📈 Engagement Timeline & Posting Frequency', fontsize=16, fontweight='bold', pad=20)
        ax1.set_ylabel('Average Likes', fontsize=12)
        ax1_twin.set_ylabel('Posts Count', fontsize=12)
        ax1.tick_params(axis='x', rotation=45)
        ax1.grid(True, alpha=0.3)

        # Combined legend for both y-axes
        lines1, labels1 = ax1.get_legend_handles_labels()
        lines2, labels2 = ax1_twin.get_legend_handles_labels()
        ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

        # 2. Likes Distribution with KDE (spans 2 columns)
        ax2 = fig.add_subplot(gs[0, 2:])
        sns.histplot(data=self.df, x='likes', bins=30, kde=True, alpha=0.7,
                     color=colors[2], ax=ax2, stat='density')
        mean_likes = self.df['likes'].mean()
        median_likes = self.df['likes'].median()
        ax2.axvline(mean_likes, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_likes:.0f}')
        ax2.axvline(median_likes, color='orange', linestyle='--', linewidth=2, label=f'Median: {median_likes:.0f}')
        ax2.set_title('❤️ Engagement Distribution Analysis', fontsize=16, fontweight='bold')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # 3. Content Performance Matrix
        ax3 = fig.add_subplot(gs[1, :2])
        # Force both content-type rows to exist: with only one type present the
        # crosstab has a single row and the two hard-coded tick labels below
        # would not match the heatmap.
        performance_matrix = (
            pd.crosstab(self.df['is_sidecar'], self.df['performance'], normalize='columns') * 100
        ).reindex([False, True], fill_value=0.0)
        sns.heatmap(performance_matrix, annot=True, fmt='.1f', cmap='RdYlGn',
                    ax=ax3, cbar_kws={'label': 'Percentage'})
        ax3.set_title('🎯 Content Type vs Performance Matrix', fontsize=16, fontweight='bold')
        ax3.set_xlabel('Performance Level')
        ax3.set_ylabel('Content Type')
        ax3.set_yticklabels(['Single Image', 'Multiple Images'], rotation=0)

        # 4. Weekday Performance Analysis
        ax4 = fig.add_subplot(gs[1, 2:])
        weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        # Weekdays without any post become NaN after reindex; show them as
        # zero-height bars labelled "0 posts" instead of NaN bars / "nan posts".
        weekday_stats = (
            self.df.groupby('weekday')['likes'].agg(['mean', 'count'])
            .reindex(weekday_order)
            .fillna(0)
        )
        weekday_stats['count'] = weekday_stats['count'].astype(int)

        weekday_bars = ax4.bar(weekday_stats.index, weekday_stats['mean'],
                               color=[colors[i] for i in range(len(weekday_order))], alpha=0.8)

        # Add count labels on bars
        for bar, count in zip(weekday_bars, weekday_stats['count']):
            height = bar.get_height()
            ax4.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
                     f'{count} posts', ha='center', va='bottom', fontsize=10)

        ax4.set_title('📅 Performance by Day of Week', fontsize=16, fontweight='bold')
        ax4.set_ylabel('Average Likes')
        ax4.tick_params(axis='x', rotation=45)
        ax4.grid(True, alpha=0.3, axis='y')

        # 5. Caption Length vs Engagement
        ax5 = fig.add_subplot(gs[2, :2])
        scatter = ax5.scatter(self.df['caption_length'], self.df['likes'],
                              c=self.df['hashtag_count'], cmap='viridis',
                              alpha=0.6, s=50)

        # A linear trend needs at least two distinct caption lengths; otherwise
        # the fit is rank-deficient and the line is meaningless.
        if self.df['caption_length'].nunique() > 1:
            trend = np.poly1d(np.polyfit(self.df['caption_length'], self.df['likes'], 1))
            ax5.plot(self.df['caption_length'], trend(self.df['caption_length']),
                     "r--", alpha=0.8, linewidth=2, label='Trend')
            ax5.legend()

        ax5.set_title('📝 Caption Analysis: Length vs Engagement', fontsize=16, fontweight='bold')
        ax5.set_xlabel('Caption Length (characters)')
        ax5.set_ylabel('Likes')
        ax5.grid(True, alpha=0.3)

        # Colorbar encodes the hashtag count of each point
        cbar = plt.colorbar(scatter, ax=ax5)
        cbar.set_label('Hashtag Count', rotation=270, labelpad=15)

        # 6. Image Count Distribution
        ax6 = fig.add_subplot(gs[2, 2:])
        image_count_data = self.df['image_count'].value_counts().sort_index()
        count_bars = ax6.bar(image_count_data.index, image_count_data.values,
                             color=colors[3], alpha=0.8, edgecolor='white', linewidth=1)

        # Add percentage labels
        total = len(self.df)
        for bar, count in zip(count_bars, image_count_data.values):
            percentage = (count / total) * 100
            ax6.text(bar.get_x() + bar.get_width()/2., bar.get_height() + count*0.01,
                     f'{percentage:.1f}%', ha='center', va='bottom', fontweight='bold')

        ax6.set_title('📊 Images per Post Distribution', fontsize=16, fontweight='bold')
        ax6.set_xlabel('Number of Images')
        ax6.set_ylabel('Number of Posts')
        ax6.grid(True, alpha=0.3, axis='y')

        # 7. Top Performers Analysis (spans full width)
        ax7 = fig.add_subplot(gs[3, :])
        top_posts = self.df.nlargest(10, 'likes')[['date', 'likes', 'image_count', 'caption_length', 'hashtag_count']]

        x_pos = np.arange(len(top_posts))
        top_bars = ax7.bar(x_pos, top_posts['likes'], color=colors[4], alpha=0.8)

        # Recolor bars by content type and annotate each with likes and date
        for rank, (_, row) in enumerate(top_posts.iterrows()):
            top_bars[rank].set_color(colors[5] if row['image_count'] > 1 else colors[4])
            ax7.text(rank, row['likes'] + row['likes']*0.01,
                     f"{row['likes']:,}\n{row['date'].strftime('%m/%d')}",
                     ha='center', va='bottom', fontsize=10, fontweight='bold')

        ax7.set_title('🏆 Top 10 Performing Posts Analysis', fontsize=16, fontweight='bold')
        ax7.set_xlabel('Post Rank')
        ax7.set_ylabel('Likes')
        ax7.set_xticks(x_pos)
        ax7.set_xticklabels([f'#{i+1}' for i in range(len(top_posts))])
        ax7.grid(True, alpha=0.3, axis='y')

        # Add legend for content types
        legend_elements = [Patch(facecolor=colors[4], label='Single Image'),
                           Patch(facecolor=colors[5], label='Multiple Images')]
        ax7.legend(handles=legend_elements, loc='upper right')

        plt.suptitle('🎨 Instagram Analytics Dashboard - Professional Edition',
                     fontsize=20, fontweight='bold', y=0.98)
        plt.tight_layout()
        plt.show()

    def print_advanced_insights(self):
        """Print content-type, hashtag, timing and recent-trend insights to stdout."""
        print("\n" + "="*60)
        print("🧠 ADVANCED INSIGHTS & RECOMMENDATIONS")
        print("="*60)

        # Content strategy insights
        single_likes = self.df.loc[~self.df['is_sidecar'], 'likes']
        multi_likes = self.df.loc[self.df['is_sidecar'], 'likes']

        def _avg_label(group):
            # An empty group has no mean; show "n/a" rather than "nan likes"
            return f"{group.mean():.0f} likes" if len(group) else "n/a"

        print("📈 Content Strategy:")
        print(f"   ├─ Single Image Avg: {_avg_label(single_likes)}")
        print(f"   ├─ Multiple Image Avg: {_avg_label(multi_likes)}")

        if single_likes.empty or multi_likes.empty:
            # Without both groups the comparison would print "nan% better"
            print("   └─ 💡 Only one content type present; no single vs multi-image comparison possible")
        else:
            single_img_avg = single_likes.mean()
            multi_img_avg = multi_likes.mean()
            if multi_img_avg > single_img_avg:
                winner, better, baseline = 'Multi-image', multi_img_avg, single_img_avg
            else:
                winner, better, baseline = 'Single-image', single_img_avg, multi_img_avg

            if baseline > 0:
                improvement = ((better - baseline) / baseline) * 100
                print(f"   └─ 💡 {winner} posts perform {improvement:.1f}% better!")
            elif better > 0:
                # A relative improvement over a 0-like baseline is undefined
                print(f"   └─ 💡 {winner} posts perform better (the other type averages 0 likes)")
            else:
                print("   └─ 💡 Both content types average 0 likes")

        # Hashtag analysis
        hashtag_correlation = self.df['hashtag_count'].corr(self.df['likes'])
        optimal_hashtags = self.df.groupby('hashtag_count')['likes'].mean().idxmax()

        print("\n#️⃣ Hashtag Strategy:")
        print(f"   ├─ Hashtag-Likes Correlation: {hashtag_correlation:.3f}")
        print(f"   ├─ Optimal Hashtag Count: {optimal_hashtags}")
        print(f"   └─ 💡 {'Strong positive' if hashtag_correlation > 0.3 else 'Weak' if hashtag_correlation > 0.1 else 'No significant'} correlation with engagement")

        # Timing insights
        best_hour = self.df.groupby('hour')['likes'].mean().idxmax()
        best_month = self.df.groupby('month')['likes'].mean().idxmax()

        print("\n⏰ Optimal Timing:")
        print(f"   ├─ Best Hour to Post: {best_hour}:00")
        # The year is arbitrary; the timestamp is only used to turn the month number into a name
        print(f"   ├─ Best Month: {pd.Timestamp(2024, int(best_month), 1).strftime('%B')}")
        print(f"   └─ 💡 Consider posting around {best_hour}:00 for maximum engagement")

        # Performance predictions
        print("\n🔮 Performance Predictions:")
        # Select the newest posts by date rather than by row position: the frame
        # is not guaranteed to be in chronological order (a newest-first frame
        # would make a plain tail() return the OLDEST posts).
        recent_trend = self.df.sort_values('date')['likes'].tail(self.RECENT_WINDOW).mean()
        overall_avg = self.df['likes'].mean()
        trend_direction = "📈 Improving" if recent_trend > overall_avg else "📉 Declining"

        print(f"   ├─ Recent Performance: {recent_trend:.0f} likes (last {self.RECENT_WINDOW} posts)")
        print(f"   ├─ Overall Average: {overall_avg:.0f} likes")
        print(f"   └─ Trend: {trend_direction}")

# Build the analyzer from the scraped posts and run its three reports in order:
# text summary, dashboard figure, then the insights section
analyzer = InstagramAnalytics(df_posts)
for report in (
    analyzer.print_enhanced_summary,
    analyzer.create_advanced_visualizations,
    analyzer.print_advanced_insights,
):
    report()