In [8]:
import pandas as pd
import numpy as np

def analyze_video_performance(csv_path):
    """
    Summarise the 'Performance' column of a video CSV and export the results.

    Rows whose 'Performance' value cannot be parsed as a number are dropped
    before any statistic is computed.

    Side effects (both files are written to the current working directory):
        performance_summary.txt: overall statistics, the percentage of videos
            in four equal-width performance bins (listed Low -> High), and the
            top/bottom 5 videos.
        performance_details.csv: the rows that survived the numeric clean-up.

    Args:
        csv_path (str): Path to a CSV file with 'Performance' and
            'Video URL' columns.

    Returns:
        dict | None: Statistic name -> value ('Total Videos',
        'Average Performance', 'Median Performance', 'Performance Std Dev',
        'Min Performance', 'Max Performance'), or None when anything fails
        (the error is printed instead of being raised).
    """
    bin_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']

    try:
        videos = pd.read_csv(csv_path)

        # Coerce unparseable values to NaN so they can be dropped in one step.
        videos['Performance'] = pd.to_numeric(videos['Performance'], errors='coerce')
        videos = videos.dropna(subset=['Performance'])
        if videos.empty:
            # Fail with a clear message instead of a cryptic binning error below.
            raise ValueError("no rows with a numeric 'Performance' value")

        scores = videos['Performance']
        performance_analysis = {
            'Total Videos': len(videos),
            'Average Performance': scores.mean(),
            'Median Performance': scores.median(),
            'Performance Std Dev': scores.std(),
            'Min Performance': scores.min(),
            'Max Performance': scores.max()
        }

        # Top 5 and bottom 5 performing videos
        top_5 = videos.nlargest(5, 'Performance')[['Performance', 'Video URL']]
        bottom_5 = videos.nsmallest(5, 'Performance')[['Performance', 'Video URL']]

        # Share of videos per equal-width bin. value_counts() orders by
        # frequency, so re-sort by category to report Low -> High.
        performance_bins = pd.cut(scores, bins=len(bin_labels), labels=bin_labels)
        performance_distribution = (
            performance_bins.value_counts(normalize=True).sort_index() * 100
        )

        # Explicit encoding so non-ASCII URLs are written the same way on
        # every platform.
        with open('performance_summary.txt', 'w', encoding='utf-8') as f:
            f.write("Performance Analysis Summary\n")
            f.write("==========================\n\n")
            f.write("Overall Statistics:\n")
            for stat, value in performance_analysis.items():
                if isinstance(value, int):
                    # Counts are whole numbers; don't render them as "5.00".
                    f.write(f"{stat}: {value}\n")
                else:
                    f.write(f"{stat}: {value:.2f}\n")

            f.write("\n\nPerformance Distribution:\n")
            for category, percentage in performance_distribution.items():
                f.write(f"{category}: {percentage:.2f}%\n")

            f.write("\n\nTop 5 Performing Videos:\n")
            f.write(top_5.to_string())

            f.write("\n\nBottom 5 Performing Videos:\n")
            f.write(bottom_5.to_string())

        # Save detailed data
        videos.to_csv('performance_details.csv', index=False)

        return performance_analysis

    except Exception as e:
        # Best-effort by design: callers test the return value for None.
        print(f"Error analyzing performance: {e}")
        return None

def main():
    """Analyse 'assignmentData.csv' and, on success, say where the results were written."""
    result = analyze_video_performance('assignmentData.csv')

    # A falsy result (None on failure) means there is nothing to announce.
    if not result:
        return

    for message in (
        "Performance Analysis Complete!",
        "Check 'performance_summary.txt' for detailed insights",
        "Check 'performance_details.csv' for full dataset",
    ):
        print(message)

# Entry point: calls main() when this code runs as the top-level module.
# The captured cell output below shows that this includes executing the cell
# in the notebook.
if __name__ == '__main__':
    main()

# README
"""
Video Performance Analysis Tool

Key Outputs:
1. performance_summary.txt
   - Overall performance statistics
   - Performance distribution
   - Top and bottom performing videos

2. performance_details.csv
   - Full dataset with performance details

Prerequisites:
- Install pandas: pip install pandas

Usage:
1. Ensure CSV has 'Performance' and 'Video URL' columns
2. Replace 'assignmentData.csv' in main() with your file path
3. Run the script
"""

Performance Analysis Complete!
Check 'performance_summary.txt' for detailed insights
Check 'performance_details.csv' for full dataset


"\nVideo Performance Analysis Tool\n\nKey Outputs:\n1. performance_summary.txt\n   - Overall performance statistics\n   - Performance distribution\n   - Top and bottom performing videos\n\n2. performance_details.csv\n   - Full dataset with performance details\n\nPrerequisites:\n- Install pandas: pip install pandas\n\nUsage:\n1. Ensure CSV has 'Performance' and 'Video URL' columns\n2. Replace 'performance_data.csv' with your file path\n3. Run the script\n"

In [3]:
# Step 1: Extract frames from videos
def extract_frames(video_path, output_folder, interval=2, fallback_fps=30.0):
    """Save one frame roughly every `interval` seconds of a video as JPEG files.

    Frames are written to `output_folder` (created if missing) as
    ``frame_<index>.jpg``, where ``<index>`` is the zero-based position of the
    frame in the video.

    Args:
        video_path: Path (or anything cv2.VideoCapture accepts) of the video.
        output_folder: Directory that receives the extracted frames.
        interval: Seconds between saved frames; must be positive.
        fallback_fps: Frame rate assumed when the capture does not report a
            positive one; must be positive.

    Returns:
        None. Progress is reported with print().

    Raises:
        ValueError: If `interval` or `fallback_fps` is not positive.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval!r}")
    if fallback_fps <= 0:
        raise ValueError(f"fallback_fps must be positive, got {fallback_fps!r}")

    os.makedirs(output_folder, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Don't claim success when nothing could be read.
            print(f"Could not open video {video_path}; no frames extracted")
            return

        frame_rate = cap.get(cv2.CAP_PROP_FPS)  # Frame rate of the video
        if not (frame_rate > 0):
            # A zero/NaN frame rate would make the modulo below divide by zero.
            print(f"Frame rate unavailable for {video_path}; assuming {fallback_fps} fps")
            frame_rate = fallback_fps
        frames_per_sample = frame_rate * interval

        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # True for exactly one frame index in each `frames_per_sample`-wide
            # window, which also copes with fractional frame rates.
            if int(count % frames_per_sample) == 0:
                frame_filename = os.path.join(output_folder, f"frame_{count}.jpg")
                cv2.imwrite(frame_filename, frame)
            count += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()
    print(f"Frames saved to {output_folder}")

In [4]:
# Step 2: Detect faces and extract features
def get_face_embeddings(image_path):
    """Return the encoding of the first face detected in an image file.

    Args:
        image_path: Path of the image to load with face_recognition.

    Returns:
        The first entry of face_recognition.face_encodings() for the image,
        or None when no face encoding was produced.
    """
    pixels = face_recognition.load_image_file(image_path)
    boxes = face_recognition.face_locations(pixels)
    encodings = face_recognition.face_encodings(pixels, boxes)
    # Only the first reported face is used; any further faces are ignored.
    return encodings[0] if encodings else None

# Built lazily on first use and then shared by every call.
_RESNET50_FEATURE_MODEL = None


def _get_resnet50_feature_model():
    """Return the shared ResNet50 feature extractor, building it on first use."""
    global _RESNET50_FEATURE_MODEL
    if _RESNET50_FEATURE_MODEL is None:
        _RESNET50_FEATURE_MODEL = ResNet50(weights='imagenet', include_top=False, pooling='avg')
    return _RESNET50_FEATURE_MODEL


def extract_visual_features(image_path):
    """Return a flat ResNet50 feature vector for one image file.

    The image is loaded at 224x224, given a leading batch axis, run through
    `preprocess_input`, and passed to a ResNet50 built with ImageNet weights,
    no top layers and average pooling. The network is constructed once and
    reused, because rebuilding it for every frame repeats the weight loading
    for no benefit.

    Args:
        image_path: Path of the image to featurise.

    Returns:
        The model's prediction for the image, flattened to one dimension.
    """
    img = image.load_img(image_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # the model predicts on batches
    x = preprocess_input(x)
    features = _get_resnet50_feature_model().predict(x)
    return features.flatten()

In [5]:
# Step 3: Clustering the face embeddings and visual features
def cluster_faces_and_visuals(embeddings, features, eps=0.5, min_samples=2):
    """Cluster samples with DBSCAN on their embeddings and features joined column-wise.

    Args:
        embeddings: Per-sample embedding rows (2-D, one row per sample).
        features: Per-sample feature rows with the same number of rows.
        eps: DBSCAN `eps` parameter.
        min_samples: DBSCAN `min_samples` parameter.

    Returns:
        The labels returned by DBSCAN.fit_predict, one per row.
    """
    # Each row becomes [embedding | features] before clustering.
    stacked = np.concatenate((embeddings, features), axis=1)
    dbscan = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
    return dbscan.fit_predict(stacked)

In [6]:
# Step 4: Calculate average performance for each influencer (cluster)
def calculate_avg_performance(df, cluster_labels):
    """Return the mean 'Performance' for each cluster label.

    Note: the labels are stored on `df` itself as a 'cluster' column, so the
    caller's DataFrame is modified in place.

    Args:
        df: DataFrame with a 'Performance' column.
        cluster_labels: One label per row of `df`.

    Returns:
        Series of mean performance indexed by 'cluster'.
    """
    df['cluster'] = cluster_labels
    per_cluster = df.groupby('cluster')
    return per_cluster['Performance'].mean()

In [7]:
# Step 5: Complete pipeline
def _stack_with_zero_fill(vectors):
    """Stack per-frame vectors into a 2-D array, with an all-zero row wherever the entry is None."""
    present = [np.asarray(vec, dtype=float).ravel() for vec in vectors if vec is not None]
    width = present[0].shape[0] if present else 0
    matrix = np.zeros((len(vectors), width))
    filled = iter(present)
    for row, vec in enumerate(vectors):
        if vec is not None:
            matrix[row] = next(filled)
    return matrix


def process_videos_and_calculate_performance(video_urls, output_folder):
    """Cluster video frames and report the average performance per cluster.

    For every URL the video is downloaded, frames are extracted into
    ``<output_folder>/<basename of the URL>``, and each frame is described by
    its face encoding or, when no face is found, by its visual features.
    Every frame becomes one row carrying its video's URL and performance, so
    the rows line up one-to-one with the cluster labels.

    The face and visual vectors are kept in two separate matrices, with zero
    rows where a frame has no vector of that kind. This keeps both matrices
    rectangular even when only some frames contain a face.

    Args:
        video_urls: Iterable of video URLs.
        output_folder: Directory under which per-video frame folders are made.

    Returns:
        Series of mean 'Performance' per cluster, as produced by
        calculate_avg_performance. Averages are taken over frames, so a video
        contributes once per extracted frame. An empty Series is returned
        when no frames were extracted. The result is also printed.
    """
    frame_urls = []
    frame_performance = []
    face_vectors = []    # face encoding per frame, None when no face was found
    visual_vectors = []  # fallback visual features per frame, None when a face was found

    for video_url in video_urls:
        video_path = download_video(video_url)
        frames_folder = os.path.join(output_folder, os.path.basename(video_url))
        extract_frames(video_path, frames_folder)
        # Looked up once per video instead of once per frame.
        # NOTE(review): assumes repeated lookups for one URL return the same value.
        performance = fetch_video_performance(video_url)

        # Sorted so the row order does not depend on the file system.
        for frame_file in sorted(os.listdir(frames_folder)):
            frame_path = os.path.join(frames_folder, frame_file)
            face = get_face_embeddings(frame_path)  # Try face_recognition first
            # Fall back to visual features only when no face was found.
            visual = extract_visual_features(frame_path) if face is None else None

            face_vectors.append(face)
            visual_vectors.append(visual)
            frame_urls.append(video_url)
            frame_performance.append(performance)

    if not frame_urls:
        print("No frames were extracted; nothing to cluster")
        return pd.Series(dtype=float, name='Performance')

    # Perform clustering on the per-frame vectors
    cluster_labels = cluster_faces_and_visuals(
        _stack_with_zero_fill(face_vectors),
        _stack_with_zero_fill(visual_vectors),
    )
    # One row per frame, matching the length of cluster_labels.
    df = pd.DataFrame({'Video URL': frame_urls, 'Performance': frame_performance})
    avg_performance = calculate_avg_performance(df, cluster_labels)

    print(avg_performance)
    return avg_performance