In [23]:
import numpy as np
import pandas as pd
import gzip
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.utils.class_weight import compute_class_weight
import nltk
from nltk.corpus import stopwords

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

class InfluencerFeatureExtractor:
    def __init__(self, max_features=5000):
        self.vectorizer = TfidfVectorizer(
            stop_words=turkish_stopwords,
            max_features=max_features,
            ngram_range=(1, 2)
        )
        self.scaler = RobustScaler()
        
    def preprocess_text(self, text: str) -> tuple:
        """Enhanced text preprocessing"""
        if text is None or not isinstance(text, str):
            return "", 0
            
        # lowercase convert
        text = text.casefold()
        
        emoji_count = sum(1 for c in text if ord(c) > 127)
        
        # remove urls
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # keep Turkish characters but remove other specials
        text = re.sub(r'[^a-zçğıöşü0-9\s#@]', ' ', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text, emoji_count

    def extract_profile_features(self, profile):
        """Extract numerical and categorical features from profile"""
        try:
            def safe_float(value):
                if value is None:
                    return 0.0
                try:
                    if isinstance(value, str):
                        value = value.replace(',', '')
                    return float(value)
                except (ValueError, TypeError):
                    return 0.0

            features = {
                'follower_count': safe_float(profile.get('follower_count')),
                'following_count': safe_float(profile.get('following_count')),
                'post_count': safe_float(profile.get('post_count')),
                'is_business': float(bool(profile.get('is_business_account'))),
                'is_private': float(bool(profile.get('is_private'))),
                'is_verified': float(bool(profile.get('is_verified'))),
                'has_website': float(bool(profile.get('external_url'))),
                'has_business_email': float(bool(profile.get('business_email'))),
                'bio_length': float(len(str(profile.get('biography', ''))))
            }
            
            # engagement ratios
            follower_count = features['follower_count']
            features['following_ratio'] = (
                features['following_count'] / follower_count if follower_count > 0 else 0.0
            )
            
            return features
        except Exception as e:
            print(f"Error processing profile {profile.get('username', 'unknown')}: {str(e)}")
            return {
                'follower_count': 0.0,
                'following_count': 0.0,
                'post_count': 0.0,
                'is_business': 0.0,
                'is_private': 0.0,
                'is_verified': 0.0,
                'has_website': 0.0,
                'has_business_email': 0.0,
                'bio_length': 0.0,
                'following_ratio': 0.0
            }

    def extract_post_features(self, posts):
        """Extract features from user posts"""
        if not posts:
            return {
                'avg_likes': 0,
                'std_likes': 0,
                'avg_comments': 0,
                'avg_caption_length': 0,
                'avg_emoji_count': 0,
                'avg_hashtags': 0,
                'avg_mentions': 0
            }
            
        likes = []
        comments = []
        caption_lengths = []
        emoji_counts = []
        hashtag_counts = []
        mention_counts = []
        
        for post in posts:
            # Engagement metrics
            like_count = post.get('like_count')
            if like_count is not None:
                likes.append(float(like_count))
                
            comment_count = post.get('comments_count')
            if comment_count is not None:
                comments.append(float(comment_count))
            
            # Caption analysis
            caption = post.get('caption', '')
            if caption:
                caption_text, emoji_count = self.preprocess_text(caption)
                caption_lengths.append(len(caption_text))
                emoji_counts.append(emoji_count)
                hashtag_counts.append(caption_text.count('#'))
                mention_counts.append(caption_text.count('@'))
        
        # Compute features
        features = {
            'avg_likes': float(np.mean(likes)) if likes else 0.0,
            'std_likes': float(np.std(likes)) if len(likes) > 1 else 0.0,
            'avg_comments': float(np.mean(comments)) if comments else 0.0,
            'avg_caption_length': float(np.mean(caption_lengths)) if caption_lengths else 0.0,
            'avg_emoji_count': float(np.mean(emoji_counts)) if emoji_counts else 0.0,
            'avg_hashtags': float(np.mean(hashtag_counts)) if hashtag_counts else 0.0,
            'avg_mentions': float(np.mean(mention_counts)) if mention_counts else 0.0
        }
        
        return features
    def fit_transform(self, username2posts, username2profile):
        """Fit and transform the data"""
        profile_features = []
        post_features = []
        post_texts = []
        usernames = []
        
        for username, profile in username2profile.items():
            usernames.append(username)
            
            # Extract profile features
            prof_feats = self.extract_profile_features(profile)
            profile_features.append(prof_feats)
            
            # Extract post features
            posts = username2posts.get(username, [])
            post_feats = self.extract_post_features(posts)
            post_features.append(post_feats)
            
            # Aggregate post text
            post_text = "\n".join([
                self.preprocess_text(post.get('caption', ''))[0]
                for post in posts if post.get('caption')
            ])
            post_texts.append(post_text)
        
        # Convert to DataFrames
        profile_df = pd.DataFrame(profile_features, index=usernames)
        post_df = pd.DataFrame(post_features, index=usernames)
        
        # TF-IDF for text
        text_features = self.vectorizer.fit_transform(post_texts)
        text_df = pd.DataFrame(
            text_features.toarray(),
            columns=self.vectorizer.get_feature_names_out(),
            index=usernames
        )
        
        # Combine features
        combined_df = pd.concat([profile_df, post_df, text_df], axis=1)
        
        # Scale numerical features
        numerical_cols = profile_df.columns.tolist() + post_df.columns.tolist()
        
        # Log transform heavily skewed features
        skewed_features = ['follower_count', 'following_count', 'post_count', 
                          'avg_likes', 'avg_comments']
        for col in skewed_features:
            if col in numerical_cols:
                combined_df[col] = np.log1p(combined_df[col].clip(lower=0))
        
        # Handle following_ratio separately
        if 'following_ratio' in numerical_cols:
            ratio_99th = np.percentile(combined_df['following_ratio'], 99)
            combined_df['following_ratio'] = combined_df['following_ratio'].clip(upper=ratio_99th)
            combined_df['following_ratio'] = np.log1p(combined_df['following_ratio'].clip(lower=0))
        
        # Apply robust scaling
        combined_df[numerical_cols] = self.scaler.fit_transform(combined_df[numerical_cols])
        
        return combined_df

    def transform(self, username2posts, username2profile):
        """Transform new data using fitted parameters"""
        profile_features = []
        post_features = []
        post_texts = []
        usernames = []
        
        for username, profile in username2profile.items():
            usernames.append(username)
            
            # Extract features
            prof_feats = self.extract_profile_features(profile)
            profile_features.append(prof_feats)
            
            posts = username2posts.get(username, [])
            post_feats = self.extract_post_features(posts)
            post_features.append(post_feats)
            
            post_text = "\n".join([
                self.preprocess_text(post.get('caption', ''))[0]
                for post in posts if post.get('caption')
            ])
            post_texts.append(post_text)
        
        # Convert to DataFrames
        profile_df = pd.DataFrame(profile_features, index=usernames)
        post_df = pd.DataFrame(post_features, index=usernames)
        
        # Generate text features
        text_features = self.vectorizer.transform(post_texts)
        text_df = pd.DataFrame(
            text_features.toarray(),
            columns=self.vectorizer.get_feature_names_out(),
            index=usernames
        )
        
        # Combine features
        combined_df = pd.concat([profile_df, post_df, text_df], axis=1)
        
        # Scale numerical features
        numerical_cols = profile_df.columns.tolist() + post_df.columns.tolist()
        
        # Apply transformations
        for col in ['follower_count', 'following_count', 'post_count', 
                   'avg_likes', 'avg_comments']:
            if col in numerical_cols:
                combined_df[col] = np.log1p(combined_df[col].clip(lower=0))
        
        if 'following_ratio' in numerical_cols:
            ratio_99th = np.percentile(combined_df['following_ratio'], 99)
            combined_df['following_ratio'] = combined_df['following_ratio'].clip(upper=ratio_99th)
            combined_df['following_ratio'] = np.log1p(combined_df['following_ratio'].clip(lower=0))
        
        combined_df[numerical_cols] = self.scaler.transform(combined_df[numerical_cols])
        
        return combined_df
    
def load_data():
    """Load and preprocess training and test data with combined annotations"""
    print("Loading data...")
    
    # Load existing training classification labels
    train_classification_df = pd.read_csv('train-classification.csv')
    train_classification_df = train_classification_df.rename(
        columns={'Unnamed: 0': 'user_id', 'label': 'category'}
    )
    train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
    
    print("\nBase training data distribution:")
    print(train_classification_df['category'].value_counts())
    
    # Load my annotations
    annotated_df = pd.read_csv('annotated_users_CS412-411c414b075a.csv')
    annotated_df['username'] = annotated_df['url'].apply(
        lambda x: x.split('instagram.com/')[-1].split('&')[0]
    )
    annotated_df['category'] = annotated_df['influencerCategory'].str.lower()
    
    print("\nAnnotated data distribution:")
    print(annotated_df['category'].value_counts())
    
    # Combine base training data with my annotated data
    base_username2category = train_classification_df.set_index("user_id")["category"].to_dict()
    annotated_username2category = annotated_df.set_index('username')['category'].to_dict()
    
    # Merge the dictionaries, prioritizing annotated data if there's overlap
    username2category = {**base_username2category, **annotated_username2category}
    
    print("\nCombined distribution:")
    combined_categories = pd.Series(username2category.values()).value_counts()
    print(combined_categories)
    print(f"\nTotal number of labeled profiles: {len(username2category)}")

    # Load and match profile data
    username2posts_train = {}
    username2profile_train = {}
    processed_usernames = set()
    
    with gzip.open('training-dataset.jsonl.gz', "rt") as fh:
        for line in fh:
            sample = json.loads(line)
            profile = sample["profile"]
            username = profile["username"]
            
            if username in username2category and username not in processed_usernames:
                username2posts_train[username] = sample["posts"]
                username2profile_train[username] = profile
                processed_usernames.add(username)

    print(f"\nMatched {len(username2posts_train)} profiles with posts and profile data")
    
    final_categories = [username2category[username] for username in username2posts_train.keys()]
    print("\nFinal training data distribution:")
    print(pd.Series(final_categories).value_counts())

    # Load test data
    test_users = []
    with open('test-classification-round3.dat', 'r') as f:
        test_users = [line.strip() for line in f]
    
    test_posts = []
    try:
        with open('test-regression-round3.jsonl', 'r') as f:
            for line in f:
                if line.strip():
                    post = json.loads(line)
                    test_posts.append(post)
    except Exception as e:
        print(f"Error loading test regression data: {str(e)}")

    # Load test profiles
    username2posts_test = {}
    username2profile_test = {}
    
    with gzip.open('training-dataset.jsonl.gz', "rt") as fh:
        for line in fh:
            sample = json.loads(line)
            profile = sample["profile"]
            username = profile["username"]
            
            if username in test_users:
                username2posts_test[username] = sample["posts"]
                username2profile_test[username] = profile

    print(f"\nTest data loaded: {len(username2posts_test)} profiles")

    return (username2posts_train, username2profile_train,
            username2posts_test, username2profile_test,
            username2category, test_posts)

def prepare_regression_data(username2posts, username2profile):
    """Prepare the regression dataset"""
    print("\n=== PREPARING REGRESSION DATA ===")
    features = []
    targets = []
    
    def safe_float(value, default=0.0):
        if value is None:
            return default
        try:
            return float(value)
        except (ValueError, TypeError):
            return default

    def safe_len(value, default=0):
        if value is None:
            return default
        try:
            return len(str(value))
        except (ValueError, TypeError):
            return default

    for username, posts in username2posts.items():
        if not posts:  # Skip if no post
            continue
            
        profile = username2profile.get(username, {})
        
        for post in posts:
            if not isinstance(post, dict):  # Skip if post is not a dictionary
                continue
                
            post_features = {
                # Profile features
                'follower_count': safe_float(profile.get('follower_count')),
                'following_count': safe_float(profile.get('following_count')),
                'post_count': safe_float(profile.get('post_count')),
                'is_verified': float(profile.get('is_verified', False)),
                'is_business': float(profile.get('is_business_account', False)),
                
                # Post features
                'caption_length': safe_len(post.get('caption')),
                'media_type_is_video': float(post.get('media_type') == 'VIDEO'),
                'media_type_is_carousel': float(post.get('media_type') == 'CAROUSEL_ALBUM'),
                'comments_count': safe_float(post.get('comments_count')),
            }
            
            # Get target
            like_count = safe_float(post.get('like_count'))
            
            if like_count > 0:  # Only include posts with valid like counts
                features.append(list(post_features.values()))
                targets.append(np.log10(like_count + 1))  # Log transform
    
    if not features:  # Check if we have any valid features
        raise ValueError("No valid features could be extracted from the data")
    
    X = np.array(features)
    y = np.array(targets)
    
    print(f"Regression dataset shape: X={X.shape}, y={y.shape}")
    return X, y

def train_classification_model(X_train, y_train):
    """Train the classification model using SMOTE for class balancing"""
    print("\n=== TRAINING CLASSIFICATION MODEL WITH SMOTE ===")
    
    # Convert string labels to numerical using LabelEncoder
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    
    # Print original class distribution
    print("\nOriginal class distribution:")
    for label, count in zip(le.classes_, np.bincount(y_train_encoded)):
        print(f"{label}: {count}")

    # Apply SMOTE to balance the dataset
    smote = SMOTE(random_state=42, k_neighbors=5)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_encoded)
    
    print("\nClass distribution after SMOTE:")
    for label, count in zip(le.classes_, np.bincount(y_train_resampled)):
        print(f"{label}: {count}")
    
    # Random Forest classifier
    rf = RandomForestClassifier(
        random_state=42,
        n_jobs=-1
    )
    
    # parameters for RandomForest
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }
    
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,
        scoring='balanced_accuracy',
        n_jobs=-1,
        verbose=1
    )
    
    # Perform grid search on Smote data
    print("\nPerforming grid search with SMOTE-balanced data...")
    grid_search.fit(X_train_resampled, y_train_resampled)
    
    # Print results
    print("\nBest parameters found:")
    print(grid_search.best_params_)
    print(f"\nBest cross-validation balanced accuracy score: {grid_search.best_score_:.4f}")
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Print feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 most important features for classification:")
    print(feature_importance.head(10))
    
    # Create a wrapped model that includes label encoding
    class WrappedModel:
        def __init__(self, model, label_encoder, smote):
            self.model = model
            self.label_encoder = label_encoder
            self.smote = smote
        
        def predict(self, X):
            y_pred_encoded = self.model.predict(X)
            return self.label_encoder.inverse_transform(y_pred_encoded)
        
        @property
        def feature_importances_(self):
            return self.model.feature_importances_
    
    return WrappedModel(best_model, le, smote)
def train_regression_model(X_train_reg, y_train_reg):
    """Train the regression model with hyperparameter tuning"""
    print("\n=== TRAINING REGRESSION MODEL ===")
    
    # Define feature names
    feature_names = [
        'follower_count',
        'following_count',
        'post_count',
        'is_verified',
        'is_business',
        'caption_length',
        'media_type_is_video',
        'media_type_is_carousel',
        'comments_count'
    ]
    
    # RandomForest Regressor parameters
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }
    
    # Initialize base regressor
    base_reg = RandomForestRegressor(
        random_state=42,
        n_jobs=-1
    )
    
    grid_search = GridSearchCV(
        estimator=base_reg,
        param_grid=param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=1
    )
    
    # grid search
    print("\nPerforming grid search...")
    grid_search.fit(X_train_reg, y_train_reg)
    
    print("\nBest parameters found:")
    print(grid_search.best_params_)
    print(f"\nBest cross-validation R² score: {grid_search.best_score_:.4f}")
    
    # Get best model
    reg = grid_search.best_estimator_
    
    # Print feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_names[:X_train_reg.shape[1]],
        'importance': reg.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop features importance for regression:")
    print(feature_importance)
    
    return reg

def extract_post_regression_features(post, profile):
    """Extract features for regression prediction of a single post"""
    def safe_float(value, default=0.0):
        if value is None:
            return default
        try:
            return float(value)
        except (ValueError, TypeError):
            return default

    def safe_len(value, default=0):
        if value is None:
            return default
        try:
            return len(str(value))
        except (ValueError, TypeError):
            return default

    features = {
        # Profile features
        'follower_count': safe_float(profile.get('follower_count')),
        'following_count': safe_float(profile.get('following_count')),
        'post_count': safe_float(profile.get('post_count')),
        'is_verified': float(profile.get('is_verified', False)),
        'is_business': float(profile.get('is_business_account', False)),
        
        # Post features
        'caption_length': safe_len(post.get('caption')),
        'media_type_is_video': float(post.get('media_type') == 'VIDEO'),
        'media_type_is_carousel': float(post.get('media_type') == 'CAROUSEL_ALBUM'),
        'comments_count': safe_float(post.get('comments_count')),
    }
    
    return list(features.values())

def evaluate_models(clf, reg, X_val, y_val, X_val_reg, y_val_reg):
    """Evaluate both classification and regression models"""
    print("\n=== MODEL EVALUATION ===")
    
    # evaluate classification
    y_pred = clf.predict(X_val)
    class_acc = accuracy_score(y_val, y_pred)
    print(f"\nClassification Accuracy: {class_acc:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, zero_division=0))
    
    # evaluate reggresion
    y_pred_reg = reg.predict(X_val_reg)
    mse = mean_squared_error(y_val_reg, y_pred_reg)
    mae = np.mean(np.abs(y_val_reg - y_pred_reg))
    r2 = reg.score(X_val_reg, y_val_reg)
    
    print(f"\nRegression Metrics:")
    print(f"MSE (log10): {mse:.4f}") # this is what we look at
    print(f"MAE (log10): {mae:.4f}")
    print(f"R² Score: {r2:.4f}")

def generate_predictions(clf, reg, feature_extractor, username2posts_test, username2profile_test, test_posts):
    """Generate predictions for test data"""
    print("\n=== GENERATING PREDICTIONS ===")
    
    # classification predictions
    print("\nGenerating classification predictions...")
    X_test = feature_extractor.transform(username2posts_test, username2profile_test)
    classification_predictions = {}
    
    for username in username2posts_test:
        if username in X_test.index:
            pred = clf.predict(X_test.loc[[username]])[0]
            classification_predictions[username] = pred
    
    # regression predictions
    print("\nGenerating regression predictions...")
    regression_predictions = {}
    
    for post in test_posts:
        post_id = str(post.get('id'))
        if post_id:
            username = post.get('username')
            profile = username2profile_test.get(username, {})
            
            post_features = extract_post_regression_features(post, profile)
            if post_features is not None:
                pred_log = reg.predict([post_features])[0]
                pred = int(np.power(10, pred_log))
                regression_predictions[post_id] = pred
    
    return classification_predictions, regression_predictions
def save_predictions(predictions, filename):
    """Save predictions to JSON file"""
    print(f"\nSaving predictions to {filename}")
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(predictions, f, ensure_ascii=False, indent=2)
    print(f"Successfully saved predictions to {filename}")

def main():
    """Main execution function"""
    print("Starting Instagram Influencer Classification and Regression Task...")
    
    # load all data
    (username2posts_train, username2profile_train,
     username2posts_test, username2profile_test,
     username2category, test_posts) = load_data()
    
    feature_extractor = InfluencerFeatureExtractor()
    
    # prepare training data for classification
    print("\nPreparing classification training data...")
    X = feature_extractor.fit_transform(username2posts_train, username2profile_train)
    y = [username2category[username] for username in X.index]
    
    # Split for validation
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    # train classification model
    clf = train_classification_model(X_train, y_train)
    
    # same process for reg
    print("\nPreparing regression data...")
    X_train_reg, y_train_reg = prepare_regression_data(
        username2posts_train, username2profile_train
    )
    X_train_reg, X_val_reg, y_train_reg, y_val_reg = train_test_split(
        X_train_reg, y_train_reg, test_size=0.2, random_state=42
    )
    reg = train_regression_model(X_train_reg, y_train_reg)
    
    # evaluate models
    evaluate_models(clf, reg, X_val, y_val, X_val_reg, y_val_reg)
    
    # generate predictions for round test data
    class_predictions, reg_predictions = generate_predictions(
        clf, reg, feature_extractor,
        username2posts_test, username2profile_test,
        test_posts
    )
    # save it
    save_predictions(class_predictions, 'prediction-classification-round3.json')
    save_predictions(reg_predictions, 'prediction-regression-round3.json')
    
    print("\nTask completed successfully!")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cagrisar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Starting Instagram Influencer Classification and Regression Task...
Loading data...

Base training data distribution:
category
food                    511
health and lifestyle    503
tech                    346
entertainment           323
fashion                 299
travel                  294
art                     191
mom and children        149
sports                  113
gaming                   13
Name: count, dtype: int64

Annotated data distribution:
category
entertainment           27
food                    23
health and lifestyle    23
art                     15
sports                  14
fashion                 14
tech                    13
travel                   7
gaming                   7
mom and children         5
Name: count, dtype: int64

Combined distribution:
food                    520
health and lifestyle    514
tech                    353
entertainment           333
fashion                 303
travel                  294
art                     194
mom and chil




Best parameters found:
{'bootstrap': False, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Best cross-validation balanced accuracy score: 0.8914

Top 10 most important features for classification:
                 feature  importance
2729              lezzet    0.008850
0         follower_count    0.005116
11             std_likes    0.004451
14       avg_emoji_count    0.004196
13    avg_caption_length    0.003959
699                bebek    0.003951
2739            lezzetli    0.003917
10             avg_likes    0.003857
2960              mobile    0.003797
2732             lezzeti    0.003746

Preparing regression data...

=== PREPARING REGRESSION DATA ===
Regression dataset shape: X=(92260, 9), y=(92260,)

=== TRAINING REGRESSION MODEL ===

Performing grid search...
Fitting 5 folds for each of 162 candidates, totalling 810 fits





Best parameters found:
{'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}

Best cross-validation R² score: 0.9167

Top features importance for regression:
                  feature  importance
8          comments_count    0.394470
0          follower_count    0.322411
3             is_verified    0.086875
1         following_count    0.066567
5          caption_length    0.045201
2              post_count    0.042829
4             is_business    0.030433
7  media_type_is_carousel    0.006879
6     media_type_is_video    0.004335

=== MODEL EVALUATION ===

Classification Accuracy: 0.6180

Classification Report:
                      precision    recall  f1-score   support

                 art       0.38      0.13      0.19        39
       entertainment       0.37      0.38      0.37        66
             fashion       0.58      0.65      0.61        60
                food       0.79      0.93      0.85       103
         