In [1]:
import pandas as pd
import numpy as np
import os
import itertools

DATA_DIR = r'C:\Users\Sahar\Desktop\Clickbait project\Dataset\data'

def load_data():
    data = {}
    id_cols = ['videoid', 'channelid', 'userid', 'uuid', 'hashedvideoid']
    
    for fname in os.listdir(DATA_DIR):
        if fname.lower().endswith('.csv'):
            fp = os.path.join(DATA_DIR, fname)
            try:
                df = pd.read_csv(fp, low_memory=False)
                df.columns = df.columns.str.lower().str.replace('[^a-z0-9]', '', regex=True)
                for c in df.columns:
                    if c in id_cols or c.endswith('id') or c.endswith('uuid'):
                        df[c] = df[c].astype(str).str.strip()
                data[fname] = df
                print(f"Loaded {fname} ({len(df):,} rows)")
            except Exception as e:
                print(f"Error reading {fname}: {e}")
    return data

dataset = load_data()


Loaded casualVotes.csv (21,590 rows)
Loaded casualVoteTitles.csv (12,571 rows)
Loaded categoryVotes.csv (491,806 rows)
Loaded lockCategories.csv (163,248 rows)
Loaded ratings.csv (9,848 rows)
Loaded sponsorTimes.csv (5,231,393 rows)
Loaded thumbnails.csv (182,406 rows)
Loaded thumbnailTimestamps.csv (157,264 rows)
Loaded thumbnailVotes.csv (182,401 rows)
Loaded titles.csv (493,305 rows)
Loaded titleVotes.csv (493,235 rows)
Loaded unlistedVideos.csv (121,403 rows)
Loaded userNames.csv (413,388 rows)
Loaded videoInfo.csv (9,846,585 rows)
Loaded vipUsers.csv (123 rows)


In [4]:
import time

def analyze_relations(data, sample_size=500):
    results = []
    start_time = time.time()
    
    id_keywords = {'id', 'uuid', 'hash', 'user', 'video', 'channel'}
    
    for f1, f2 in itertools.combinations(data.keys(), 2):
        df1, df2 = data[f1], data[f2]
        
        id_cols_f1 = [c for c in df1.columns if any(kw in c.lower() for kw in id_keywords)]
        id_cols_f2 = [c for c in df2.columns if any(kw in c.lower() for kw in id_keywords)]
        
        if not id_cols_f1 or not id_cols_f2:
            continue
        
        for c1 in id_cols_f1:
            col_data = df1[c1].dropna().unique()
            if len(col_data) < 2 or len(col_data) > 100000:
                continue
                
            set1 = set(col_data[:sample_size])
            
            for c2 in id_cols_f2:
                col_data2 = df2[c2].dropna().unique()
                if len(col_data2) < 2 or len(col_data2) > 100000:
                    continue
                    
                set2 = set(col_data2[:sample_size])
                
                intersect = set1.intersection(set2)
                if not intersect or len(intersect) < 2:
                    continue

                len1, len2, len_int = len(set1), len(set2), len(intersect)
                smaller = min(len1, len2)
                coverage = len_int / smaller * 100 if smaller > 0 else 0

                rel_type = None
                if len1 == len2 == len_int:
                    rel_type = 'Identity'
                elif len_int == len1 < len2:
                    rel_type = 'Subset A in B'
                elif len_int == len2 < len1:
                    rel_type = 'Subset A in B'
                    f1, f2, c1, c2 = f2, f1, c2, c1
                elif coverage >= 5.0:
                    rel_type = 'Intersection'
                
                if rel_type:
                    results.append({'File 1': f1, 'Column 1': c1, 'File 2': f2, 'Column 2': c2, 
                                   'Overlap': len_int, 'Coverage': round(coverage, 2), 'Type': rel_type})
    
    elapsed = time.time() - start_time
    print(f"Analysis completed in {elapsed:.1f}s. Found {len(results)} relations.")
    return results

print("Starting fast relation analysis (ID columns only, 500 sample)...")
relations = analyze_relations(dataset, sample_size=500)
print(f"\nResults preview:")
if relations:
    rel_df = pd.DataFrame(relations[:10])
    print(rel_df.to_string())

Starting fast relation analysis (ID columns only, 500 sample)...
Analysis completed in 669.7s. Found 19 relations.

Results preview:
                 File 1      Column 1                File 2      Column 2  Overlap  Coverage          Type
0       casualVotes.csv       titleid  casualVoteTitles.csv            id       27    100.00      Identity
1       casualVotes.csv       titleid           ratings.csv            id       26     96.30  Intersection
2  casualVoteTitles.csv            id           ratings.csv            id       26     96.30  Intersection
3    lockCategories.csv        userid        thumbnails.csv        userid       16     17.78  Intersection
4    lockCategories.csv        userid            titles.csv        userid       20     22.22  Intersection
5    lockCategories.csv        userid          vipUsers.csv        userid       80     88.89  Intersection
7      sponsorTimes.csv        hidden    thumbnailVotes.csv  shadowhidden        2     66.67  Intersection
8      spon