In [1]:
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm

In [2]:
filenames = []
for file in os.listdir('archive (1)/t20s/'):
    filenames.append(os.path.join('archive (1)/t20s/',file))

In [3]:

filenames[0:5]

['archive (1)/t20s/1001349.yaml',
 'archive (1)/t20s/1001351.yaml',
 'archive (1)/t20s/1001353.yaml',
 'archive (1)/t20s/1004729.yaml',
 'archive (1)/t20s/1007655.yaml']

In [6]:
import os
import pandas as pd
import yaml
from tqdm import tqdm
import time
import multiprocessing

data_folder = 'archive (1)/t20s/'

# Collect YAML files only
filenames = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith('.yaml')]
print(f"Total files found: {len(filenames)}")
print("First 5:", filenames[:5])

# Shared dictionary to collect results from multiple processes
manager = multiprocessing.Manager()
results = manager.dict()
error_files = manager.list()

# Function to load YAML file safely
def load_yaml_file(file_path, idx, results_dict, error_list):
    try:
        # Skip very large files (optional, set your limit)
        if os.path.getsize(file_path) > 10_000_000:  # 10 MB limit
            error_list.append((file_path, "File too large"))
            return
        
        with open(file_path, 'r') as f:
            data = yaml.safe_load(f)

        if not isinstance(data, dict):
            error_list.append((file_path, "Not a dict"))
            return

        # Flatten YAML and add match_id
        df = pd.json_normalize(data)
        df['match_id'] = idx + 1

        results_dict[idx] = df

    except Exception as e:
        error_list.append((file_path, str(e)))

# Start processing
start_time = time.time()
final_df_list = []

for idx, file in enumerate(tqdm(filenames, desc="Processing files", unit="file")):
    p = multiprocessing.Process(target=load_yaml_file, args=(file, idx, results, error_files))
    p.start()
    p.join(timeout=5)  # 5 seconds timeout per file

    if p.is_alive():
        p.terminate()
        p.join()
        error_files.append((file, "Timeout"))

    # Optional: stop total processing after some time
    if time.time() - start_time > 600:  # 10 minutes
        print("⏳ Total timeout reached, stopping early...")
        break

# Collect all DataFrames
final_df_list = [results[i] for i in sorted(results.keys())]

# Combine results
if final_df_list:
    final_df = pd.concat(final_df_list, ignore_index=True)
else:
    final_df = pd.DataFrame()

# Backup copy
backup = final_df.copy()

print("\n✅ Done.")
print(f"✅ Total processed files: {len(final_df_list)} / {len(filenames)}")
print(f"⚠️ Errors in {len(error_files)} files.")
print("Final DataFrame shape:", final_df.shape)
final_df.head()


Total files found: 4570
First 5: ['archive (1)/t20s/1001349.yaml', 'archive (1)/t20s/1001351.yaml', 'archive (1)/t20s/1001353.yaml', 'archive (1)/t20s/1004729.yaml', 'archive (1)/t20s/1007655.yaml']


Processing files: 100%|██████████| 4570/4570 [09:32<00:00,  7.98file/s]


✅ Done.
✅ Total processed files: 0 / 4570
⚠️ Errors in 0 files.
Final DataFrame shape: (0, 0)





In [18]:
import pandas as pd
import pickle

# ===============================
# STEP 0: Make sure 'final_df' exists
# ===============================

# ===============================
# STEP 1: Backup original DataFrame
# ===============================
backup = final_df.copy()

# ===============================
# STEP 2: Keep only male matches
# ===============================
if 'info.gender' in final_df.columns:
    final_df = final_df[final_df['info.gender'] == 'male']
    final_df.drop(columns=['info.gender'], inplace=True)

# ===============================
# STEP 3: Keep only T20 matches with 20 overs
# ===============================
if 'info.match_type' in final_df.columns:
    final_df = final_df[final_df['info.match_type'] == 'T20']

if 'info.overs' in final_df.columns:
    final_df = final_df[final_df['info.overs'] == 20]

# ===============================
# STEP 4: Drop unnecessary columns safely
# ===============================
columns_to_drop = [
    'info.overs', 'info.match_type',
    'meta.data_version', 'meta.created', 'meta.revision',
    'info.outcome.bowl_out', 'info.bowl_out',
    'info.supersubs.South Africa', 'info.supersubs.New Zealand',
    'info.outcome.eliminator', 'info.outcome.result', 'info.outcome.method',
    'info.neutral_venue', 'info.match_type_number',
    'info.outcome.by.runs', 'info.outcome.by.wickets'
]

final_df.drop(columns=[c for c in columns_to_drop if c in final_df.columns], inplace=True)

# ===============================
# STEP 5: Save cleaned DataFrame
# ===============================
pickle.dump(final_df, open('dataset_level1.pkl', 'wb'))

# ===============================
# STEP 6: Access first match's first innings deliveries safely
# ===============================
matches = pickle.load(open('dataset_level1.pkl', 'rb'))

if not matches.empty:
    first_match = matches.iloc[0]
    first_match_innings = first_match.get('innings', None)

    if first_match_innings and isinstance(first_match_innings, list):
        try:
            first_innings = first_match_innings[0].get('1st innings', {})
            deliveries = first_innings.get('deliveries', [])
            print(deliveries)
        except Exception as e:
            print("Error accessing deliveries:", e)
    else:
        print("No innings data found in the first match.")
else:
    print("No matches left after filtering. Check your filters (male, T20, 20 overs).")


No matches left after filtering. Check your filters (male, T20, 20 overs).


In [19]:
print("Original matches:", len(backup))

# Male filter
if 'info.gender' in backup.columns:
    male_df = backup[backup['info.gender'] == 'male']
    print("After male filter:", len(male_df))
else:
    male_df = backup

# T20 filter
if 'info.match_type' in male_df.columns:
    t20_df = male_df[male_df['info.match_type'] == 'T20']
    print("After T20 filter:", len(t20_df))
else:
    t20_df = male_df

# 20 overs filter
if 'info.overs' in t20_df.columns:
    final_df = t20_df[t20_df['info.overs'] == 20]
    print("After 20 overs filter:", len(final_df))
else:
    final_df = t20_df


Original matches: 0
