In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO, BytesIO
from urllib.parse import urlparse, urljoin

# Function to fetch repository content using web scraping
def fetch_repo_content(repo_url):
    repo_url = repo_url.rstrip('/')
    print(f"Fetching content from: {repo_url}")  # Debug statement
    
    try:
        response = requests.get(repo_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return parse_repo_content(soup, repo_url), "Public"
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch content: {e}")  # Debug statement
        return None, "Private"

# Function to parse the HTML content of the repository page
def parse_repo_content(soup, base_url):
    content_list = []
    items = soup.find_all('a', class_='Link--primary')  # Adjusted to target the correct class for navigation

    for item in items:
        name = item.text.strip()
        if not name or name == "..":  # Skip empty names and parent directory link
            continue
        print(f"Found item: {name}")  # Debug statement
        href = item['href']
        if href.endswith('.csv') or href.endswith('.xlsx'):
            file_url = urljoin(base_url, href.replace('/blob/', '/raw/'))
            content_list.append((file_url, 'Yes'))  # Root directory indicator
            print(f"Found data file in root: {name}, URL: {file_url}")  # Debug statement
        elif 'tree' in href:
            subdir_url = urljoin(base_url, href)
            print(f"Fetching subdirectory content from: {subdir_url}")  # Debug statement
            sub_content, _ = fetch_repo_content(subdir_url)
            if sub_content:
                content_list.extend([(file_url, 'No') for file_url, _ in sub_content])  # Subdirectory indicator
    return content_list

# Function to validate CSV/XLSX against submission criteria
def validate_data_file(file_url, sample_df):
    print(f"Validating file: {file_url}")  # Debug statement
    response = requests.get(file_url)
    if response.status_code == 200:
        if file_url.endswith('.csv'):
            data_df = pd.read_csv(StringIO(response.text))
        elif file_url.endswith('.xlsx'):
            data_df = pd.read_excel(BytesIO(response.content))

        # Ensure columns match with flexible naming conventions
        if ('respondent_id' in data_df.columns and
            (('xyz_vaccine' in data_df.columns and 'seasonal_vaccine' in data_df.columns) or
             ('h1n1_vaccine' in data_df.columns and 'seasonal_vaccine' in data_df.columns))):

            print("Columns match with sample DataFrame: ['respondent_id', 'xyz_vaccine/h1n1_vaccine', 'seasonal_vaccine']")  # Debug statement

            # No need to enforce sorting or serial order of respondent_id

            # Check if respondent_id values are consistent
            if 'respondent_id' in data_df.columns:
                print("Validating respondent_id consistency...")  # Debug statement
                return True
            else:
                print("Missing required columns (respondent_id, xyz_vaccine/h1n1_vaccine, seasonal_vaccine).")  # Debug statement
        else:
            print(f"Columns do not match with sample DataFrame. Expected: ['respondent_id', 'xyz_vaccine/h1n1_vaccine', 'seasonal_vaccine'], Found: {data_df.columns.tolist()}")  # Debug statement
    else:
        print(f"Failed to fetch file from {file_url}. Status code: {response.status_code}")  # Debug statement

    return False

# Main function to process each repository and find valid submission files
def process_repositories(repo_urls, sample_df):
    results = []
    for repo_url in repo_urls:
        if "github.com" not in repo_url:
            results.append((repo_url, "Invalid URL", "Unknown", "N/A", "N/A"))  # Adjusted to include a placeholder for most recent valid submission URL
            print(f"Skipping invalid URL: {repo_url}")  # Debug statement
            continue

        content_list, repo_status = fetch_repo_content(repo_url)
        if not content_list:
            results.append((repo_url, "No content found", repo_status, "N/A", "N/A"))  # Adjusted to include placeholders for most recent valid submission URL
            print(f"No content found in repository: {repo_url}")  # Debug statement
            continue

        valid_submissions = []
        for file_url, is_root_dir in content_list:
            if validate_data_file(file_url, sample_df):
                valid_submissions.append((file_url, is_root_dir))

        if valid_submissions:
            most_recent_submission = valid_submissions[-1]  # Get the most recent valid submission
            results.append((repo_url, "Valid submission found", repo_status, most_recent_submission[1], most_recent_submission[0]))  # Include most recent valid submission URL
            print(f"Valid submission found in repository: {repo_url}")  # Debug statement
        else:
            results.append((repo_url, "No valid submission found", repo_status, "N/A", "N/A"))  # Adjusted to include placeholders for most recent valid submission URL
            print(f"No valid submission found in repository: {repo_url}")  # Debug statement

    return results

# Read Excel file containing GitHub repository links
repo_urls_df = pd.read_excel('links_of_submission.xlsx', header=None)
repo_urls = repo_urls_df.iloc[:, 0].tolist()  # Read the first (and only) column
print(f"Reading {len(repo_urls)} repository URLs from 'links_of_submission.xlsx'...")  # Debug statement

# Read sample CSV file to determine submission criteria
sample_df = pd.read_csv('submission_format.csv')
print("Sample submission criteria loaded successfully.")  # Debug statement

# Process repositories and get results
print("Processing repositories...")  # Debug statement
results = process_repositories(repo_urls, sample_df)

# Create a DataFrame to summarize the results
results_df = pd.DataFrame(results, columns=['Repository URL', 'Submission Status', 'Repository Visibility', 'Submission in Root Dir', 'Most Recent Valid Submission URL'])
print("Results processed.")  # Debug statement

# Save the results to a CSV file
results_df.to_csv('submission_results.csv', index=False)
print("Results saved to 'submission_results.csv'. Processing complete.")  # Debug statement


Reading 199 repository URLs from 'links_of_submission.xlsx'...
Sample submission criteria loaded successfully.
Processing repositories...
Fetching content from: https://github.com/ASNR1010/Hack-a-thon-DataHack-by-IIT-Guwahati
Found item: Hackathon.ipynb
Found item: Hackathon.ipynb
Found item: submission.csv
Found data file in root: submission.csv, URL: https://github.com/ASNR1010/Hack-a-thon-DataHack-by-IIT-Guwahati/raw/main/submission.csv
Found item: submission.csv
Found data file in root: submission.csv, URL: https://github.com/ASNR1010/Hack-a-thon-DataHack-by-IIT-Guwahati/raw/main/submission.csv
Found item: Releases
Found item: Packages
      0
Validating file: https://github.com/ASNR1010/Hack-a-thon-DataHack-by-IIT-Guwahati/raw/main/submission.csv
Columns match with sample DataFrame: ['respondent_id', 'xyz_vaccine/h1n1_vaccine', 'seasonal_vaccine']
Validating respondent_id consistency...
Validating file: https://github.com/ASNR1010/Hack-a-thon-DataHack-by-IIT-Guwahati/raw/main/subm