In [4]:
import pandas as pd
import numpy as np
import hashlib
import sys
import csv
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import shap

# --- CONFIGURATION ---
# UPDATE STUDENT ID. USE STU007 BECAUSE IT IS MINE (CASE-SENSITIVE)
STUDENT_ID = input("Enter STUDENT ID MINE IS STU007 : ")
BOOKS_PATH = "/content/books.csv"
REVIEWS_PATH = "/content/reviews.csv"

# --- SETUP ---
# EnsurURING necessary NLTK data is available
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

def generate_student_hash(student_id):
    """Generates the 8-char uppercase SHA256 hash from Student ID."""
    hash_object = hashlib.sha256(student_id.encode())
    return hash_object.hexdigest()[:8].upper()

def solve_flag2(reviews_path, student_id, user_hash):
    """
    FLAG 2: Locate the fake review containing the user hash.
    Flag is simply FLAG2{USER_HASH}.
    """
    print(f"\n--- SOLVING FLAG 2 ---")
    print(f"Target Hash: {user_hash}")

    try:
        df = pd.read_csv(reviews_path, engine='python', on_bad_lines='skip', quotechar='"')
        text_column = 'text'
        if text_column in df.columns:
            result = df[df[text_column].astype(str).str.contains(user_hash, case=False, na=False)]

            if not result.empty:
                print(f"Success: Found {len(result)} review(s).")
                return f"FLAG2 : {{{user_hash}}}"
            else:
                return "Error: Hash not found in dataset. Check Student ID."
        else:
            return f"Error: Column '{text_column}' not found."

    except FileNotFoundError:
        return "Error: Dataset file not found."
    except Exception as e:
        return f"Error: {e}"

def solve_flag1(books_path, reviews_path, target_hash):

    print(f"\n--- SOLVING FLAG 1 ---")
    target_hash_lower = target_hash.lower()

    try:
        # 1. Filter Books
        print("Loading books.csv...")
        df_books = pd.read_csv(books_path)
        candidates = df_books[
            (df_books['rating_number'] == 1234) &
            (df_books['average_rating'] == 5.0)
        ]

        suspect_asins = set(candidates['parent_asin'].astype(str).str.strip().tolist())
        print(f"Found {len(suspect_asins)} ")

        # 2. Search Reviews
        print("Loading reviews.csv...")
        csv.field_size_limit(sys.maxsize)
        df_reviews = pd.read_csv(reviews_path, engine='python', on_bad_lines='skip', quotechar='"')

        # Link via ASIN
        df_reviews['asin'] = df_reviews['asin'].astype(str).str.strip()
        relevant_reviews = df_reviews[df_reviews['asin'].isin(suspect_asins)]

        found_asin = None

        # Scan for hash
        for index, row in relevant_reviews.iterrows():
            review_text = str(row['text'])
            if target_hash in review_text or target_hash_lower in review_text:
                found_asin = row['asin']
                print(f"Match found in review for ASIN: {found_asin}")
                break

        if not found_asin:
            return "Error: Hash not found in suspect reviews."

        # 3. Retrieve Title & Hash
        book_row = df_books[df_books['parent_asin'].astype(str).str.strip() == found_asin].iloc[0]
        # Handle 'title' vs 'Title' column name case
        official_title = str(book_row.get('title', book_row.get('Title', '')))

        print(f"Target Book Title: '{official_title}'")

        # Generate Flag
        title_no_spaces = "".join(official_title.split())
        seed_string = title_no_spaces[:8]
        flag1 = hashlib.sha256(seed_string.encode()).hexdigest()

        return flag1

    except Exception as e:
        return f"Error calculating Flag 1: {e}"

def solve_flag3(reviews_path, student_id):

    print(f"\n--- SOLVING FLAG 3 ---")

    try:
        print("Loading all reviews for global training...")
        df = pd.read_csv(reviews_path, engine='python', on_bad_lines='skip', quotechar='"')
        df['text'] = df['text'].astype(str).fillna("")

        # Label Data: 1=Suspicious(Short), 0=Genuine(Long/Detailed)
        df['is_suspicious'] = df['text'].apply(lambda x: 1 if len(x) < 150 else 0)

        # Train Model
        print("Training Logistic Regression Model...")
        vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        X = vectorizer.fit_transform(df['text'])
        y = df['is_suspicious']

        model = LogisticRegression(max_iter=1000)
        model.fit(X, y)

        # SHAP Analysis on Genuine Reviews
        print("Running SHAP analysis...")
        genuine_indices = np.where(y == 0)[0]
        if len(genuine_indices) == 0: return "Error: No genuine reviews."

        # Sample 100 for speed
        sample_indices = np.random.choice(genuine_indices, min(100, len(genuine_indices)), replace=False)
        X_sample = X[sample_indices]

        explainer = shap.LinearExplainer(model, X, feature_perturbation="interventional")
        shap_values = explainer.shap_values(X_sample)

        # Find Top 3 "Genuine" words (Negative SHAP values)
        mean_shap = np.mean(shap_values, axis=0)
        feature_names = np.array(vectorizer.get_feature_names_out())
        top_indices = np.argsort(mean_shap)[:3]
        top_words = feature_names[top_indices]

        print(f"Top 3 Genuine Words: {top_words}")

        # Generate Flag
        numeric_id = re.search(r'\d+', student_id).group()
        combined_string = "".join(top_words) + numeric_id
        full_hash = hashlib.sha256(combined_string.encode()).hexdigest()

        return f"FLAG3{{{full_hash[:10]}}}"

    except Exception as e:
        return f"Error calculating Flag 3: {e}"

# --- MAIN EXECUTION ---
if __name__ == "__main__":
    print(f"Starting Solver for Student ID: {STUDENT_ID}")

    # 0. Prerequisite: Generate Hash
    my_hash = generate_student_hash(STUDENT_ID)

    # 1. Solve Flag 2 (Easiest)
    flag2 = solve_flag2(REVIEWS_PATH, STUDENT_ID, my_hash)

    # 2. Solve Flag 1 (Finding the Book)
    flag1 = solve_flag1(BOOKS_PATH, REVIEWS_PATH, my_hash)

    # 3. Solve Flag 3 (ML Analysis)
    flag3 = solve_flag3(REVIEWS_PATH, STUDENT_ID)

    # --- OUTPUT RESULTS ---
    print("\n" + "="*30)
    print("FINAL RESULTS")
    print("="*30)
    print(f"FLAG1 = {flag1}")
    print(f"FLAG2 = {flag2}")
    print(f"FLAG3 = {flag3}")
    print("="*30)

    # --- WRITE THE RESULTS IN FILES.TXT

    try:
      with open('flags.txt', 'w') as f:
        f.write("FLAG1 : " + flag1 + "\n")
        f.write(" Flag2 : " + flag2 + "\n")
        f.write("FLAG3 : " + flag3 + "\n")
        print("Results written to flags.txt")

    except Exception as e:
        print(f"Error writing to file: {e}")

Enter STUDENT ID MINE IS STU007 : STU007
Starting Solver for Student ID: STU007

--- SOLVING FLAG 2 ---
Target Hash: 87ED580B
Success: Found 1 review(s).

--- SOLVING FLAG 1 ---
Loading books.csv...
Found 150 
Loading reviews.csv...
Match found in review for ASIN: 0006499333
Target Book Title: 'Borrowed Time (Alistair MacLeanâ€™s UNACO)'

--- SOLVING FLAG 3 ---
Loading all reviews for global training...
Training Logistic Regression Model...
Running SHAP analysis...
Top 3 Genuine Words: ['br' '34' 'like']





FINAL RESULTS
FLAG1 = c88d22413095ddeec420b5e477c9ac87c29313a93dd4d0069c99ee907c209229
FLAG2 = FLAG2 : {87ED580B}
FLAG3 = FLAG3{e57aaf854d}
Results written to flags.txt
