<a href="https://colab.research.google.com/github/052-Patcharida/ai_detection/blob/data-sci-mirt/Superfix_Y060.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: some Kaggle data sources are private.
# Run this cell first so the notebook can import your Kaggle data sources.
import kagglehub
# Log in to Kaggle before the next cell downloads the competition data.
kagglehub.login()


In [None]:
# IMPORTANT: run this cell in order to import your Kaggle data sources,
# then feel free to delete this cell.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so some libraries used by your notebook may be missing.

# Download the 'mitr-phol-gen-ai-hackathon' competition data and keep the
# value returned by kagglehub.
# NOTE(review): the later cells read from hard-coded '/kaggle/input/...' paths
# rather than from this variable -- confirm those paths exist in the
# environment where the notebook is run.
mitr_phol_gen_ai_hackathon_path = kagglehub.competition_download('mitr-phol-gen-ai-hackathon')

print('Data source import complete.')


In [None]:
import pandas as pd

# Quick look at the training table, followed by the label distribution of
# each of the two target columns.
train_df = pd.read_csv('/kaggle/input/mitr-phol-gen-ai-hackathon/train.csv')
print(train_df)
for target_column in ('bu_categories', 'action_non'):
    print(train_df[target_column].value_counts())

In [None]:
!pip install --quiet pdfplumber
!pip install --quiet fitz
!pip install --quiet pythainlp
!pip install --upgrade scikit-learn imbalanced-learn --quiet

In [None]:
import pandas as pd
import pdfplumber
import re

from pythainlp.tokenize import word_tokenize
from pythainlp.corpus.common import thai_stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ------------------------------
# Load the data
# ------------------------------
# Read the training table, shuffle every row with a fixed seed, and renumber
# the index from zero.
train_df = (
    pd.read_csv('/kaggle/input/mitr-phol-gen-ai-hackathon/train.csv')
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# ------------------------------
# Read the PDFs
# ------------------------------
# Directory of the training PDFs. File names are appended to it directly, so
# the trailing slash is required.
base_path = '/kaggle/input/mitr-phol-gen-ai-hackathon/train_docs/'

def extract_text_from_pdf(pdf_name, base_dir=None):
    """Return the text of one PDF, or "" if it cannot be read.

    Args:
        pdf_name: File name of the PDF; it is appended directly to the
            directory prefix.
        base_dir: Directory prefix (including the trailing separator) to
            read from. Defaults to the module-level ``base_path``, so
            existing one-argument calls behave as before.

    Returns:
        The text of every page that produced text, each followed by a
        newline, in page order. An empty string is returned when anything
        goes wrong while locating, opening or parsing the file.
    """
    if base_dir is None:
        base_dir = base_path
    try:
        # Build the path inside the try so that a non-string name (for
        # example NaN from an empty CSV cell) is reported like any other
        # unreadable file instead of aborting the whole DataFrame.apply().
        pdf_path = base_dir + pdf_name
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Falsy results (pages that produced no text) are skipped.
            return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
    except Exception as e:
        # Deliberately best-effort: one bad PDF must not stop the pipeline.
        print(f"Error reading {pdf_name}: {e}")
        return ""

# Extract the text of every training PDF into a new 'text' column.
train_df['text'] = train_df['pdf_name'].apply(extract_text_from_pdf)

# ------------------------------
# Clean the text
# ------------------------------
def clean_text(text):
    """Normalise raw text for vectorisation.

    The text is lower-cased, every character that is not in the Thai block
    (U+0E00-U+0E7F), an ASCII letter, a digit or whitespace is removed, runs
    of whitespace are collapsed to a single space, and the result is
    stripped of leading and trailing whitespace.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'[^\u0E00-\u0E7Fa-zA-Z0-9\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Store the normalised text in a new column next to the raw text.
train_df['clean_text'] = train_df['text'].apply(clean_text)

# ------------------------------
# Tokenizer + stopwords
# ------------------------------
# Materialise the stopword collection as a set; clean_and_tokenize() tests
# membership against it for every token.
stopwords = set(thai_stopwords())

def clean_and_tokenize(text):
    """Tokenize *text* with the 'newmm' engine and filter the tokens.

    A token is kept only when it is longer than one character and is not
    contained in the module-level ``stopwords`` set. Token order is
    preserved.
    """
    kept_tokens = []
    for token in word_tokenize(text, engine='newmm'):
        if len(token) <= 1 or token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# ------------------------------
# Build the TF-IDF representation once only
# ------------------------------
vectorizer = TfidfVectorizer(
    tokenizer=clean_and_tokenize,
    token_pattern=None,
    ngram_range=(1, 3),   # previously 1,2 -> trying 1,3
    max_features=50000,   # raise the number of features
    min_df=2,             # drop terms that appear too rarely
    max_df=0.9            # drop terms that appear too often
)

# NOTE(review): the vectorizer is fitted on every row of train_df, i.e. before
# the train/validation split, so the validation documents also contribute to
# the fitted vocabulary -- confirm this is intended.
X = vectorizer.fit_transform(train_df['clean_text'])

# ------------------------------
# Split the data into train/validation
# ------------------------------
# A single call splits the feature matrix and both label columns together so
# that the rows stay aligned; test_size=0.05 holds out 5% for validation.
X_train, X_val, y_train_action, y_val_action, y_train_cat, y_val_cat = train_test_split(
    X,
    train_df['action_non'],
    train_df['bu_categories'],
    test_size=0.05,
    random_state=42
)

# ------------------------------
# Logistic Regression model (action)
# ------------------------------
# penalty='l2' and n_jobs=-1 are intentionally no longer passed. L2 is the
# default penalty, so the regularisation that limits overfitting still
# applies, and n_jobs does not parallelise a single saga fit. Recent
# scikit-learn releases deprecate both arguments, and the install cell
# upgrades scikit-learn, so omitting them keeps the fit identical while
# avoiding the deprecation warnings.
model_action = LogisticRegression(
    max_iter=1000,          # allow more iterations so the solver can converge
    class_weight='balanced',
    solver='saga',          # supports both L1 and L2 regularisation
)
model_action.fit(X_train, y_train_action)
# Predictions on the held-out validation rows, used for the report below.
y_pred_action = model_action.predict(X_val)

print("\nüìä Classification Report (ACTION):")
print(classification_report(y_val_action, y_pred_action))

# ------------------------------
# Logistic Regression model (category)
# ------------------------------
# Trained on the same feature matrix as the action model, but against the
# labels taken from the 'bu_categories' column.
model_cat = LogisticRegression(max_iter=300, class_weight='balanced')
model_cat.fit(X_train, y_train_cat)
# Predictions on the held-out validation rows, used for the report below.
y_pred_cat = model_cat.predict(X_val)

print("\nüìä Classification Report (CATEGORY):")
print(classification_report(y_val_cat, y_pred_cat))

# ------------------------------
# TF-IDF vocabulary and sample text
# ------------------------------
# Show the first 50 feature names of the fitted vectorizer.
print("\nüî§ TF-IDF Vocabulary (‡∏ö‡∏≤‡∏á‡∏™‡πà‡∏ß‡∏ô):")
print(vectorizer.get_feature_names_out()[:50])

# Show the first few cleaned documents.
print("\nüìù Clean Text ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á:")
print(train_df['clean_text'].head())


In [None]:
# ------------------------------
# Load submission.csv and prepare the test data
# ------------------------------
test_df = pd.read_csv('/kaggle/input/mitr-phol-gen-ai-hackathon/submission.csv')

# Read the PDFs from test_docs. File names are appended to this prefix
# directly, so the trailing slash is required.
test_base_path = '/kaggle/input/mitr-phol-gen-ai-hackathon/test_docs/'

def extract_text_from_pdf_test(pdf_name):
    """Return the text of one test-set PDF, or "" if it cannot be read.

    The file is looked up under the module-level ``test_base_path``. The
    text of each page that produced text is followed by a newline and the
    pieces are concatenated in page order. Any error raised while opening
    or parsing the file is printed and an empty string is returned.
    """
    pdf_path = test_base_path + pdf_name
    try:
        with pdfplumber.open(pdf_path) as pdf:
            chunks = []
            for page in pdf.pages:
                page_text = page.extract_text()
                if not page_text:
                    # Nothing was extracted from this page; skip it.
                    continue
                chunks.append(page_text + "\n")
            return "".join(chunks)
    except Exception as e:
        print(f"Error reading {pdf_name}: {e}")
        return ""

# Extract and normalise the text of every test PDF, mirroring the steps
# applied to the training data.
test_df['text'] = test_df['pdf_name'].apply(extract_text_from_pdf_test)
test_df['clean_text'] = test_df['text'].apply(clean_text)

# ------------------------------
# Convert the text to TF-IDF with the same vectorizer
# ------------------------------
# transform() only: the vectorizer fitted on the training text is reused.
X_test = vectorizer.transform(test_df['clean_text'])

# ------------------------------
# Predict with the trained models
# ------------------------------
pred_action = model_action.predict(X_test)
pred_cat = model_cat.predict(X_test)

# ------------------------------
# Combine the values into the final result
# ------------------------------
# Each result is the predicted category and the predicted action joined by
# an underscore, in that order.
final_pred = [f"{cat}_{act}"
              for cat, act in zip(pred_cat, pred_action)]

test_df['result'] = final_pred

# ------------------------------
# Show a sample of the output
# ------------------------------
print("\nüìÑ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ú‡∏•‡∏ó‡∏≥‡∏ô‡∏≤‡∏¢‡∏à‡∏≤‡∏Å submission.csv")
print(test_df[['pdf_name', 'result']].head())

# ------------------------------
# Save the result file (ready to submit)
# ------------------------------
# Only the 'pdf_name' and 'result' columns are written, without the index.
test_df[['pdf_name', 'result']].to_csv("submission.csv", index=False)
print("\n‚úÖ ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏õ‡πá‡∏ô submission.csv")