<a href="https://colab.research.google.com/github/052-Patcharida/ai_detection/blob/data-sci-mirt/Superfix_Y060.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# kagglehub is the client the next cell uses to fetch the competition data.
import kagglehub
# Log in first: per the cell header, some Kaggle data sources are private.
# NOTE(review): presumably prompts for or reads Kaggle credentials — confirm
# how this behaves in the target runtime.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'mitr-phol-gen-ai-hackathon' competition files through kagglehub.
# NOTE(review): the returned value is stored here but is not referenced by the
# visible cells, which read from hard-coded '/kaggle/input/...' paths — confirm
# that those paths exist in this runtime.
mitr_phol_gen_ai_hackathon_path = kagglehub.competition_download('mitr-phol-gen-ai-hackathon')

print('Data source import complete.')


In [None]:
import pandas as pd

# Quick look at the training labels: the full frame first, then the class
# distribution of each of the two label columns.
train_df = pd.read_csv('/kaggle/input/mitr-phol-gen-ai-hackathon/train.csv')
print(train_df)
for label_column in ('bu_categories', 'action_non'):
    label_counts = train_df[label_column].value_counts()
    print(label_counts)

In [None]:
!pip install --quiet pdfplumber
!pip install --quiet fitz
!pip install --quiet pythainlp
!pip install --upgrade scikit-learn imbalanced-learn --quiet

In [None]:
import pandas as pd
import pdfplumber
import re

from pythainlp.tokenize import word_tokenize
from pythainlp.corpus.common import thai_stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ------------------------------
# 🔹 Load the data
# ------------------------------
# Read the label file and shuffle every row with a fixed seed, renumbering the
# index from 0 afterwards.
train_df = (
    pd.read_csv('/kaggle/input/mitr-phol-gen-ai-hackathon/train.csv')
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# ------------------------------
# 🔹 Read the PDFs
# ------------------------------
# Directory prefix (trailing slash included) that PDF file names are appended to.
base_path = '/kaggle/input/mitr-phol-gen-ai-hackathon/train_docs/'

def extract_text_from_pdf(pdf_name):
    """Return the concatenated page text of one training PDF.

    Args:
        pdf_name: File name appended to the module-level ``base_path``.

    Returns:
        The text of every page that yields any text, each followed by a
        newline, joined in page order. An empty string is returned if opening
        or reading the PDF raises; the error is printed, not propagated.
    """
    pdf_path = base_path + pdf_name
    try:
        with pdfplumber.open(pdf_path) as document:
            extracted = (page.extract_text() for page in document.pages)
            # Pages without extractable text (None or "") are skipped.
            return "".join(chunk + "\n" for chunk in extracted if chunk)
    except Exception as e:
        print(f"Error reading {pdf_name}: {e}")
        return ""

# Extract the text of each row's PDF into a new 'text' column.
train_df['text'] = train_df['pdf_name'].map(extract_text_from_pdf)

# ------------------------------
# 🔹 ทำความสะอาดข้อความ
# ------------------------------
def clean_text(text):
    """Normalise raw text for tokenisation.

    Lowercases the input, removes every character that is not in the Thai
    block (U+0E00–U+0E7F), an ASCII letter, a digit or whitespace, collapses
    each whitespace run to a single space, and trims both ends.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    lowered = text.lower()
    # Removed characters are deleted outright, not replaced by a space.
    kept = re.sub(r'[^\u0E00-\u0E7Fa-zA-Z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', kept).strip()

# Normalise every extracted document into a 'clean_text' column.
train_df['clean_text'] = train_df['text'].map(clean_text)

# ------------------------------
# 🔹 Tokenizer + Stopwords
# ------------------------------
# Materialised once as a set; the tokenizer below tests membership per token.
stopwords = set(thai_stopwords())

def clean_and_tokenize(text):
    """Tokenise text with the 'newmm' engine and filter the tokens.

    Args:
        text: The string to tokenise.

    Returns:
        A list of tokens, in order, keeping only those longer than one
        character that are not in the module-level ``stopwords`` set.
    """
    kept = []
    for token in word_tokenize(text, engine='newmm'):
        if len(token) <= 1 or token in stopwords:
            continue
        kept.append(token)
    return kept

# ------------------------------
# 🔹 Build the TF-IDF features only once
# ------------------------------
# The same fitted vectorizer is reused for both models and for the test set.
# NOTE(review): token_pattern=None is presumably set because the custom
# tokenizer replaces the default pattern — confirm.
vectorizer = TfidfVectorizer(
    tokenizer=clean_and_tokenize,
    token_pattern=None,
    ngram_range=(1, 3),   # previously (1, 2) → trying (1, 3)
    max_features=50000,   # increase the number of features
    min_df=2,             # drop terms that appear too rarely
    max_df=0.9            # drop terms that appear too often
)

# NOTE(review): the vectorizer is fitted on the whole training frame before
# the train/validation split below, so validation documents contribute to the
# vocabulary and IDF weights — confirm this is intended.
X = vectorizer.fit_transform(train_df['clean_text'])

# ------------------------------
# 🔹 Train/validation split
# ------------------------------
# Features and both label columns go through a single call so all three stay
# row-aligned; 5% of the rows are held out for validation.
_split_parts = train_test_split(
    X,
    train_df['action_non'],
    train_df['bu_categories'],
    random_state=42,
    test_size=0.05,
)
(X_train, X_val,
 y_train_action, y_val_action,
 y_train_cat, y_val_cat) = _split_parts

# ------------------------------
# 🔹 Logistic Regression model (action)
# ------------------------------
# Trains the 'action_non' classifier on the training split and reports its
# metrics on the validation split.
model_action = LogisticRegression(
    max_iter=1000,          # let the model train for more iterations
    class_weight='balanced',
    solver='saga',          # supports L1/L2 regularization
    penalty='l2',           # helps keep the model from overfitting
    n_jobs=-1,              # use all CPU cores
    random_state=42         # saga shuffles the data; pin the seed like the split above
)
model_action.fit(X_train, y_train_action)
y_pred_action = model_action.predict(X_val)

print("\n📊 Classification Report (ACTION):")
print(classification_report(y_val_action, y_pred_action))

# ------------------------------
# 🔹 Logistic Regression model (category)
# ------------------------------
# fit() returns the estimator itself, so construction and training are chained.
model_cat = LogisticRegression(class_weight='balanced', max_iter=300).fit(
    X_train, y_train_cat
)
y_pred_cat = model_cat.predict(X_val)

print("\n📊 Classification Report (CATEGORY):")
category_report = classification_report(y_val_cat, y_pred_cat)
print(category_report)

# ------------------------------
# 🔹 TF-IDF vocabulary and sample texts
# ------------------------------
# Show the first 50 feature names of the fitted vectorizer.
print("\n🔤 TF-IDF Vocabulary (บางส่วน):")
vocabulary_preview = vectorizer.get_feature_names_out()[:50]
print(vocabulary_preview)

# Show the first few cleaned documents.
print("\n📝 Clean Text ตัวอย่าง:")
clean_text_preview = train_df['clean_text'].head()
print(clean_text_preview)


In [None]:
# ------------------------------
# 🔹 Load submission.csv and prepare the test data
# ------------------------------
# The submission template supplies the 'pdf_name' column read below.
test_df = pd.read_csv('/kaggle/input/mitr-phol-gen-ai-hackathon/submission.csv')

# 🔹 Read the PDFs from test_docs
# Directory prefix (trailing slash included) that test PDF names are appended to.
test_base_path = '/kaggle/input/mitr-phol-gen-ai-hackathon/test_docs/'

def extract_text_from_pdf_test(pdf_name):
    """Return the concatenated page text of one test PDF.

    Args:
        pdf_name: File name appended to the module-level ``test_base_path``.

    Returns:
        The text of every page that yields any text, each followed by a
        newline, joined in page order. An empty string is returned if opening
        or reading the PDF raises; the error is printed, not propagated.
    """
    pdf_path = test_base_path + pdf_name
    try:
        with pdfplumber.open(pdf_path) as document:
            page_chunks = []
            for page in document.pages:
                chunk = page.extract_text()
                if not chunk:
                    # No extractable text on this page (None or "").
                    continue
                page_chunks.append(chunk + "\n")
            return "".join(page_chunks)
    except Exception as e:
        print(f"Error reading {pdf_name}: {e}")
        return ""

# Extract each test PDF's text and normalise it with the same clean_text
# function that was applied to the training documents.
test_df['text'] = test_df['pdf_name'].map(extract_text_from_pdf_test)
test_df['clean_text'] = test_df['text'].map(clean_text)

# ------------------------------
# 🔹 Convert the text to TF-IDF with the already-fitted vectorizer
# ------------------------------
# transform() only: the vocabulary learned from the training data is reused.
X_test = vectorizer.transform(test_df['clean_text'])

# ------------------------------
# 🔹 Predict with the trained models
# ------------------------------
pred_action = model_action.predict(X_test)
pred_cat = model_cat.predict(X_test)

# ------------------------------
# 🔹 Combine both predictions into the final label
# ------------------------------
# Each label is "<category>_<action>", paired row by row.
final_pred = [f"{cat}_{act}" for cat, act in zip(pred_cat, pred_action)]

test_df['result'] = final_pred

# ------------------------------
# 🔹 Show a sample of the results
# ------------------------------
# Only these two columns are shown and written out.
submission_view = test_df[['pdf_name', 'result']]
print("\n📄 ตัวอย่างผลทำนายจาก submission.csv")
print(submission_view.head())

# ------------------------------
# 🔹 Save the result file (ready to submit)
# ------------------------------
# Written to the working directory without the index column.
submission_view.to_csv("submission.csv", index=False)
print("\n✅ บันทึกแล้วเป็น submission.csv")
print("\n✅ บันทึกแล้วเป็น submission.csv")