In [2]:
# ================================================================
# TRAIN AND SAVE SVM AI TEXT DETECTOR MODEL
# ================================================================
import pandas as pd
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ------------------------------------------------
# 1. Load Dataset
# ------------------------------------------------
# Read the labelled essay corpus from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
data = pd.read_csv(
    filepath_or_buffer=r"C:\Users\Lenovo\Desktop\3A PROJECT\Training_Essay_Data.csv",
)

# Confirm the load and list the column names that were read.
print(" Dataset loaded successfully!")
print("Columns:", data.columns.to_list())

# ------------------------------------------------
# 2. Clean Text Function
# ------------------------------------------------
def clean_text(text):
    """Normalize a raw text value into lowercase alphanumeric tokens.

    Steps, in order: lowercase, drop URL-like tokens (anything starting with
    ``http`` or ``www`` up to the next whitespace), delete every character
    that is not ``a-z``, ``0-9`` or whitespace, then collapse whitespace runs
    to single spaces and trim both ends.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``. Missing values (``None`` or a float NaN, which is how
            pandas represents an empty cell) are treated as empty text.

    Returns:
        The cleaned string; ``""`` for missing or empty input.
    """
    # Without this guard str(None) / str(nan) would yield the literal words
    # "none" / "nan", which would be fed to the vectorizer as real tokens.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    # `http\S+` already matches every `https...` token, so no separate
    # alternative is needed for it.
    without_urls = re.sub(r"http\S+|www\S+", "", lowered)
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    alnum_only = re.sub(r"[^a-z0-9\s]+", "", without_urls)
    return re.sub(r"\s+", " ", alnum_only).strip()

# Store a cleaned copy of every essay in a new column; the raw "text" column
# is left as it was.
data["clean_text"] = data["text"].map(clean_text)

# ------------------------------------------------
# 3. Split Data
# ------------------------------------------------
# Features are the cleaned essays; labels come from the "generated" column.
X, y = data["clean_text"], data["generated"]

# Hold out 20% of the rows for testing, stratifying on the label and fixing
# the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# ------------------------------------------------
# 4. Vectorize Text
# ------------------------------------------------
# TF-IDF encoder capped at 8000 features, with English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=8000,
)

# The vocabulary is fitted on the training split only; the test split is
# encoded with that already-fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ------------------------------------------------
# 5. Train SVM Model
# ------------------------------------------------
print("\n Training SVM model...")

# Linear-kernel SVM with probability estimates enabled and a fixed seed.
# fit() returns the estimator itself, so construction and training can be
# chained into a single expression.
model = SVC(
    C=1.0,
    kernel="linear",
    random_state=42,
    probability=True,
).fit(X_train_tfidf, y_train)

# ------------------------------------------------
# 6. Evaluate Model
# ------------------------------------------------
# Predict labels for the held-out split.
y_pred = model.predict(X_test_tfidf)

# Compute all three summaries first, then print them in one place.
test_accuracy = round(accuracy_score(y_test, y_pred), 4)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("\n Model Evaluation:")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)

# ------------------------------------------------
# 7. Save Model and Vectorizer with Pickle
# ------------------------------------------------
# Pickle the fitted classifier and its vectorizer into the current working
# directory; both files are needed together to score new text.
for _artifact_path, _artifact in (
    ("svm_ai_text_detector.pkl", model),
    ("svm_tfidf_vectorizer.pkl", vectorizer),
):
    with open(_artifact_path, "wb") as f:
        pickle.dump(_artifact, f)

# One print call, one line per argument.
print(
    "\n Saved:",
    " - svm_ai_text_detector.pkl",
    " - svm_tfidf_vectorizer.pkl",
    sep="\n",
)

# ------------------------------------------------
# 8. Test Loading the Model
# ------------------------------------------------
# Round-trip check: read both pickle files back from the working directory.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("svm_ai_text_detector.pkl", "rb") as f:
    loaded_model = pickle.load(f)

with open("svm_tfidf_vectorizer.pkl", "rb") as f:
    loaded_vectorizer = pickle.load(f)

print("\n Model and vectorizer loaded successfully!")

# Smoke test: push one sentence through the same clean -> vectorize -> predict
# path used for training data.
sample = "This essay analyzes global trends in modern technology."
clean = clean_text(sample)
vec = loaded_vectorizer.transform([clean])
pred = loaded_model.predict(vec)[0]

# Label 1 is reported as "AI"; any other label as "Human".
if pred == 1:
    verdict = "AI"
else:
    verdict = "Human"
print("\n Sample prediction:", verdict)


 Dataset loaded successfully!
Columns: ['text', 'generated']

 Training SVM model...

 Model Evaluation:
Accuracy: 0.9943

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      3502
           1       1.00      0.99      0.99      2327

    accuracy                           0.99      5829
   macro avg       0.99      0.99      0.99      5829
weighted avg       0.99      0.99      0.99      5829

Confusion Matrix:
 [[3491   11]
 [  22 2305]]

 Saved:
 - svm_ai_text_detector.pkl
 - svm_tfidf_vectorizer.pkl

 Model and vectorizer loaded successfully!

 Sample prediction: AI
