In [1]:
# ================================
# 1Ô∏è‚É£ Import required libraries
# ================================
import pandas as pd
import numpy as np
import joblib
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.utils import resample
from scipy.sparse import hstack
from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
# ================================
# 2Ô∏è‚É£ Extra Features Class
# ================================
# This is same as in your extra_features.py

emoji_set = set(
    "üòÄüòÅüòÇü§£üòÉüòÑüòÖüòÜüòâüòäüòãüòéüòçüòòüòóüòôüòöüôÇü§óü§©ü§îü§®üòêüòëüò∂üôÑüòèüò£üò•"
    "üòÆü§êüòØüò™üò´üò¥üòåü§ìüòõüòúüòùü§§üòíüòìüòîüòïüôÉü§ëüò≤‚òπÔ∏èüôÅüòñüòûüòüüò§üò¢üò≠"
    "üò¶üòßüò®üò©ü§Øüò¨üò∞üò±üò≥ü§™üòµüò°üò†ü§¨"
)

class ExtraFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, texts):
        processed = [t if isinstance(t, str) else "" for t in texts]

        emoji_count = np.array([sum(1 for ch in t if ch in emoji_set) for t in processed]).reshape(-1, 1)
        punctuation_ratio = np.array([sum(1 for c in t if c in string.punctuation) / (len(t) + 1) for t in processed]).reshape(-1, 1)
        digit_ratio = np.array([sum(1 for c in t if c.isdigit()) / (len(t) + 1) for t in processed]).reshape(-1, 1)
        avg_word_len = np.array([
            np.mean([len(w) for w in t.split()]) if len(t.split()) > 0 else 0
            for t in processed
        ]).reshape(-1, 1)

        return np.hstack([emoji_count, punctuation_ratio, digit_ratio, avg_word_len])


In [3]:
# ================================
# 3Ô∏è‚É£ Load your CSV data
# ================================
df = pd.read_csv("RF_data.csv")  # Your CSV with Human_Content & AI_Content columns

# Combine into a single dataframe
human_df = pd.DataFrame({"text": df["Human_Content"], "label": "human"})
ai_df = pd.DataFrame({"text": df["AI_Content"], "label": "ai"})
df_final = pd.concat([human_df, ai_df], ignore_index=True)

# Clean text
df_final['text'] = df_final['text'].fillna("").str.strip()

print("Total samples:", df_final.shape[0])
print(df_final['label'].value_counts())


Total samples: 1218
label
human    609
ai       609
Name: count, dtype: int64


In [5]:
# ================================
# 4Ô∏è‚É£ Balance the classes (oversampling)
# ================================
human = df_final[df_final.label == "human"]
ai = df_final[df_final.label == "ai"]

if len(human) > len(ai):
    ai = resample(ai, replace=True, n_samples=len(human), random_state=42)
elif len(ai) > len(human):
    human = resample(human, replace=True, n_samples=len(ai), random_state=42)

df_balanced = pd.concat([human, ai]).sample(frac=1, random_state=42).reset_index(drop=True)
print("Balanced samples:", df_balanced.shape)
print(df_balanced['label'].value_counts())


Balanced samples: (1218, 2)
label
human    609
ai       609
Name: count, dtype: int64


In [None]:
# ================================
# 5Ô∏è‚É£ Train-Test Split
# ================================
X = df_balanced["text"]
y = df_balanced["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


                                                  text  label  \
0    If you believe the home alarm commercials, the...  human   
1    What do you call a midget with no teeth A gum ...  human   
2    It's just sad how often I see zookeepers break...  human   
3    You really have to question the judgment of pe...  human   
4    What's green and smells like bacon? Kermit the...  human   
..                                                 ...    ...   
395  The correct volume level for contemplating com...     ai   
396  It is a historical certainty that all forgotte...     ai   
397  The only thing preventing the world from achie...     ai   
398  The phenomenon of deja vu is a momentary serve...     ai   
399  The structural formula for a truly unproductiv...     ai   

                                            clean_text  
0    ifyoubelievethehomealarmcommercialsthefirstthi...  
1               whatdoyoucallamidgetwithnoteethagumjob  
2    itsjustsadhowofteniseezookeepersbreakingthe

In [7]:
# ================================
# 5Ô∏è‚É£ Train-Test Split
# ================================
X = df_balanced["text"]
y = df_balanced["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 1096
Testing samples: 122


In [8]:
# ================================
# 6Ô∏è‚É£ Vectorization + Extra Features
# ================================
# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1,3), sublinear_tf=True)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Extra features
extra = ExtraFeatures()
X_train_extra = extra.fit_transform(X_train)
X_test_extra = extra.transform(X_test)

# Combine
X_train_combined = hstack([X_train_tfidf, X_train_extra])
X_test_combined = hstack([X_test_tfidf, X_test_extra])


In [9]:
# ================================
# 7Ô∏è‚É£ Train the Linear SVC Model
# ================================
model = LinearSVC()
model.fit(X_train_combined, y_train)


In [10]:
# ================================
# 8Ô∏è‚É£ Evaluate Model
# ================================
y_pred = model.predict(X_test_combined)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy*100:.2f}%\n")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Test Accuracy: 83.61%

Classification Report:
              precision    recall  f1-score   support

          ai       0.92      0.74      0.82        61
       human       0.78      0.93      0.85        61

    accuracy                           0.84       122
   macro avg       0.85      0.84      0.83       122
weighted avg       0.85      0.84      0.83       122

Confusion Matrix:
[[45 16]
 [ 4 57]]


In [11]:
# ================================
# 9Ô∏è‚É£ Test on new sentences
# ================================
def predict_text(text):
    vec = tfidf.transform([text])
    extra_feat = extra.transform([text])
    combined = hstack([vec, extra_feat])
    pred = model.predict(combined)[0]
    return pred

# Example sentences
examples = [
    "I had a great day at the park with friends!",
    "The probability distribution of X is calculated using the formula..."
]

for s in examples:
    print(f"Text: {s[:50]}... Prediction: {predict_text(s)}")


Text: I had a great day at the park with friends!... Prediction: ai
Text: The probability distribution of X is calculated us... Prediction: ai


In [12]:
# ================================
# 10Ô∏è‚É£ Save model & vectorizer
# ================================
joblib.dump(model, "ai_human_model.pkl")
joblib.dump(tfidf, "vectorizer.pkl")
print("Model and vectorizer saved.")


Model and vectorizer saved.


In [13]:
human_sentences = [
    "I had a fantastic weekend hiking with my friends!",
    "Can't believe how much I laughed at that movie last night.",
    "My dog loves playing fetch in the park every morning.",
    "I baked a chocolate cake yesterday and it turned out amazing!",
    "I feel so tired today, I think I need a nap."
]
ai_sentences = [
    "The probability of X is calculated by applying Bayes theorem and integrating over the sample space.",
    "Machine learning models require training datasets with labeled examples to optimize the loss function efficiently.",
    "The algorithm demonstrates a time complexity of O(n log n) under the assumption of a balanced binary search tree.",
    "Quantum computing utilizes qubits which can exist in superposition states to perform parallel computations.",
    "The economic growth rate is influenced by multiple macroeconomic indicators including inflation, unemployment, and interest rates."
]


In [14]:
# Function to predict
def predict_text(text):
    vec = tfidf.transform([text])
    extra_feat = extra.transform([text])
    combined = hstack([vec, extra_feat])
    pred = model.predict(combined)[0]
    return pred

# Test human sentences
print("=== Human Sentences ===")
for s in human_sentences:
    print(f"Text: {s[:50]}... Prediction: {predict_text(s)}")

# Test AI sentences
print("\n=== AI Sentences ===")
for s in ai_sentences:
    print(f"Text: {s[:50]}... Prediction: {predict_text(s)}")


=== Human Sentences ===
Text: I had a fantastic weekend hiking with my friends!... Prediction: human
Text: Can't believe how much I laughed at that movie las... Prediction: human
Text: My dog loves playing fetch in the park every morni... Prediction: ai
Text: I baked a chocolate cake yesterday and it turned o... Prediction: human
Text: I feel so tired today, I think I need a nap.... Prediction: human

=== AI Sentences ===
Text: The probability of X is calculated by applying Bay... Prediction: ai
Text: Machine learning models require training datasets ... Prediction: ai
Text: The algorithm demonstrates a time complexity of O(... Prediction: ai
Text: Quantum computing utilizes qubits which can exist ... Prediction: ai
Text: The economic growth rate is influenced by multiple... Prediction: ai


[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 62ms/step
Bro today I lost my charger üò≠ -> Human-Written (score: 0.496)
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 71ms/step
Neural networks consist of layers of interconnected nodes. -> Human-Written (score: 0.496)
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 63ms/step
I swear my phone hates me bro -> Human-Written (score: 0.496)
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 65ms/step
Machine learning models require structured datasets -> Human-Written (score: 0.496)


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# --- 1. Data Loading and Preprocessing (Using User's Logic + Fixes) ---
# NOTE: Ensure 'AIdata.csv' contains ALL your 400 samples (200 Human_Content, 200 AI_Content).
try:
    df = pd.read_csv('RF data.csv')
except FileNotFoundError:
    print("Error: 'AIdata.csv' not found. Please ensure your combined dataset is in the working directory.")
    # Exit the script gracefully if the file is missing
    X, y = [], []
    exit()

# 1. Separate and Label Human Content
# FIX: Mapping 'Human' content to numerical label 0
df_human = pd.DataFrame({
    "text": df["Human_Content"],
    "label": 0 
})

# 2. Separate and Label AI Content
# FIX: Mapping 'AI' content to numerical label 1
df_ai = pd.DataFrame({
    "text": df["AI_Content"],
    "label": 1
})

# 3. Combine DataFrames and Clean
df_final = pd.concat([df_human, df_ai], ignore_index=True)


# Apply dropna to clean up any missing text entries after concatenation
df_final = df_final.dropna(subset=['text'])

print(f"Total samples after cleaning: {len(df_final)}")
print(f"Human samples (0): {df_final['label'].value_counts().get(0, 0)}")
print(f"AI samples (1): {df_final['label'].value_counts().get(1, 0)}")

X = df_final['text']
y = df_final['label']

# Check if there's enough data to proceed
if len(df_final) < 2:
    print("\nError: Not enough samples remaining after cleaning to train the model.")
    exit()

# --- 2. Feature Extraction ---
# Using TF-IDF (Term Frequency-Inverse Document Frequency)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X_vec = vectorizer.fit_transform(X)

# Split data into training and testing sets (using stratification to maintain label proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42, stratify=y
)

# --- 3. Model Training with Class Weight Balancing ---
# FIX: class_weight='balanced' parameter to counteract data imbalance
# This forces the model to heavily penalize errors on the less-represented class (Human/Label 0),
# directly

Total samples after cleaning: 400
Human samples (0): 200
AI samples (1): 200


In [20]:
# --- 3. Model Training with Class Weight Balancing ---
# The class_weight='balanced' parameter is maintained to stabilize training 
# against feature skew, even with balanced data.

print("\n--- Training Logistic Regression with class_weight='balanced' ---")
# Using Logistic Regression as it's simple and effective for text classification
model = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42) 
model.fit(X_train, y_train)

# --- 4. Evaluation ---
y_pred = model.predict(X_test)

print("\nClassification Report (Balanced Model):")
# Classification Report is crucial for checking the recall of both classes
print(classification_report(y_test, y_pred, target_names=['Human', 'AI']))

print("\nConfusion Matrix:")
# Confusion Matrix shows the raw prediction counts
print(confusion_matrix(y_test, y_pred))

# Expected Outcome: The Recall for 'Human' should now be significantly higher than 0.03.


--- Training Logistic Regression with class_weight='balanced' ---

Classification Report (Balanced Model):
              precision    recall  f1-score   support

       Human       1.00      0.97      0.99        40
          AI       0.98      1.00      0.99        40

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80


Confusion Matrix:
[[39  1]
 [ 0 40]]


In [21]:
# --- 5. Real-Time Inference (New Testing Block) ---
# Testing the model on custom, unseen inputs

def predict_text_class(text):
    """Predicts the class (Human or AI) for a single string of text."""
    # 1. Transform the new text using the *fitted* vectorizer
    text_vec = vectorizer.transform([text])
    
    # 2. Get the prediction (0 or 1)
    prediction = model.predict(text_vec)[0]
    
    # 3. Get the prediction probability
    proba = model.predict_proba(text_vec)[0]
    
    class_name = 'AI' if prediction == 1 else 'Human'
    confidence = proba[prediction]
    
    print(f"\n--- Prediction for: '{text}' ---")
    print(f"Predicted Class: {class_name}")
    print(f"Confidence (P({class_name})): {confidence:.4f}")

print("\n--- Testing Model on New Examples ---")

# Example 1: Human-like text (informal, conversational)
predict_text_class("dude what the heck was that about my boots")

# Example 2: AI-like text (formal, technical)
predict_text_class("Implementation of the recursive temporal parallax algorithm is contingent upon mitigating quantum entanglement flux.")

# Example 3: Ambiguous text (should test the balance)
predict_text_class("The collective melancholy of staplers is why I ran away like a bitch.")


--- Testing Model on New Examples ---

--- Prediction for: 'dude what the heck was that about my boots' ---
Predicted Class: Human
Confidence (P(Human)): 0.5884

--- Prediction for: 'Implementation of the recursive temporal parallax algorithm is contingent upon mitigating quantum entanglement flux.' ---
Predicted Class: AI
Confidence (P(AI)): 0.5276

--- Prediction for: 'The collective melancholy of staplers is why I ran away like a bitch.' ---
Predicted Class: Human
Confidence (P(Human)): 0.5435
