In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# --- 1. Data Loading & Preprocessing ---

# Stemmer instance shared by every preprocess_text() call
ps = PorterStemmer()

# Fetch the NLTK stopword corpus, then keep the English list as a set
# so the per-word membership test in preprocess_text() is cheap
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one review into a space-separated string of stemmed tokens.

    Non-letter characters are replaced by spaces, the text is lowercased and
    split on whitespace, stopwords are dropped, and each remaining word is
    stemmed with the module-level Porter stemmer.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            float or int cell value) is treated as missing.

    Returns:
        The cleaned text, or an empty string when ``text`` is not a string
        or no words survive filtering.
    """
    # Guard clause: non-string cells yield an empty document
    if not isinstance(text, str):
        return ""

    # Replace everything except ASCII letters with a space, then lowercase
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    stemmed_tokens = []
    for token in letters_only.split():
        # Stopwords are skipped before stemming
        if token in stop_words:
            continue
        stemmed_tokens.append(ps.stem(token))

    return ' '.join(stemmed_tokens)

# --- IMPORTANT: Load your *real* dataset here ---
# Make sure your CSV file is named 'reviews.csv' and is in the same folder
try:
    # Read as UTF-8; bytes that cannot be decoded are ignored instead of
    # aborting the whole load
    df = pd.read_csv('reviews.csv', encoding='utf-8', encoding_errors='ignore')
except FileNotFoundError:
    print("Error: 'reviews.csv' not found. Make sure it's in the same folder as train_model.py")
    # Raise SystemExit instead of calling exit(): exit() is the interactive
    # helper from the `site` module, reports success (status 0), and is not
    # guaranteed to stop execution in every host environment.
    raise SystemExit(1) from None

# --- DEBUGGING STEP 1 ---
# Print the columns *exactly* as pandas sees them *before* the rename
print("\n--- DEBUG: Columns BEFORE rename ---")
print(list(df.columns))

# --- FIX FOR THE KAGGLE DATASET ---
# The dataset from the link has a text column named 'text_' and a label column named 'label'.
# Edit the keys of this mapping if your CSV uses different column names.
df = df.rename(columns={'text_': 'text', 'label': 'label'})

print("\n--- DEBUG: Columns AFTER rename ---")
print(list(df.columns))
# ------------------------

# Stop early with a helpful message if the required columns are still missing
if 'text' not in df.columns or 'label' not in df.columns:
    print("\n--- ERROR ---")
    print("The 'text' or 'label' column was not found.")
    print("Please check the 'df.rename' line in the code and make sure")
    print("you have replaced the placeholders with your *real* CSV column names.")
    print("\nAvailable columns in your CSV are:")
    print(list(df.columns))
    # Non-zero status so callers can tell the run failed
    raise SystemExit(1)

# Rows without a label cannot be used for supervised training and would make
# the stratified split / model fit fail later, so drop them up front.
missing_labels = int(df['label'].isna().sum())
if missing_labels:
    print(f"Warning: dropping {missing_labels} row(s) with a missing label.")
    df = df.dropna(subset=['label']).reset_index(drop=True)

# Apply preprocessing to your text column
# (non-string cells, e.g. NaN, become empty documents — see preprocess_text)
df['processed_text'] = df['text'].apply(preprocess_text)
print("Data Preprocessing Complete.")

# --- 2. Train/Test Split & Feature Engineering (TF-IDF) ---

y = df['label']

# Split the *documents* before vectorizing. Fitting TF-IDF on the full corpus
# would let the vocabulary and IDF weights be learned from the test reviews
# (data leakage), which makes the evaluation below optimistic.
# stratify=y keeps the same percentage of each class in both sets.
train_docs, test_docs, y_train, y_test = train_test_split(
    df['processed_text'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the vocabulary / IDF weights from the training documents only,
# then apply that fitted transform unchanged to the held-out documents.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_docs)
X_test = tfidf_vectorizer.transform(test_docs)

# --- 3. Model Training (IMPROVED) ---

# Initialize and train the classifier
# class_weight='balanced' reweights classes inversely to their frequency,
# which helps the model learn from unbalanced data
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)
print("Model Training Complete.")

# --- 4. Model Evaluation ---

y_pred = model.predict(X_test)
print(f"\nModel Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:")

# Build the report row names from the classes the model actually learned.
# classification_report pairs target_names with the *sorted* labels, so
# hard-coded names are only correct when the labels really are 0 and 1;
# for any other encoding show the raw label values instead of guessing.
class_labels = list(model.classes_)
if class_labels == [0, 1]:
    report_names = ['Genuine (0)', 'Fake (1)']
else:
    report_names = [str(label) for label in class_labels]

# zero_division=0 avoids warnings when a class receives no predictions
print(classification_report(y_test, y_pred,
                            labels=class_labels,
                            target_names=report_names,
                            zero_division=0))

# --- 5. Save Model & Vectorizer ---

# Persist the trained classifier and the fitted vectorizer, each to its own
# joblib file in the current working directory
for artifact, filename in (
    (model, 'fake_review_model.joblib'),
    (tfidf_vectorizer, 'tfidf_vectorizer.joblib'),
):
    joblib.dump(artifact, filename)

print("\nModel and vectorizer saved successfully!")




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



--- DEBUG: Columns BEFORE rename ---
['category', 'rating', 'label', 'text_']

--- DEBUG: Columns AFTER rename ---
['category', 'rating', 'label', 'text']
Data Preprocessing Complete.
Model Training Complete.

Model Accuracy: 86.29%

Classification Report:
              precision    recall  f1-score   support

 Genuine (0)       0.87      0.86      0.86      4044
    Fake (1)       0.86      0.87      0.86      4043

    accuracy                           0.86      8087
   macro avg       0.86      0.86      0.86      8087
weighted avg       0.86      0.86      0.86      8087


Model and vectorizer saved successfully!
