## MLPclassifier

In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

In [18]:
# Load cleaned dataset with selected columns
df_clean = pd.read_csv('Datasets/news_sample_rows_cleaned.csv', usecols=['content', 'type', 'title'], dtype=str).dropna()

# Map article types to binary labels (1: fake-related, 0: reliable)
type_mapping = {'fake': 1, 'conspiracy': 1, 'junksci': 1, 'bias': 1, 
                'clickbait': 0, 'political': 0, 'reliable': 0}
df_clean['label'] = df_clean['type'].map(type_mapping)
df_clean = df_clean.dropna(subset=['label'])
df_clean['label'] = df_clean['label'].astype(int)

# Drop rows with missing content or title
df_clean = df_clean.dropna(subset=['content', 'title'])

# Split into training (80%), validation (10%), and test (10%) sets using stratification
train_df, temp_df = train_test_split(df_clean, test_size=0.2, random_state=42, stratify=df_clean['label'])
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# Extract features and labels for further use
# content_train, title_train, y_train = train_df['content'], train_df['title'], train_df['label']
# content_val, title_val, y_val = validation_df['content'], validation_df['title'], validation_df['label']
# content_test, title_test, y_test = test_df['content'], test_df['title'], test_df['label']

# -- Extract features and labels for further use --
# -- concatenate content and title --
combined_text, y_train = train_df['title'] + " " + train_df["content"], train_df['label']
combined_val, y_val = validation_df['title'] + " " + validation_df["content"], validation_df['label']
combined_test, y_test = test_df['title'] + " " + test_df["content"], test_df['label']

# -- Combine content and title into one --
# train_df["combined_text"] = train_df['title'] + " " + train_df['content']

print("Training set (content sample):")
# print(content_train.head())
print(combined_text.head())
# print("\nTraining set (title sample):")
# print(title_train.head())



Training set (content sample):
134    workplac distract consid part consumpt instead...
129    ttip brexit moder terrorist 21st centuri wire ...
146    european union act like aggress colonialist st...
47     drain swamp doesnt matter long groupthink pers...
59     justic depart ask suprem court overturn daca r...
dtype: object


In [19]:
def train_neural_network(x_train, y_train, x_val, model_name):
    start_time = time.time()
    # Define a simple MLP with one hidden layer
    mlp = MLPClassifier(max_iter=1000, random_state=42)
    params = {
        'hidden_layer_sizes': [(50,), (100,)],
        'alpha': [1e-4, 1e-3, 1e-2]
    }
    # Use GridSearchCV for hypterparameter tuning
    grid = GridSearchCV(
        mlp, params, cv=3, n_jobs=-1, scoring='f1', pre_dispatch=3
    )
    grid.fit(x_train, y_train)
    
    print(f"Neural Network training time: {(time.time() - start_time)/60:.2f} min")
    print("Best Parameters for Neural Network:", grid.best_params_)
    
    # Save the best estimator
    from joblib import dump
    dump(grid, f'models/{model_name}.joblib')
    
    # Return predictions on x_val
    return grid.predict(x_val)


In [20]:
def make_TFIDF(features, ngrams):
    tfidf_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(lowercase=False, 
                                       max_features=features, 
                                       min_df=1, 
                                       max_df=0.9, 
                                       token_pattern=r'<[\w]+>|[\w]+',
                                       ngram_range=ngrams)),
        ('scaler', StandardScaler(with_mean=False))
    ])
    
    # Build TF-IDF for both content and title; then combine
    content_train_tfidf = tfidf_pipeline.fit_transform(content_train, y_train)
    content_val_tfidf   = tfidf_pipeline.transform(content_val)
    title_train_tfidf   = tfidf_pipeline.fit_transform(title_train, y_train)
    title_val_tfidf     = tfidf_pipeline.transform(title_val)
    
    X_train = hstack((content_train_tfidf, title_train_tfidf))
    X_val   = hstack((content_val_tfidf, title_val_tfidf))
    return X_train, X_val

# --- Evaluate advanced models using TF-IDF (1-gram) ---
X_train_tfidf, X_val_tfidf = make_TFIDF(3500, (1, 1))

NameError: name 'content_train' is not defined

In [None]:
# TF-IDF feature extraction (using existing code)
X_train_tfidf, X_val_tfidf = make_TFIDF(3500, (1, 1))

# Train & evaluate Neural Network (one hidden layer)
print("\nEvaluating Neural Network with TF-IDF (1-gram):")
y_pred_nn = train_neural_network(X_train_tfidf, y_train, X_val_tfidf, 'nn_1gram')
print("Neural Network F1 score:", metrics.f1_score(y_val, y_pred_nn))
print("Neural Network Accuracy:", metrics.accuracy_score(y_val, y_pred_nn))


In [None]:
# --- Evaluate the trained Neural Network on the test set ---
from joblib import load

# Load the NN model
nn_model_loaded = load('models/nn_1gram.joblib')

# Build TF-IDF for the test set
test_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(
        lowercase=False, 
        max_features=3500, 
        min_df=1, 
        max_df=0.9, 
        token_pattern=r'<[\w]+>|[\w]+',
        ngram_range=(1, 1))
    ),
    ('scaler', StandardScaler(with_mean=False))
])

# Convert test content and title to TF-IDF and combine
content_test_tfidf = test_tfidf_pipeline.fit_transform(content_test)
title_test_tfidf   = test_tfidf_pipeline.fit_transform(title_test)
X_test_tfidf       = hstack((content_test_tfidf, title_test_tfidf))

# Predict and evaluate
nn_pred_test = nn_model_loaded.predict(X_test_tfidf)
print("\nNeural Network on Test Set:")
print("Accuracy:", metrics.accuracy_score(y_test, nn_pred_test))
print("F1 score:", metrics.f1_score(y_test, nn_pred_test))
make_confusion_matrix(y_test, nn_pred_test, "Neural Network Model")


# Important note
Vectorizer learns which features to use from the document.
Therefore the learned features from the training data should
be the same one used on the test data. If we learn new features
from the test data we will have a mismatch.