In [1]:
import sys
print(sys.executable)

C:\Users\cheng\anaconda3\envs\tf_env\python.exe


In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.feature_selection import mutual_info_classif, RFE,SelectKBest, f_classif
import xgboost as xgb
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt


import os

In [3]:
# Load and display data
print("Loading training data...")
X_train = np.load("D:/GitHub/Classer-le-text---Text-classification/Data/data_train.npy").astype(int)
print("Training data loaded.")

print("Loading training labels...")
y_train_raw = np.loadtxt('D:/GitHub/Classer-le-text---Text-classification/Data/label_train.csv', skiprows=1, delimiter=',').astype(int)[:, 1]
print("Training labels loaded.")

print("Loading test data...")
X_test = np.load("D:/GitHub/Classer-le-text---Text-classification/Data/data_test.npy").astype(int)
print("Test data loaded.")

print("Loading vocabulary...")
vocab_data = np.load("D:/GitHub/Classer-le-text---Text-classification/Data/vocab_map.npy", allow_pickle=True)
print("Vocab data loaded.")

# Check dimensions
print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train_raw.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Vocab data shape: {vocab_data.shape}")

Loading training data...
Training data loaded.
Loading training labels...
Training labels loaded.
Loading test data...
Test data loaded.
Loading vocabulary...
Vocab data loaded.
Training data shape: (9422, 26354)
Training labels shape: (9422,)
Test data shape: (2356, 26354)
Vocab data shape: (26354,)


In [4]:
# Step 2: Log transformation to stabilize variance
X_train_log = np.log1p(X_train)
X_test_log = np.log1p(X_test)



In [5]:
# Step 3: Stop word removal
stop_words = set([
    'a', 'an', 'the', 'and', 'or', 'but', 'if', 'while', 'with', 'without', 'of', 'at', 'by', 'for', 'to', 'in', 'on',
    'from', 'up', 'down', 'out', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
    'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don', 'should', 'now','about', 'being', 'thereby',    'aiming', 'didn', 'deciding', 'derive', 'foretell', 'concede', 'prepare',
    'behind', 'withstand', 'upper', 'further', 'alreadyoverwhelming', 'minimal'
])

# Create a mask for non-stop-word features
stop_word_indices = [i for i, word in enumerate(vocab_data) if word in stop_words]
mask = np.ones(len(vocab_data), dtype=bool)
mask[stop_word_indices] = False

# Apply the mask to exclude stop words from X_train and X_test
X_train_filtered = X_train_log[:, mask]
X_test_filtered = X_test_log[:, mask]
filtered_vocab = [word for i, word in enumerate(vocab_data) if mask[i]]


In [6]:
# Step 4: Apply ANOVA to select the top N features after stop word removal
num_features_anova = 500  # Adjust based on experimentation
anova_selector = SelectKBest(score_func=f_classif, k=num_features_anova)
X_train_anova = anova_selector.fit_transform(X_train_filtered, y_train_raw)
X_test_anova = anova_selector.transform(X_test_filtered)

# Retrieve selected feature indices from ANOVA selection
anova_selected_indices = anova_selector.get_support(indices=True)
selected_vocab_anova = [filtered_vocab[i] for i in anova_selected_indices]

print(f"Selected vocabulary size after ANOVA: {len(selected_vocab_anova)}")


Selected vocabulary size after ANOVA: 500


In [7]:

print("Selected words after ANOVA selection:")
print(selected_vocab_anova)

Selected words after ANOVA selection:
['abstractive', 'accelerated', 'account', 'achieve', 'achieves', 'acoustic', 'agent', 'agents', 'agnostic', 'ai', 'al', 'algorithm', 'alternating', 'application', 'approximate', 'approximation', 'arabic', 'are', 'area', 'areas', 'armed', 'arms', 'art', 'article', 'artificial', 'asr', 'assessment', 'assumption', 'assumptions', 'asymptotic', 'automated', 'automatic', 'autonomous', 'autoregressive', 'available', 'bandit', 'bandits', 'based', 'baselines', 'basic', 'batch', 'be', 'been', 'behaved', 'belief', 'bleu', 'bound', 'bounds', 'build', 'calculus', 'careful', 'carlo', 'categories', 'category', 'characteristics', 'cifar', 'class', 'classify', 'cognitive', 'compare', 'competitive', 'completion', 'compositional', 'compositionality', 'comprehensive', 'computationally', 'computer', 'concept', 'concepts', 'conditioned', 'conducted', 'conll', 'consider', 'considered', 'constant', 'constituency', 'content', 'contrast', 'convergence', 'convergent', 'conve

In [8]:

# Step 5: Handle class imbalance with SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_anova, y_train_raw)
print(f"Resampled training data shape: {X_train_resampled.shape}")

Resampled training data shape: (7983, 500)


# XGBoost

In [9]:
# Step 6: Feature Importance with XGBoost on resampled data
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Get feature importances from XGBoost model and select top features
importances = xgb_model.feature_importances_
top_n_features_xgb = 30  # Adjust based on experimentation
top_n_indices_xgb = np.argsort(importances)[-top_n_features_xgb:]

# Map back to the original feature indices after ANOVA
final_selected_indices = [anova_selected_indices[i] for i in top_n_indices_xgb]
final_vocabulary = [filtered_vocab[i] for i in final_selected_indices]

# Step 7: Final feature selection for training and testing
X_train_final = X_train_resampled[:, top_n_indices_xgb]
X_test_final = X_test_anova[:, top_n_indices_xgb]

In [10]:
print(final_vocabulary)

['may', 'three', 'bounds', 'our', 'gradient', 'information', 'sparse', 'convex', 'presented', 'systems', 'mining', 'decision', 'singular', 'paper', 'research', 'was', 'techniques', 'intelligence', 'belief', 'learning', 'system', 'theoretical', 'agents', 'achieves', 'development', 'used', 'art', 'we', 'logic', 'discuss']


In [11]:
# Print shapes of the final selected training and test datasets
print("Shape of X_train_final:", X_train_final.shape)
print("Shape of X_test_final:", X_test_final.shape)

Shape of X_train_final: (7983, 30)
Shape of X_test_final: (2356, 30)


In [12]:
# Calculate scale_pos_weight based on resampled data
num_negative = np.sum(y_train_resampled == 0)
num_positive = np.sum(y_train_resampled == 1)
scale_pos_weight = num_negative / num_positive

In [13]:
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5],
    'scale_pos_weight': [1, 5, 10]  # Adjust based on class imbalance
}


In [14]:
# Initialize the model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of parameter settings sampled
    scoring='f1',  # Scoring metric to optimize
    cv=5,  # Number of cross-validation folds
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit RandomizedSearchCV to the data
random_search.fit(X_train_final, y_train_resampled)

# Output the best parameters and the best F1 score
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

Best Parameters: {'subsample': 0.8, 'scale_pos_weight': 1, 'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.5, 'colsample_bytree': 0.6}
Best F1 Score: 0.9624798082526336


In [19]:
final_model = xgb.XGBClassifier(
    objective='binary:logistic', 
    eval_metric='logloss', 
    scale_pos_weight=1,  # Best parameter from tuning
    subsample=0.8,       # Best parameter from tuning
    n_estimators=50,     # Best parameter from tuning
    min_child_weight=1,  # Best parameter from tuning
    max_depth=7,         # Best parameter from tuning
    learning_rate=0.1,   # Best parameter from tuning
    gamma=0.5,           # Best parameter from tuning
    colsample_bytree=0.6, # Best parameter from tuning
    random_state=42,
    reg_alpha=0.1,       # Regularization parameters to control overfitting
    reg_lambda=1.0       
)

In [20]:
# Initialize cross-validation metrics storage
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_precision = []
cross_val_recall = []
cross_val_f1 = []
conf_matrices = []


In [21]:
# Loop over each fold of resampled data
for train_index, val_index in kf.split(X_train_final, y_train_resampled):
    # Split resampled data into training and validation for the current fold
    X_train_fold, X_val_fold = X_train_final[train_index], X_train_final[val_index]
    y_train_fold, y_val_fold = y_train_resampled[train_index], y_train_resampled[val_index]
    
    # Train the model on the current fold
    final_model.fit(X_train_fold, y_train_fold)
    
    # Predict on the validation fold
    y_val_pred = final_model.predict(X_val_fold)
    
    # Calculate metrics for the validation fold
    precision = precision_score(y_val_fold, y_val_pred, zero_division=0)
    recall = recall_score(y_val_fold, y_val_pred, zero_division=0)
    f1 = f1_score(y_val_fold, y_val_pred, zero_division=0)
    
    # Store metrics for averaging later
    cross_val_precision.append(precision)
    cross_val_recall.append(recall)
    cross_val_f1.append(f1)
    
    # Store confusion matrix for the validation fold
    conf_matrices.append(confusion_matrix(y_val_fold, y_val_pred))

# Calculate and display average cross-validation scores
avg_precision = np.mean(cross_val_precision)
avg_recall = np.mean(cross_val_recall)
avg_f1 = np.mean(cross_val_f1)

print(f"Cross-Validation Precision: {avg_precision:.4f}")
print(f"Cross-Validation Recall: {avg_recall:.4f}")
print(f"Cross-Validation F1 Score: {avg_f1:.4f}")

# Optional: display confusion matrices for each fold
for i, cm in enumerate(conf_matrices):
    print(f"Confusion Matrix for Fold {i + 1}:\n{cm}\n")

Cross-Validation Precision: 0.9545
Cross-Validation Recall: 0.9812
Cross-Validation F1 Score: 0.9676
Confusion Matrix for Fold 1:
[[ 195   64]
 [  28 1310]]

Confusion Matrix for Fold 2:
[[ 198   61]
 [  29 1309]]

Confusion Matrix for Fold 3:
[[ 189   70]
 [  25 1313]]

Confusion Matrix for Fold 4:
[[ 197   62]
 [  21 1316]]

Confusion Matrix for Fold 5:
[[ 202   56]
 [  23 1315]]



In [23]:


# Predict on the test data
y_test_pred = final_model.predict(X_test_final)
print("Predictions on test data:")
print(y_test_pred)
print(y_test_pred[:10])  # Print the first 10 predictions

Predictions on test data:
[1 0 1 ... 0 1 1]
[1 0 1 1 1 1 1 0 1 0]


# Random Forest

In [None]:
# Define the range of features to test
feature_counts = [50, 150,  250, 350,  450]

# Parameters for the Random Forest model
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Storage for results
results = []

# Loop over each feature count
for num_features_anova in feature_counts:
    print(f"\nTesting with {num_features_anova} ANOVA-selected features...")

    # Step 1: Apply ANOVA feature selection
    anova_selector = SelectKBest(score_func=f_classif, k=num_features_anova)
    X_train_anova = anova_selector.fit_transform(X_train_filtered, y_train_raw)
    X_test_anova = anova_selector.transform(X_test_filtered)

    # Retrieve selected feature indices for reference if needed
    anova_selected_indices = anova_selector.get_support(indices=True)
    selected_vocab_anova = [filtered_vocab[i] for i in anova_selected_indices]

    # Step 2: Initialize the Random Forest model
    rf_model = RandomForestClassifier(random_state=42)

    # Step 3: Cross-validation with Grid Search
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        scoring='f1',
        cv=kf,
        n_jobs=-1,
        verbose=2
    )

    # Fit the grid search on the ANOVA-selected features
    print("Running grid search for optimal hyperparameters...")
    grid_search.fit(X_train_anova, y_train_raw)

    # Get the best hyperparameters and F1 score for this feature count
    best_params = grid_search.best_params_
    best_f1_score = grid_search.best_score_
    print(f"Best F1 Score with {num_features_anova} features: {best_f1_score:.4f}")
    
    # Store results
    results.append((num_features_anova, best_params, best_f1_score))

# Display results for each feature count
print("\nFeature Selection and Hyperparameter Tuning Results:")
for num_features, best_params, f1_score in results:
    print(f"{num_features} features: Best Params = {best_params}, F1 Score = {f1_score:.4f}")

In [25]:
def save_array_with_index(array, filename):
    # Create a DataFrame with the array
    df = pd.DataFrame(array, columns=['label'])
    
    # Add the 'id' column using the row index
    df['ID'] = df.index
    
    # Reorder the columns to have 'id' first and 'label' second
    df = df[['ID', 'label']]
    
    # Check if the file exists and save over it
    if os.path.exists(filename):
        print(f"File {filename} exists. Saving over it.")
    else:
        print(f"File {filename} does not exist. Creating a new file.")
    
    # Save the DataFrame as a CSV file
    df.to_csv(filename, index=False)

In [26]:
filename = 'prediction_milestone2_XGBoost.csv'

save_array_with_index(y_test_pred, filename)

File prediction_milestone2_XGBoost.csv exists. Saving over it.
