In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

In [7]:
# --- Configuration ---
# Every value a reader is expected to tune lives in this cell.

# Folder that holds both the input workbook and the generated ranking file.
# It is spelled out once so that relocating the project means editing one line.
PROJECT_DIR = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code"
INPUT_FILE = rf"{PROJECT_DIR}\essay_dataset_v1.xlsx"
OUTPUT_FILE = rf"{PROJECT_DIR}\top_features_after_only_PCA.xlsx"

# Column the candidate features are scored against.
TARGET_COLUMN = 'domain1_score'

# Columns that must never be treated as features. The target is referenced by
# name instead of being retyped, so the two settings cannot drift apart.
EXCLUDED_COLUMNS = [
    'document_number',
    TARGET_COLUMN,
    'word_count',
    'stopword',
    'unique_words',
]

# Number of top-ranked rows written to OUTPUT_FILE.
TOP_N_FEATURES = 50

In [8]:
# --- Main Execution Block ---
# Purpose: score every candidate feature column against TARGET_COLUMN with a
# univariate F-test (SelectKBest + f_regression), rank the columns by F-score,
# and write the TOP_N_FEATURES highest-ranked rows to OUTPUT_FILE.
#
# Reads from the configuration cell: INPUT_FILE, OUTPUT_FILE, TARGET_COLUMN,
# EXCLUDED_COLUMNS, TOP_N_FEATURES.
# Module-level names assigned here: df, feature_cols, X, y, selector,
# f_scores, p_values, feature_ranking_df, top_features_output.
#
# Error handling: failures are reported and the cell does not raise, so names
# assigned after the failing step are NOT (re)defined. Check the printed
# output before relying on them.
try:
    # 1. Load the Excel file
    df = pd.read_excel(INPUT_FILE)
    print(f"Successfully loaded data from: {INPUT_FILE} (Shape: {df.shape})\n")

    # 2. Define Features (X) and Target (y)
    # The target is filtered out explicitly in addition to EXCLUDED_COLUMNS:
    # if that list is ever edited and stops naming the target, the target
    # would otherwise be scored against itself and dominate the ranking.
    feature_cols = [
        col for col in df.columns
        if col not in EXCLUDED_COLUMNS and col != TARGET_COLUMN
    ]
    X = df[feature_cols]
    y = df[TARGET_COLUMN]

    # 3. Perform Univariate Feature Selection (F-test for Regression)
    # k='all' keeps every column; the selector is used here only to obtain
    # the per-feature scores and p-values, not to drop anything.
    selector = SelectKBest(score_func=f_regression, k='all')
    selector.fit(X, y)

    # Get the results (one entry per column of X, in feature_cols order)
    f_scores = selector.scores_
    p_values = selector.pvalues_

    # 4. Create Full Ranking DataFrame
    feature_ranking_df = pd.DataFrame({
        'Feature': feature_cols,
        'F_Score': f_scores,
        'P_Value': p_values
    })

    # Sort the features by F-Score in descending order
    feature_ranking_df = feature_ranking_df.sort_values(by='F_Score', ascending=False)

    # 5. Extract and format the Top N features for output
    # .copy() so the formatting below does not touch feature_ranking_df.
    top_features_output = feature_ranking_df.head(TOP_N_FEATURES).copy()

    # Format F-Score and P-Value for clear presentation in the Excel file.
    # Note: P_Value becomes a column of strings (scientific notation), so it
    # is text, not a number, in the saved workbook.
    top_features_output['F_Score'] = top_features_output['F_Score'].round(4)
    top_features_output['P_Value'] = top_features_output['P_Value'].apply(lambda x: f"{x:.4e}")

    # 6. Save the ranking to the new Excel file
    top_features_output.to_excel(OUTPUT_FILE, index=False)

    print("\n✅ Execution Complete.")
    print(f"Top {TOP_N_FEATURES} feature rankings saved successfully.")
    print(f"File saved to: {OUTPUT_FILE}")
    print("\n" + "="*50)
    print("🏆 Top 5 Features (Preview):")
    print(top_features_output.head(5))
    print("="*50)

except FileNotFoundError:
    print(f"\n❌ Error: The input file '{INPUT_FILE}' was not found.")
    print("Please ensure the Excel file is uploaded or the path/name is correct.")
except Exception as e:
    print(f"\n❌ An unexpected error occurred during processing: {e}")
    # str(e) alone hides the exception type and where it happened (a KeyError
    # prints as just the missing key), so show the full traceback as well.
    import traceback
    traceback.print_exc()

Successfully loaded data from: C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\essay_dataset_v1.xlsx (Shape: (141, 1223))


✅ Execution Complete.
Top 50 feature rankings saved successfully.
File saved to: C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\top_features_after_only_PCA.xlsx

🏆 Top 5 Features (Preview):
             Feature   F_Score     P_Value
11  feature_computer  103.3030  1.7271e-18
80    feature_PERSON   57.8904  3.7955e-12
36   feature_PERCENT   46.9072  2.2051e-10
2   feature_LOCATION   45.0183  4.5501e-10
1       feature_CAPS   43.1291  9.4665e-10
