In [1]:
import traceback

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

In [2]:
# --- Configuration ---
# Folder holding both the source workbook and the workbook this notebook writes.
PROJECT_DIR = r"C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code"

# Workbook read by the main cell.
INPUT_FILE = PROJECT_DIR + "\\" + "essay_dataset_v1.xlsx"
# Workbook written by the main cell.
# NOTE(review): the file name mentions PCA, but the visible notebook performs
# F-test feature selection — confirm the name is intended.
OUTPUT_FILE = PROJECT_DIR + "\\" + "dataset_after_only_pca.xlsx"

# Column the features are scored against.
TARGET_COLUMN = 'domain1_score'

# Columns never treated as candidate features; they are copied to the output
# ahead of the selected features.
EXCLUDED_COLUMNS = ['document_number', TARGET_COLUMN, 'word_count', 'stopword', 'unique_words']

# Number of best-scoring features to keep.
TOP_N_FEATURES = 50

In [3]:
# --- Main Execution Block ---
# Scores every candidate feature column against TARGET_COLUMN with a univariate
# F-test, keeps the TOP_N_FEATURES highest-scoring ones, and writes them —
# preceded by the EXCLUDED_COLUMNS, which are carried over unchanged — to
# OUTPUT_FILE. Failures are reported on screen instead of aborting the notebook.
try:
    # 1. Load the Excel file
    df = pd.read_excel(INPUT_FILE)
    print(f"Successfully loaded data from: {INPUT_FILE} (Shape: {df.shape})\n")

    # Fail early with a readable message if a configured column is absent;
    # otherwise the problem only surfaces as a KeyError after all the scoring.
    required_cols = list(dict.fromkeys([*EXCLUDED_COLUMNS, TARGET_COLUMN]))
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Required columns missing from input data: {missing_cols}")

    # 2. Define Features (X) and Target (y)
    # The target is kept out of X explicitly, so it cannot be scored against
    # itself even if it is ever removed from EXCLUDED_COLUMNS.
    non_feature_cols = set(required_cols)
    feature_cols = [col for col in df.columns if col not in non_feature_cols]
    X = df[feature_cols]
    y = df[TARGET_COLUMN]

    # 3. Perform Univariate Feature Selection (F-test)
    # k='all' means the selector drops nothing; it is fitted only to obtain
    # one F-score per column.
    # NOTE(review): scores are computed on every row of the dataset. If a
    # train/test split happens downstream, fit the selection on the training
    # rows only to avoid selection bias — confirm against the later pipeline.
    selector = SelectKBest(score_func=f_regression, k='all')
    selector.fit(X, y)

    # Get the results
    f_scores = selector.scores_

    # 4. Create Ranking DataFrame
    feature_ranking_df = pd.DataFrame({
        'Feature': feature_cols,
        'F_Score': f_scores
    })

    # Sort to find the best features
    feature_ranking_df = feature_ranking_df.sort_values(by='F_Score', ascending=False)

    # 5. Extract the names of the top N features
    top_feature_names = feature_ranking_df.head(TOP_N_FEATURES)['Feature'].tolist()
    # Legacy alias: the old name predates TOP_N_FEATURES and no longer matches
    # the count; kept in case other cells still refer to it.
    top_20_feature_names = top_feature_names

    # 6. Define the columns for the new dataset
    final_columns = EXCLUDED_COLUMNS + top_feature_names

    print("="*50)
    print(f"🏆 Top {TOP_N_FEATURES} Features Selected (F-Score Ranking):")
    print(top_feature_names)
    print("="*50)

    # 7. Create the new reduced DataFrame
    df_reduced = df[final_columns].copy()

    # 8. Save the reduced DataFrame to the new Excel file
    df_reduced.to_excel(OUTPUT_FILE, index=False)

    print("\n✅ Execution Complete.")
    print(f"Reduced dataset created with {len(df_reduced.columns)} columns.")
    print(f"File saved to: {OUTPUT_FILE}")

except FileNotFoundError:
    print(f"\n❌ Error: The input file '{INPUT_FILE}' was not found.")
    print("Please ensure the Excel file is uploaded or the path/name is correct.")
except Exception as e:
    print(f"\n❌ An unexpected error occurred during processing: {e}")
    # The one-line message alone hides where the failure happened; print the
    # traceback too, without re-raising, so the cell still completes.
    traceback.print_exc()

Successfully loaded data from: C:\Users\bhatt\OneDrive\Desktop\Sameek\0. University of North Texas\UNT Masters Course\Semester 3 - Fall 2025\CSCE 5310 - Methods in Empirical Analysis\Project\Code\essay_dataset_v1.xlsx (Shape: (141, 1223))

🏆 Top 50 Features Selected (F-Score Ranking):
['feature_computer', 'feature_PERSON', 'feature_PERCENT', 'feature_LOCATION', 'feature_CAPS', 'feature_NUM', 'feature_ha', 'feature_even', 'feature_life', 'feature_society', 'feature_friend', 'feature_one', 'feature_ORGANIZATION', 'feature_people', 'feature_hour', 'feature_believe', 'feature_year', 'feature_make', 'feature_world', 'feature_wa', 'feature_time', 'feature_family', 'feature_well', 'feature_part', 'feature_DATE', 'feature_screen', 'feature_show', 'feature_possible', 'feature_get', 'feature_technology', 'feature_take', 'feature_information', 'feature_away', 'feature_day', 'feature_actually', 'feature_also', 'feature_many', 'feature_two', 'feature_using', 'feature_outside', 'feature_spend', '