# **A collection of useful functions for analysis**

### **Feature selector using chi-squared test**


In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Load the dataset (swap in the path to your own CSV file).
data = pd.read_csv('your_data.csv')

# Separate the label column from the remaining feature columns.
y = data['target']
X = data.drop(columns='target')

Applying chi-squared test

In [None]:
# Names of the candidate features, to be shown next to their scores
dfcolumns = pd.DataFrame(X.columns)

# Fit a chi-squared selector configured to keep the 10 best features
bestfeatures = SelectKBest(k=10, score_func=chi2)
fit = bestfeatures.fit(X, y)

# Chi-squared scores computed by the fitted selector
dfscores = pd.DataFrame(fit.scores_)

# Place names and scores side by side and label the two columns
featuresScores = pd.concat((dfcolumns, dfscores), axis='columns')
featuresScores.columns = ['Specs', 'Score']

# Show the 10 rows with the highest score
print(featuresScores.nlargest(n=10, columns='Score'))

**Function for reusability**

In [None]:
def chi_squared_feature_selection(data, target, k=10):
    """
    Rank features by their Chi-Squared score against the target.

    Parameters:
    data (pd.DataFrame): The dataset containing features and the target variable.
        Every column other than `target` is treated as a feature.
    target (str): The name of the target variable column.
    k (int or "all"): The number of top features to return. A value larger than
        the number of feature columns is clamped to that number, and "all"
        returns every feature.

    Returns:
    pd.DataFrame: A DataFrame with columns 'Specs' (feature name) and 'Score'
        (Chi-Squared score) holding at most k rows, highest score first.

    Raises:
    KeyError: If `target` is not a column of `data`.
    """
    X = data.drop(target, axis=1)
    y = data[target]

    # Never ask for more features than exist: some scikit-learn versions reject
    # k > n_features with a ValueError, which made the default k=10 unusable on
    # narrow datasets. "all" is accepted by SelectKBest but not by nlargest, so
    # it is translated to the feature count here.
    n_features = X.shape[1]
    n_top = n_features if k == "all" else min(k, n_features)

    # Fit SelectKBest to obtain one Chi-Squared score per feature column.
    # NOTE(review): chi2 is documented for non-negative feature values --
    # confirm the input satisfies this.
    selector = SelectKBest(score_func=chi2, k=n_top)
    selector.fit(X, y)

    # Pair each feature name with its score for easier reading
    feature_scores = pd.DataFrame({'Specs': X.columns, 'Score': selector.scores_})

    return feature_scores.nlargest(n_top, 'Score')

# Example: rank the features of `data` against the 'target' column and show the top 10.
top_features = chi_squared_feature_selection(data, target='target', k=10)
print(top_features)