# Chi-Squared Analysis of Variables (Number of Features Selection)

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
# Load the dataset, then separate the 'Overall_Satisfaction' target column
# from the remaining columns, which serve as the feature matrix.
data = pd.read_csv("C:\\Users\\ASHIQ\\Desktop\\acafeteria.csv")
y = data['Overall_Satisfaction']
X = data.drop('Overall_Satisfaction', axis=1)

In [None]:
from sklearn.feature_selection import chi2

# chi_scores is indexed below as [0] for the chi-squared statistics and [1]
# for the p-values, one entry per column of X.
# NOTE(review): scikit-learn's chi2 expects non-negative feature values --
# confirm every column of X satisfies this.
chi_scores = chi2(X=X, y=y)

In [None]:
# One row per feature with its chi-squared score and p-value, ordered from the
# highest score to the lowest and re-indexed from 0.
score_value = (
    pd.DataFrame(
        {
            'Feature': X.columns,
            'Chi-Squared Score': chi_scores[0],
            'p-value': chi_scores[1],
        }
    )
    .sort_values(by='Chi-Squared Score', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# One bar colour per row of score_value (same order as the sorted table):
# 'red' where the p-value exceeds 0.05, 'blue' otherwise.
# The table is named score_value; there is no `scores_table` in this notebook.
colors = np.where(score_value['p-value'] > 0.05, 'red', 'blue')

In [None]:
import matplotlib.pyplot as plt

In [None]:
from matplotlib.patches import Patch

# Bar chart of the chi-squared score of every feature, coloured per bar by
# the `colors` array.
plt.figure(figsize=(10, 6))
plt.bar(score_value['Feature'], score_value['Chi-Squared Score'], color=colors)
plt.xlabel('Variable')
plt.ylabel('Chi-Squared Score')
plt.title('Chi-Squared Score vs. Variable')
plt.xticks(rotation=90)
# plt.bar creates a single container, so passing two bare labels to legend()
# would show only the first label. Explicit proxy patches give one legend
# entry per colour class instead.
plt.legend(handles=[
    Patch(facecolor='red', label='Low Importance (p-value > 0.05)'),
    Patch(facecolor='blue', label='High Importance (p-value <= 0.05)'),
])
plt.tight_layout()

In [None]:
# Write the current matplotlib figure to a PNG file (relative path, so it
# lands in the working directory).
# NOTE(review): this runs in a separate cell from the plotting code; with an
# inline notebook backend the figure may already be closed at this point,
# which would produce a blank image -- confirm the saved file.
plt.savefig('chi_squared_histogram_figure.png')

In [None]:
# Export the sorted chi-squared score table to an Excel workbook.
# A matplotlib figure cannot be written through a pandas ExcelWriter:
# savefig() has no `sheet_name` argument and the writer is not an image
# target, so that call raised instead of adding a sheet. The chart is saved
# separately as 'chi_squared_histogram_figure.png' by plt.savefig.
with pd.ExcelWriter('chi_squared_test_results.xlsx') as writer:
    score_value.to_excel(writer, sheet_name='Chi-Squared Scores', index=False)

In [None]:
# Display all open matplotlib figures.
plt.show()

# Recursive Feature Elimination Process

In [1]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
# Load the dataset from a fixed absolute path on the author's machine.
data = pd.read_csv("C:\\Users\\ASHIQ\\Desktop\\acafeteria.csv")

In [None]:
# Array of the values in the "Chi-Squared Score" column of a CSV file.
# NOTE(review): no code in this section writes chi_squared_score.csv --
# confirm where that file comes from and what row order it uses.
chi_squared_scores = (
    pd.read_csv("C:\\Users\\ASHIQ\\Desktop\\chi_squared_score.csv")
    .loc[:, "Chi-Squared Score"]
    .values
)

In [None]:
from docx import Document
from docx.shared import Inches
from sklearn.feature_selection import RFE, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from tabulate import tabulate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Target is the 'Overall_Satisfaction' column; every other column of `data`
# becomes a feature.
y = data['Overall_Satisfaction']
X = data.drop('Overall_Satisfaction', axis=1)

In [None]:
# (display name, estimator) pairs, each estimator constructed with the
# library's default arguments; list order is the evaluation order.
estimators = list(
    {
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'Logistic Regression': LogisticRegression(),
    }.items()
)

# Evaluation rows are appended to this list later.
results = list()

In [None]:
# Run recursive feature elimination and a 5-fold evaluation for every
# candidate estimator. Every step must stay inside the loop body: if only the
# RFE fit is looped, the selection/scoring code runs once afterwards and
# `results` ends up with a single row for the last estimator.
for name, estimator in estimators:
    # Keep the 4 features that survive recursive feature elimination.
    rfe = RFE(estimator=estimator, n_features_to_select=4)
    rfe.fit(X, y)
    selected_features = X.columns[rfe.support_]
    X_selected = X[selected_features]

    # NOTE(review): the features are selected on the full data set before
    # cross-validation, so these scores may be optimistic -- consider doing
    # the selection inside each fold.
    scores = cross_val_score(estimator, X_selected, y, cv=5)
    mean_score = np.mean(scores)

    # Out-of-fold predictions; the metrics are weighted across classes.
    y_pred = cross_val_predict(estimator, X_selected, y, cv=5)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')

    # One row per estimator; the column order matches the headers used when
    # the results are tabulated and exported.
    results.append([name, selected_features, mean_score, chi_squared_scores,
                    accuracy, precision, recall, f1])

In [None]:
# Plain-text summary table: one line per row of `results`, with the selected
# features joined into a comma-separated string and the chi-squared column
# left blank.
table_headers = [
    'Estimator',
    'Selected Features',
    'Mean Cross-Validation Score',
    'Chi-Squared Scores',
    'Accuracy',
    'Precision',
    'Recall',
    'F1 Score',
]
table_data = [
    [est_name, ', '.join(features), cv_mean, '', acc, prec, rec, f1_value]
    for est_name, features, cv_mean, _, acc, prec, rec, f1_value in results
]

table = tabulate(table_data, headers=table_headers)

In [None]:
# Build a Word document with a level-1 heading followed by one paragraph
# whose single run holds the tabulated results text.
# NOTE(review): no doc.save(...) call is visible in this section -- confirm
# the document is written to disk somewhere later.
doc = Document()
doc.add_heading('Feature Selection Results', level=1)
table_paragraph = doc.add_paragraph()
table_paragraph.add_run(table)

In [None]:
# Tabular view of `results`, one row per appended entry. The
# 'Selected Features' and 'Chi-Squared Scores' cells hold the array-like
# objects that were appended, not formatted strings.
results_df = pd.DataFrame(
    data=results,
    columns=[
        'Estimator',
        'Selected Features',
        'Mean CV Score',
        'Chi-Squared Scores',
        'Accuracy',
        'Precision',
        'Recall',
        'F1 Score',
    ],
)

In [None]:
# Write the results table to an Excel file (relative path), omitting the
# DataFrame index column.
results_df.to_excel('feature_selection_results.xlsx', index=False)