In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score


In [2]:
# Load the full encoded dataset; `df` is kept unmodified as the original reference data
df = pd.read_csv("encoded_data.csv")

# Set the target column aside so it is not part of the feature matrix built below.
# `.copy()` detaches it from `df`, so later edits to either object cannot affect the other.
Just_AI_Satisfaction = df['AI_Satisfaction'].copy()

# Working feature frame: `drop` returns a new DataFrame, so `df_subset` is a real,
# independent object rather than an alias of `df`
df_subset = df.drop(columns='AI_Satisfaction')


In [3]:
# Standardise every feature column before the distance-based neighbour search
# (same scaling approach the original author used for the LDA step).
# Fitting and transforming are done as two explicit steps on the same data.
scaler = StandardScaler().fit(df_subset)
scaled_df = scaler.transform(df_subset)


In [5]:
# Unsupervised nearest-neighbour search: for every row, look up its 5 closest rows
# in the scaled feature space.
# NOTE: the query matrix is the same one the model was fitted on, so each row is
# normally returned as its own nearest neighbour (distance 0) — i.e. the 5 results
# are usually "self + 4 others".
knn = NearestNeighbors(n_neighbors=5).fit(scaled_df)
distances, indices = knn.kneighbors(scaled_df)


In [6]:
# Count, for every row, how many times it shows up in the neighbour lists returned
# above (its "popularity" in the k-nearest-neighbour graph). Rows with a high count
# sit in dense regions of the feature space.
#
# `np.bincount` tallies every occurrence in one vectorised pass; `minlength` makes sure
# rows that never appear as a neighbour still get an entry (count 0). The result is cast
# to float to keep the same dtype the previous `np.zeros`-based accumulator had.
neighbor_counts = np.bincount(
    np.asarray(indices).ravel(), minlength=scaled_df.shape[0]
).astype(float)
    

In [7]:
# Pick the two rows that appear most often in other rows' neighbour lists; their
# neighbourhoods are treated as the two densest regions and are added back as features.
# argsort sorts ascending, so the last two entries are the highest counts:
# top_clusters[0] is the runner-up and top_clusters[1] is the most frequent row.
ranked_rows = np.argsort(neighbor_counts)
top_clusters = ranked_rows[-2:]



In [8]:
# Add one 0/1 indicator column per selected row: a record gets 1 when it belongs to
# that row's neighbour list and 0 otherwise.
row_ids = np.arange(scaled_df.shape[0])
for position, column in enumerate(('KNNCluster1', 'KNNCluster2')):
    members = indices[top_clusters[position]]
    df_subset[column] = np.isin(row_ids, members).astype(int)

# Re-attach the target so the exported frame contains features, flags and label
df_subset['AI_Satisfaction'] = Just_AI_Satisfaction

# Quick look at the first rows of the augmented dataset
print(df_subset.head())



   Country  Age  Annual_Salary  Gender  Education  \
0      165    2              3       0          3   
1      165    1              1       1          2   
2      165    2              3       1          2   
3      165    2              4       1          2   
4      165    1              1       1          2   

   Payment_Method_Credit_Debit  Living_Region  Online_Service_Preference  \
0                            0              1                          0   
1                            1              1                          0   
2                            1             11                          1   
3                            1             11                          1   
4                            0             11                          1   

   AI_Enhance_Experience  AI_Tools_Used_Chatbots  ...  \
0                      1                       1  ...   
1                      1                       1  ...   
2                      1                       0  ...

In [9]:
# NOTE(review): `read` is not referenced in any cell visible here — this looks like a
# stray auto-import; confirm nothing later relies on it before removing.
from os import read

# Persist the augmented frame to CSV, leaving out the pandas row index
df_subset.to_csv(path_or_buf='FE_final_data.csv', index=False)
