In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:

data = {
    'Name': ['Alice', 'Bob', np.nan, 'David', np.nan],
    'Age': [25, np.nan, 28, 35, 22],
    'Salary': [50000, 54000, np.nan, 58000, 60000]
}
df=pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,,54000.0
2,,28.0,
3,David,35.0,58000.0
4,,22.0,60000.0


In [6]:
df['Age']=df['Age'].fillna(df['Age'].mean())
df=df.dropna(axis='index',how='any',subset=['Name'])
df['Salary']=df['Salary'].ffill()
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Salary']=df['Salary'].ffill()


Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,27.5,54000.0
3,David,35.0,58000.0


In [7]:
data = {
    'ID': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105],
    'Score': [85, 90, 78, 92, 88, 85, 90, 78, 92, 88]
}

print(data)

{'ID': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105], 'Score': [85, 90, 78, 92, 88, 85, 90, 78, 92, 88]}


In [8]:
df1=pd.DataFrame(data)
df1

Unnamed: 0,ID,Score
0,101,85
1,102,90
2,103,78
3,104,92
4,105,88
5,101,85
6,102,90
7,103,78
8,104,92
9,105,88


In [9]:
df_cleaned=df1.drop_duplicates(subset='ID')
df_cleaned

Unnamed: 0,ID,Score
0,101,85
1,102,90
2,103,78
3,104,92
4,105,88


In [13]:
scaler=MinMaxScaler()
df['Salary Normalized']=scaler.fit_transform(df[['Salary']])
df1_encoded=pd.get_dummies(df,columns=['Name'])
df1_encoded['Salary Log']=np.log(df1_encoded['Salary'])
df1_encoded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Salary Normalized']=scaler.fit_transform(df[['Salary']])


Unnamed: 0,Age,Salary,Salary Normalized,Name_Alice,Name_Bob,Name_David,Salary Log
0,25.0,50000.0,0.0,True,False,False,10.819778
1,27.5,54000.0,0.5,False,True,False,10.896739
3,35.0,58000.0,1.0,False,False,True,10.968198


In [24]:
# Select only numeric columns
X = df[['Age', 'Salary']]

# Apply VarianceThreshold
selector = VarianceThreshold(threshold=5.0)
X_reduced = selector.fit_transform(X)

print("\n✅ After Variance Threshold:")
print("Original shape:", X.shape)
print("Reduced shape:", X_reduced.shape)
print("Kept features:", X.columns[selector.get_support()].tolist())



✅ After Variance Threshold:
Original shape: (3, 2)
Reduced shape: (3, 2)
Kept features: ['Age', 'Salary']


In [25]:
X = df[['Age', 'Salary']]

# 1️⃣ Standardize data (very important before PCA)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2️⃣ Apply PCA to reduce to 1 component (for simplicity)
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X_scaled)

# 3️⃣ Create new DataFrame for PCA result
df_pca = pd.DataFrame(X_pca, columns=['PCA_Component1'])

print("✅ Original Data:")
print(X)
print("\n✅ After PCA (1 component):")
print(df_pca)

✅ Original Data:
    Age   Salary
0  25.0  50000.0
1  27.5  54000.0
3  35.0  58000.0

✅ After PCA (1 component):
   PCA_Component1
0       -1.559401
1       -0.277350
2        1.836751


In [21]:
agg_df=df_cleaned.groupby('ID').mean()
agg_df

Unnamed: 0_level_0,Score
ID,Unnamed: 1_level_1
101,85.0
102,90.0
103,78.0
104,92.0
105,88.0
