In [8]:
import numpy as np
import pandas as pd

# Sample frame: eight scores, two of which are missing (NaN)
data = {'Score': [25, np.nan, 30, np.nan, 29, 27, 32, 31]}
df = pd.DataFrame(data)

# Add one imputed copy of 'Score' per strategy. Every statistic is computed
# from the original 'Score' column, so the three results are independent.
df = df.assign(
    Score_Mean=df['Score'].fillna(df['Score'].mean()),
    Score_Median=df['Score'].fillna(df['Score'].median()),
    # mode() may return several values; [0] takes the first one
    Score_Mode=df['Score'].fillna(df['Score'].mode()[0]),
)

print(df)

   Score  Score_Mean  Score_Median  Score_Mode
0   25.0        25.0          25.0        25.0
1    NaN        29.0          29.5        25.0
2   30.0        30.0          30.0        30.0
3    NaN        29.0          29.5        25.0
4   29.0        29.0          29.0        29.0
5   27.0        27.0          27.0        27.0
6   32.0        32.0          32.0        32.0
7   31.0        31.0          31.0        31.0


In [11]:
from sklearn.impute import KNNImputer

# Two-feature sample frame; only Feature2 has a missing value (row 2).
# Note: this rebinds `data` and `df` to new objects.
data = {
    'Feature1': [25, 20, 30, 40, 29, 27, 32, 31],
    'Feature2': [20, 25, np.nan, 45, 30, 25, 35, 40],
}
df = pd.DataFrame(data)

# Predictive imputation: fill the gap using KNNImputer with 2 neighbours
imputer = KNNImputer(n_neighbors=2)
# NOTE(review): despite its name, df_filled is whatever fit_transform returns
# (a NumPy array under scikit-learn's default output setting) - wrap it in
# pd.DataFrame(..., columns=df.columns) if column labels are needed later.
df_filled = imputer.fit_transform(df)

In [12]:
import pandas as pd

# Daily time series (8 days from 2023-01-01) with gaps in 'Value'.
# `np` is the NumPy alias imported in an earlier cell of this notebook.
time_data = {'Time': pd.date_range(start='1/1/2023', periods=8, freq='D'),
             'Value': [1, np.nan, np.nan, 4, 5, np.nan, 7, 8]}
df_time = pd.DataFrame(time_data)

# LOCF (last observation carried forward).
# Series.ffill() is used instead of fillna(method='ffill'): the `method`
# keyword of fillna is deprecated in pandas 2.1+ and emits a FutureWarning.
df_time['Value_LOCF'] = df_time['Value'].ffill()

# NOCB (next observation carried backward); same reasoning, via Series.bfill().
df_time['Value_NOCB'] = df_time['Value'].bfill()

In [3]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def select_feature_rfe(data, target, n_features_to_select=5):
    """Pick features by recursive feature elimination (RFE).

    A liblinear-solved ``LogisticRegression`` is used as the estimator that
    RFE refits while eliminating features.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix; must expose ``.columns`` because the result is
        reported as column labels.
    target : array-like
        Labels passed to ``RFE.fit`` alongside ``data``.
    n_features_to_select : int, default 5
        Forwarded unchanged to ``RFE``.

    Returns
    -------
    list
        Labels of the columns whose entry in ``support_`` is truthy, in the
        same order as ``data.columns``.
    """
    estimator = LogisticRegression(solver='liblinear')
    selector = RFE(estimator, n_features_to_select=n_features_to_select)
    selector.fit(data, target)

    # Walk the columns alongside the selection mask and keep the flagged ones.
    kept_columns = []
    for column, is_selected in zip(data.columns, selector.support_):
        if is_selected:
            kept_columns.append(column)
    return kept_columns

In [4]:
from sklearn.decomposition import PCA

def apply_pca(data, n_components=2):
    """Run PCA on ``data`` and return the projected components as a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Input passed to ``PCA.fit_transform``.
    n_components : int, float, str or None, default 2
        Forwarded unchanged to ``PCA``. Non-integer settings (a float
        variance ratio, ``'mle'`` or ``None``) are supported because the
        column names are derived from the transformed output rather than
        from this argument.

    Returns
    -------
    pandas.DataFrame
        One column per component actually produced, named ``'PC1'``,
        ``'PC2'``, ... When ``data`` is a DataFrame its row index is kept so
        the result can be joined back to labels or other columns by index;
        otherwise a default integer index is used.
    """
    pca = PCA(n_components=n_components)
    components = pca.fit_transform(data)

    # Name columns from the output width: range(n_components) would raise for
    # float / 'mle' / None values of n_components.
    column_names = [f'PC{i + 1}' for i in range(components.shape[1])]

    # Keep the caller's row labels; a fresh 0..n-1 index would silently
    # misalign the components with the original rows on later joins.
    row_index = data.index if isinstance(data, pd.DataFrame) else None
    return pd.DataFrame(data=components, index=row_index, columns=column_names)

In [5]:
from sklearn.preprocessing import StandardScaler

def standardize_features(data):
    """Scale every column of ``data`` with a default ``StandardScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature frame; its ``columns`` and ``index`` are reused for
        the result.

    Returns
    -------
    pandas.DataFrame
        The scaled values with the same column labels and the same row index
        as ``data``.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(data)
    # Carry the original index over: without it the result gets a fresh
    # 0..n-1 index and no longer aligns with targets or other columns that
    # still use the input's row labels (e.g. after a shuffle or split).
    return pd.DataFrame(scaled_values, index=data.index, columns=data.columns)

In [6]:
from sklearn.model_selection import GridSearchCV

def tune_hyperparameters(model, param_grid, X_train, y_train, cv=5):
    """Grid-search ``param_grid`` for ``model`` and return the winning settings.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train : array-like
        Training data the search is fitted on.
    cv : default 5
        Forwarded unchanged to ``GridSearchCV`` as its ``cv`` argument.

    Returns
    -------
    dict
        The fitted search's ``best_params_``.
    """
    searcher = GridSearchCV(model, param_grid, cv=cv)
    searcher.fit(X_train, y_train)
    best_settings = searcher.best_params_
    return best_settings

In [7]:
from imblearn.over_sampling import SMOTE

def apply_smote(X, y, random_state=None):
    """Resample ``(X, y)`` with SMOTE.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``SMOTE.fit_resample``.
    y : array-like
        Target labels passed to ``SMOTE.fit_resample``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``SMOTE``. Pass a fixed integer to make the resampling
        reproducible across runs; the default ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    # SMOTE is stochastic, so the seed is exposed for reproducible notebooks.
    smote = SMOTE(random_state=random_state)
    X_res, y_res = smote.fit_resample(X, y)
    return X_res, y_res

ModuleNotFoundError: No module named 'imblearn'