In [1]:
# Import necessary libraries and modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [2]:
#Loading the dataset
df = pd.read_csv("iris.csv")
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [3]:
print(df.isna().sum().sum())

0


In [4]:
# Identify columns with missing values and the appropriate imputation technique
imputation_dict = {
    'sepal_length': 'mean',  # impute with mean
    'sepal_width': 'mean',  # impute with mean
    'petal_length': 'mean',  # impute with mean
    'petal_width': 'mean',  # impute with mean
    'species': 'ffill',  # impute with forward fill
}

# Apply imputation to each column with missing values
for col, imputation in imputation_dict.items():
    if df[col].isnull().sum() > 0:  # check if column has missing values
        if imputation == 'mean':
            df[col].fillna(df[col].mode()[0], inplace=True)
        elif imputation == 'ffill':
            df[col].fillna(method='ffill', inplace=True)

# Save the imputed DataFrame to a new CSV file
df.to_csv('imputed_data.csv', index=False)

### The code should be able to handle both numerical and categorical features, impute the missing values using appropriate methods depending on the feature type.and  handle the outliers by replacing them with either the mean or the median of the respective feature.

In [5]:
import pandas as pd
import numpy as np
from scipy import stats

def impute_missing_values_and_handle_outliers(data):
    # Copy the input data to avoid modifying the original data
    data_imputed = data.copy()

    # Define a list of numerical features
    numerical_features = list(data.select_dtypes(include=[np.number]).columns)

    # Define a list of categorical features
    categorical_features = list(data.select_dtypes(exclude=[np.number]).columns)

    # Impute missing values in numerical features with the median
    data_imputed[numerical_features] = data_imputed[numerical_features].fillna(data_imputed[numerical_features].median())

    # Impute missing values in categorical features with the mode
    data_imputed[categorical_features] = data_imputed[categorical_features].fillna(data_imputed[categorical_features].mode().iloc[0])

    # Handle outliers in numerical features with the Z-score method
    for feature in numerical_features:
        z_scores = np.abs(stats.zscore(data_imputed[feature]))
        threshold = 3
        outliers = data_imputed[z_scores > threshold][feature]
        if outliers.shape[0] > 0:
            data_imputed.loc[z_scores > threshold, feature] = data_imputed[feature].median()

    return data_imputed

In [13]:
input_data

{'session_name': 'test',
 'session_description': 'test',
 'design_state_data': {'session_info': {'project_id': '1',
   'experiment_id': 'kkkk-11',
   'dataset': 'iris_modified.csv',
   'session_name': 'test',
   'session_description': 'test'},
  'target': {'prediction_type': 'Regression',
   'target': 'petal_width',
   'type': 'regression',
   'partitioning': True},
  'train': {'policy': 'Split the dataset',
   'time_variable': 'sepal_length',
   'sampling_method': 'No sampling(whole data)',
   'split': 'Randomly',
   'k_fold': False,
   'train_ratio': 0,
   'random_seed': 0},
  'metrics': {'optomize_model_hyperparameters_for': 'AUC',
   'optimize_threshold_for': 'F1 Score',
   'compute_lift_at': 0,
   'cost_matrix_gain_for_true_prediction_true_result': 1,
   'cost_matrix_gain_for_true_prediction_false_result': 0,
   'cost_matrix_gain_for_false_prediction_true_result': 0,
   'cost_matrix_gain_for_false_prediction_false_result': 0},
  'feature_handling': {'sepal_length': {'feature_name'

### This code is flexible enough to work with different user input by simply changing the value of the "method" key in the input JSON file. 

In [15]:
import json
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesRegressor

with open('Algoparams.json', 'r') as f:
    input_data = json.load(f)

# Load the dataset from a CSV file
df = pd.read_csv('iris.csv')

# Perform feature reduction based on user input
if input_data['session_name'] == 'test':
    pass  # Do nothing
elif input_data['session_name'] == 'Corr with Target':
    X = df.drop(input_data['target_column'], axis=1)
    y = df[input_data['target_column']]
    k = input_data['k']
    skb = SelectKBest(f_regression, k=k)
    X_new = skb.fit_transform(X, y)
    df = pd.concat([pd.DataFrame(X_new), y], axis=1)
elif input_data['method'] == 'Tree-based':
    X = df.drop(input_data['target_column'], axis=1)
    y = df[input_data['target_column']]
    clf = ExtraTreesRegressor(n_estimators=50)
    clf = clf.fit(X, y)
    sfm = SelectFromModel(clf, prefit=True)
    X_new = sfm.transform(X)
    df = pd.concat([pd.DataFrame(X_new), y], axis=1)
elif input_data['method'] == 'PCA':
    X = df.drop(input_data['target_column'], axis=1)
    y = df[input_data['target_column']]
    n_components = input_data['n_components']
    pca = PCA(n_components=n_components)
    X_new = pca.fit_transform(X)
    df = pd.concat([pd.DataFrame(X_new), y], axis=1)

# Save the resulting dataset to a CSV file
df.to_csv('result.csv', index=False)

### After creating the model object, we can train the model on the data and make predictions using the appropriate methods for the chosen model object. If the prediction type or model type specified in the input JSON is not valid, a ValueError is raised.

In [21]:
import json
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

# Load the input data from a JSON file
with open('Algoparams.json', 'r') as f:
    input_data = json.load(f)

# Create the appropriate model object based on the prediction type
if input_data['session_description'] == 'test':
    if input_data['prediction_type'] == 'Regression':
        model = LinearRegression()
    elif input_data['prediction_type'] == 'DecisionTreeRegressor':
        model = DecisionTreeRegressor()
    else:
        raise ValueError('Invalid model type for regression')
elif input_data['prediction_type'] == 'Classification':
    if input_data['prediction_type'] == 'LogisticRegression':
        model = LogisticRegression()
    elif input_data['prediction_type'] == 'DecisionTreeClassifier':
        model = DecisionTreeClassifier()
    else:
        raise ValueError('Invalid model type for classification')
else:
    raise ValueError('Invalid prediction type')


KeyError: 'prediction_type'

### Code to import data, take in inputs and use GridSearch CV to train model based on selection

In [25]:
import json
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Load the input data from a JSON file
with open('Algoparams.json', 'r') as f:
    input_data = json.load(f)

# Extract the relevant data from the input
X_train = input_data['X_train']
y_train = input_data['y_train']
X_test = input_data['X_test']

# Define a dictionary of hyperparameters for each model
param_grid = {
    'linear_regression': {'normalize': [True, False]},
    'decision_tree': {'max_depth': [2, 4, 6, 8, 10]},
    'random_forest': {'n_estimators': [50, 100, 200, 500], 'max_depth': [2, 4, 6, 8, 10]},
    'svm': {'C': [0.1, 1, 10, 100], 'gamma': [0.1, 1, 10, 100]},
    'pca_lr': {'pca__n_components': [2, 4, 6, 8, 10], 'lr__normalize': [True, False]},
    'pca_dt': {'pca__n_components': [2, 4, 6, 8, 10], 'dt__max_depth': [2, 4, 6, 8, 10]},
    'pca_rf': {'pca__n_components': [2, 4, 6, 8, 10], 'rf__n_estimators': [50, 100, 200, 500], 'rf__max_depth': [2, 4, 6, 8, 10]}
}

# Define the pipelines for each model
pipelines = {
    'linear_regression': Pipeline([('lr', LinearRegression())]),
    'decision_tree': Pipeline([('dt', DecisionTreeRegressor())]),
    'random_forest': Pipeline([('rf', RandomForestRegressor())]),
    'svm': Pipeline([('svm', SVR())]),
    'pca_lr': Pipeline([('pca', PCA()), ('lr', LinearRegression())]),
    'pca_dt': Pipeline([('pca', PCA()), ('dt', DecisionTreeRegressor())]),
    'pca_rf': Pipeline([('pca', PCA()), ('rf', RandomForestRegressor())])
}

# Define the names of the models to train based on the input method chosen
if input_data['prediction_type'] == 'No Reduction':
    models = ['linear_regression', 'decision_tree', 'random_forest', 'svm']
elif input_data['prediction_type'] == 'Corr with Target':
    models = ['linear_regression', 'decision_tree', 'random_forest']
elif input_data['prediction_type'] == 'Tree-based':
    models = ['decision_tree', 'random_forest']
elif input_data['prediction_type'] == 'PCA':
    models = ['pca_lr', 'pca_dt', 'pca_rf']
else:
    print('Invalid prediction type specified')

# Train the models using GridSearchCV
for model_name in models:
    print(f'Training {model_name}...')

KeyError: 'X_train'

### This will print out the classification report and confusion matrix in the console, which includes standard model metrics such as precision, recall, F1-score, and accuracy. 

In [26]:
from sklearn.metrics import classification_report, confusion_matrix

# train and test the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# log the classification report and confusion matrix
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


NameError: name 'model' is not defined