In [1]:
# Run the shared setup notebook before anything else. Later cells use names that
# this notebook never defines itself (e.g. `s3_utils` and
# `output_file_key_data_feature_engineering`); presumably main.ipynb provides
# them — TODO confirm.
%run main.ipynb
# Optional upstream stages: uncomment the lines below if you want to run this file only.
#%run data_cleaning.ipynb
#%run data_visualization.ipynb
#%run feature_engineering.ipynb
#%run feature_engineering.ipynb

In [2]:
# Read the feature-engineering output CSV from S3 into a DataFrame,
# using the methods from the S3Utils class.
if s3_utils.check_file_exists(output_file_key_data_feature_engineering):
    data = s3_utils.read_csv_from_s3(output_file_key_data_feature_engineering)
else:
    # Fail here with a clear message. Silently skipping the load would leave
    # `data` unassigned (NameError in the next cell) or, worse, let later cells
    # run against a stale `data` left over from an earlier step.
    raise FileNotFoundError(
        "Feature-engineering output not found in S3: "
        f"{output_file_key_data_feature_engineering!r}"
    )

In [3]:
# The label lives in the 'target' column; every other column is a feature.
y = data['target']
X = data.drop(columns=['target'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
from sklearn.svm import LinearSVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

# Preprocess -> rebalance -> reduce dimensionality -> train -> predict -> persist.
# Uses X_train, X_test, y_train and y_test produced by the train/test split cell.

# Preprocess the data
# The imputer and scaler are fitted on the training split only, then reused
# unchanged on the test split, so no test-set statistics leak into training.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

# Impute missing values and scale the training data
X_train_imputed = imputer.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train_imputed)

# Handle class imbalance by oversampling the training split only
# NOTE(review): recent imbalanced-learn releases deprecate SMOTE's `n_jobs`
# argument — confirm against the installed version.
smote = SMOTE(random_state=42, n_jobs=-1)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Impute and scale the test data using the same (already fitted) transformers
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

# Apply PCA to reduce dimensionality
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train_res)
X_test_pca = pca.transform(X_test_scaled)

# Train the SVM model using LinearSVC
model = LinearSVC(class_weight='balanced', dual=False, random_state=42)
model.fit(X_train_pca, y_train_res)

# Make predictions on the test set
y_pred = model.predict(X_test_pca)

# Save the predictions to a CSV file.
# y_pred holds one prediction per test row, so it must be paired with y_test.
# Pairing it with y_train_res (the resampled *training* labels) mismatches both
# the length and the meaning of the 'Actual' column.
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predictions_df.to_csv('svm_predictions.csv', index=False)

# Save the trained model and preprocessing objects.
# The SMOTE object is only used while training; it is stored alongside the
# other objects so the saved dictionary keeps the same keys as before.
with open('model_pipeline.pkl', 'wb') as file:
    pickle.dump({'model': model, 'imputer': imputer, 'scaler': scaler, 'pca': pca, 'smote': smote}, file)


In [None]:
# Evaluate the LinearSVC model on the held-out test set.
# The training cell stores its test-set predictions in `y_pred`; this notebook
# never defines a variable called `predictions`.
# NOTE(review): confusion_matrix / classification_report are not imported in
# this notebook — presumably main.ipynb provides them; confirm.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:

# Hyper-parameter search for a kernel SVM on the scaled, SMOTE-balanced training data.
# NOTE(review): `GridSearchCV` and `svm` are not imported in this notebook —
# presumably main.ipynb provides them; confirm.
#
# `gamma` is ignored by the linear kernel, so each kernel gets its own sub-grid.
# Crossing gamma with kernel='linear' would refit identical linear models once
# per gamma value.
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
]

grid_search = GridSearchCV(svm.SVC(class_weight='balanced'), param_grid, refit=True, verbose=2)
grid_search.fit(X_train_res, y_train_res)

# Evaluate the best model found by grid search.
# The search was fitted on imputed + scaled features (X_train_res), so the test
# set must go through the same preprocessing. Raw X_test is unscaled and may
# still contain missing values.
best_model = grid_search.best_estimator_
best_predictions = best_model.predict(X_test_scaled)
print(classification_report(y_test, best_predictions))

In [None]:
# Report the hyper-parameter combination that the grid search selected.
best_parameters = grid_search.best_params_
print(f"Best parameters: {best_parameters}")

In [None]:

# # Save the trained model and preprocessing objects
# with open('model.pkl', 'wb') as file:
#     pickle.dump({'model': model, 'imputer': imputer, 'scaler': scaler, 'pca': pca}, file)


In [None]:
# # Create a LIME explainer
# explainer = lime.lime_tabular.LimeTabularExplainer(
#     training_data=X_train_pca, 
#     feature_names=[f'Feature {i+1}' for i in range(X_train_pca.shape[1])], 
#     class_names=['Class 0', 'Class 1'],  # Adjust class names as needed
#     mode='classification'
# )

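# # Explain a prediction
# # NOTE: LIME's classification mode needs a probability function, but the
# # LinearSVC trained above has no `predict_proba`; wrap the model (e.g. in
# # CalibratedClassifierCV) or use a probabilistic classifier before uncommenting.
# idx = 0  # Index of the instance you want to explain
# exp = explainer.explain_instance(X_test_pca[idx], model.predict_proba)
# exp.show_in_notebook(show_table=True)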


In [None]:
# # Create a SHAP explainer
# explainer = shap.Explainer(model, X_train_pca)

# # Compute SHAP values
# shap_values = explainer(X_test_pca)

# # Visualize the explanation for the first instance
# shap.plots.waterfall(shap_values[0])
